mirror of
https://github.com/versity/scoutfs.git
synced 2026-04-30 09:56:55 +00:00
Compare commits
2 Commits
zab/v1_3_r
...
zab/cwskip
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5bea29a168 | ||
|
|
7a999f2657 |
@@ -2,59 +2,9 @@ Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.3
|
||||
v1.x
|
||||
\
|
||||
*Apr 7, 2022*
|
||||
|
||||
* **Fix rare server instability under heavy load**
|
||||
\
|
||||
Fixed a case of server instability under heavy load due to concurrent
|
||||
work fully exhausting metadata block allocation pools reserved for a
|
||||
single server transaction. This would cause brief interruption as the
|
||||
server shut down and the next server started up and made progress as
|
||||
pending work was retried.
|
||||
|
||||
* **Fix slow fencing preventing server startup**
|
||||
\
|
||||
If a server had to process many fence requests with a slow fencing
|
||||
mechanism it could be interrupted before it finished. The server
|
||||
now makes sure heartbeat messages are sent while it is making progress
|
||||
on fencing requests so that other quorum members don't interrupt the
|
||||
process.
|
||||
|
||||
* **Performance improvement in getxattr and setxattr**
|
||||
\
|
||||
Kernel allocation patterns in the getxattr and setxattr
|
||||
implementations were causing significant contention between CPUs. Their
|
||||
allocation strategy was changed so that concurrent tasks can call these
|
||||
xattr methods without degrading performance.
|
||||
|
||||
---
|
||||
v1.2
|
||||
\
|
||||
*Mar 14, 2022*
|
||||
|
||||
* **Fix deadlock between fallocate() and read() system calls**
|
||||
\
|
||||
Fixed a lock inversion that could cause two tasks to deadlock if they
|
||||
performed fallocate() and read() on a file at the same time. The
|
||||
deadlock was uninterruptible so the machine needed to be rebooted. This
|
||||
was relatively rare as fallocate() is usually used to prepare files
|
||||
before they're used.
|
||||
|
||||
* **Fix instability from heavy file deletion workloads**
|
||||
\
|
||||
Fixed rare circumstances under which background file deletion cleanup
|
||||
tasks could try to delete a file while it is being deleted by another
|
||||
task. Heavy load across multiple nodes, either many files being deleted
|
||||
or large files being deleted, increased the chances of this happening.
|
||||
Heavy staging could cause this problem because staging can create many
|
||||
internal temporary files that need to be deleted.
|
||||
|
||||
---
|
||||
v1.1
|
||||
\
|
||||
*Feb 4, 2022*
|
||||
*TBD*
|
||||
|
||||
|
||||
* **Add scoutfs(1) change-quorum-config command**
|
||||
@@ -64,15 +14,6 @@ v1.1
|
||||
unmounted. This can be used to change the mounts that will
|
||||
participate in quorum and the IP addresses they use.
|
||||
|
||||
* **Fix Rare Risk of Item Cache Corruption**
|
||||
\
|
||||
Code review found a rare potential source of item cache corruption.
|
||||
If this happened it would look as though deleted parts of the filesystem
|
||||
returned, but only at the time they were deleted. Old deleted items are
|
||||
not affected. This problem only affected the item cache, never
|
||||
persistent storage. Unmounting and remounting would drop the bad item
|
||||
cache and resync it with the correct persistent data.
|
||||
|
||||
---
|
||||
v1.0
|
||||
\
|
||||
|
||||
@@ -13,6 +13,7 @@ scoutfs-y += \
|
||||
block.o \
|
||||
btree.o \
|
||||
client.o \
|
||||
cwskip.o \
|
||||
counters.o \
|
||||
data.o \
|
||||
dir.o \
|
||||
|
||||
@@ -1318,17 +1318,6 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
return lo;
|
||||
}
|
||||
|
||||
/*
 * Sample the allocator's first avail block count and the space in its
 * first freed list block.  Both values are read inside a seqlock read
 * section and resampled until a pass completes without read_seqretry()
 * asking for a retry.
 */
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space)
{
	unsigned int seq;

	for (;;) {
		seq = read_seqbegin(&alloc->seqlock);
		*avail_total = le32_to_cpu(alloc->avail.first_nr);
		*freed_space = list_block_space(alloc->freed.first_nr);
		if (!read_seqretry(&alloc->seqlock, seq))
			break;
	}
}
|
||||
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag)
|
||||
{
|
||||
|
||||
@@ -158,7 +158,6 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
|
||||
|
||||
bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 nr);
|
||||
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag);
|
||||
|
||||
|
||||
@@ -1875,12 +1875,11 @@ out:
|
||||
* set in btree items. They're only used for fs items written through
|
||||
* the item cache and forest of log btrees.
|
||||
*/
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg)
|
||||
{
|
||||
struct scoutfs_btree_item_desc desc;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_btree_block *bt;
|
||||
@@ -1889,44 +1888,46 @@ int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
int cmp;
|
||||
int ret = 0;
|
||||
|
||||
while (lst) {
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
|
||||
while (pos) {
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
|
||||
&lst->key, lst->val_len, &bl, &kr, NULL);
|
||||
desc.key, desc.val_len, &bl, &kr, NULL);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
bt = bl->data;
|
||||
|
||||
do {
|
||||
item = leaf_item_hash_search(sb, bt, &lst->key);
|
||||
item = leaf_item_hash_search(sb, bt, desc.key);
|
||||
if (item) {
|
||||
/* try to merge delta values, _NULL not deleted; merge will */
|
||||
ret = scoutfs_forest_combine_deltas(&lst->key,
|
||||
ret = scoutfs_forest_combine_deltas(desc.key,
|
||||
item_val(bt, item),
|
||||
item_val_len(item),
|
||||
lst->val, lst->val_len);
|
||||
desc.val, desc.val_len);
|
||||
if (ret < 0) {
|
||||
scoutfs_block_put(sb, bl);
|
||||
goto out;
|
||||
}
|
||||
|
||||
item->seq = cpu_to_le64(lst->seq);
|
||||
item->flags = lst->flags;
|
||||
item->seq = cpu_to_le64(desc.seq);
|
||||
item->flags = desc.flags;
|
||||
|
||||
if (ret == 0)
|
||||
update_item_value(bt, item, lst->val, lst->val_len);
|
||||
update_item_value(bt, item, desc.val, desc.val_len);
|
||||
else
|
||||
ret = 0;
|
||||
} else {
|
||||
scoutfs_avl_search(&bt->item_root,
|
||||
cmp_key_item, &lst->key,
|
||||
cmp_key_item, desc.key,
|
||||
&cmp, &par, NULL, NULL);
|
||||
create_item(bt, &lst->key, lst->seq, lst->flags, lst->val,
|
||||
lst->val_len, par, cmp);
|
||||
create_item(bt, desc.key, desc.seq, desc.flags, desc.val,
|
||||
desc.val_len, par, cmp);
|
||||
}
|
||||
|
||||
lst = lst->next;
|
||||
} while (lst && scoutfs_key_compare(&lst->key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, lst->val_len));
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
} while (pos && scoutfs_key_compare(desc.key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, desc.val_len));
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
}
|
||||
@@ -2449,7 +2450,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *root, int free_budget)
|
||||
struct scoutfs_btree_root *root, int alloc_low)
|
||||
{
|
||||
u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -2459,15 +2460,11 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_avl_node *node;
|
||||
struct scoutfs_avl_node *next;
|
||||
struct scoutfs_key par_next;
|
||||
int nr_freed = 0;
|
||||
int nr_par;
|
||||
int level;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(free_budget <= 0))
|
||||
return -EINVAL;
|
||||
|
||||
if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
|
||||
return -EIO; /* XXX corruption */
|
||||
|
||||
@@ -2542,7 +2539,8 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
while (node) {
|
||||
|
||||
/* make sure we can always free parents after leaves */
|
||||
if ((nr_freed + 1 + nr_par) > free_budget) {
|
||||
if (scoutfs_alloc_meta_low(sb, alloc,
|
||||
alloc_low + nr_par + 1)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
@@ -2556,7 +2554,6 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
le64_to_cpu(ref.blkno));
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
nr_freed++;
|
||||
|
||||
node = scoutfs_avl_next(&bt->item_root, node);
|
||||
if (node) {
|
||||
@@ -2572,7 +2569,6 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
blknos[i]);
|
||||
ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
|
||||
BUG_ON(ret); /* checked meta low, freed should fit */
|
||||
nr_freed++;
|
||||
}
|
||||
|
||||
/* restart walk past the subtree we just freed */
|
||||
|
||||
@@ -18,11 +18,24 @@ struct scoutfs_btree_item_ref {
|
||||
#define SCOUTFS_BTREE_ITEM_REF(name) \
|
||||
struct scoutfs_btree_item_ref name = {NULL,}
|
||||
|
||||
/* caller gives an item to the callback */
|
||||
/* btree gives an item to caller */
|
||||
typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
|
||||
struct scoutfs_key *key, u64 seq, u8 flags,
|
||||
void *val, int val_len, void *arg);
|
||||
|
||||
struct scoutfs_btree_item_desc {
|
||||
struct scoutfs_key *key;
|
||||
void *val;
|
||||
u64 seq;
|
||||
u8 flags;
|
||||
unsigned val_len;
|
||||
};
|
||||
|
||||
/* btree iterates through items from caller */
|
||||
/*
 * Called with the caller's current position and opaque arg; fills desc
 * and returns the next position.  A NULL return ends the iteration.
 */
typedef void *(*scoutfs_btree_item_iter_cb)(struct super_block *sb,
					    struct scoutfs_btree_item_desc *desc, void *pos,
					    void *arg);
|
||||
|
||||
/* simple singly-linked list of items */
|
||||
struct scoutfs_btree_item_list {
|
||||
struct scoutfs_btree_item_list *next;
|
||||
@@ -78,11 +91,9 @@ int scoutfs_btree_read_items(struct super_block *sb,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end,
|
||||
scoutfs_btree_item_cb cb, void *arg);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg);
|
||||
|
||||
int scoutfs_btree_parent_range(struct super_block *sb,
|
||||
struct scoutfs_btree_root *root,
|
||||
@@ -125,7 +136,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *root, int free_budget);
|
||||
struct scoutfs_btree_root *root, int alloc_low);
|
||||
|
||||
void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);
|
||||
|
||||
|
||||
@@ -477,15 +477,12 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
struct super_block *sb = client->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
const bool am_quorum = opts->quorum_slot_nr >= 0;
|
||||
struct scoutfs_net_greeting greet;
|
||||
struct sockaddr_in sin;
|
||||
bool am_quorum;
|
||||
int ret;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
am_quorum = opts.quorum_slot_nr >= 0;
|
||||
|
||||
/* can unmount once server farewell handling removes our item */
|
||||
if (client->sending_farewell &&
|
||||
lookup_mounted_client_item(sb, sbi->rid) == 0) {
|
||||
|
||||
@@ -90,36 +90,27 @@
|
||||
EXPAND_COUNTER(forest_read_items) \
|
||||
EXPAND_COUNTER(forest_roots_next_hint) \
|
||||
EXPAND_COUNTER(forest_set_bloom_bits) \
|
||||
EXPAND_COUNTER(item_alloc_bytes) \
|
||||
EXPAND_COUNTER(item_clear_dirty) \
|
||||
EXPAND_COUNTER(item_create) \
|
||||
EXPAND_COUNTER(item_delete) \
|
||||
EXPAND_COUNTER(item_delta) \
|
||||
EXPAND_COUNTER(item_delta_written) \
|
||||
EXPAND_COUNTER(item_dirty) \
|
||||
EXPAND_COUNTER(item_free_bytes) \
|
||||
EXPAND_COUNTER(item_invalidate) \
|
||||
EXPAND_COUNTER(item_invalidate_page) \
|
||||
EXPAND_COUNTER(item_invalidate_item) \
|
||||
EXPAND_COUNTER(item_lookup) \
|
||||
EXPAND_COUNTER(item_mark_dirty) \
|
||||
EXPAND_COUNTER(item_next) \
|
||||
EXPAND_COUNTER(item_page_accessed) \
|
||||
EXPAND_COUNTER(item_page_alloc) \
|
||||
EXPAND_COUNTER(item_page_clear_dirty) \
|
||||
EXPAND_COUNTER(item_page_compact) \
|
||||
EXPAND_COUNTER(item_page_free) \
|
||||
EXPAND_COUNTER(item_page_lru_add) \
|
||||
EXPAND_COUNTER(item_page_lru_remove) \
|
||||
EXPAND_COUNTER(item_page_mark_dirty) \
|
||||
EXPAND_COUNTER(item_page_rbtree_walk) \
|
||||
EXPAND_COUNTER(item_page_split) \
|
||||
EXPAND_COUNTER(item_pcpu_add_replaced) \
|
||||
EXPAND_COUNTER(item_pcpu_page_hit) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
|
||||
EXPAND_COUNTER(item_read_pages_split) \
|
||||
EXPAND_COUNTER(item_shrink_page) \
|
||||
EXPAND_COUNTER(item_shrink_page_dirty) \
|
||||
EXPAND_COUNTER(item_shrink_page_reader) \
|
||||
EXPAND_COUNTER(item_shrink_page_trylock) \
|
||||
EXPAND_COUNTER(item_shrink) \
|
||||
EXPAND_COUNTER(item_shrink_all) \
|
||||
EXPAND_COUNTER(item_shrink_exhausted) \
|
||||
EXPAND_COUNTER(item_shrink_read_search) \
|
||||
EXPAND_COUNTER(item_shrink_removed) \
|
||||
EXPAND_COUNTER(item_shrink_searched) \
|
||||
EXPAND_COUNTER(item_shrink_skipped) \
|
||||
EXPAND_COUNTER(item_shrink_write_search) \
|
||||
EXPAND_COUNTER(item_update) \
|
||||
EXPAND_COUNTER(item_write_dirty) \
|
||||
EXPAND_COUNTER(lock_alloc) \
|
||||
@@ -152,12 +143,11 @@
|
||||
EXPAND_COUNTER(net_recv_messages) \
|
||||
EXPAND_COUNTER(net_unknown_request) \
|
||||
EXPAND_COUNTER(orphan_scan) \
|
||||
EXPAND_COUNTER(orphan_scan_attempts) \
|
||||
EXPAND_COUNTER(orphan_scan_cached) \
|
||||
EXPAND_COUNTER(orphan_scan_error) \
|
||||
EXPAND_COUNTER(orphan_scan_item) \
|
||||
EXPAND_COUNTER(orphan_scan_omap_set) \
|
||||
EXPAND_COUNTER(quorum_candidate_server_stopping) \
|
||||
EXPAND_COUNTER(orphan_scan_read) \
|
||||
EXPAND_COUNTER(quorum_elected) \
|
||||
EXPAND_COUNTER(quorum_fence_error) \
|
||||
EXPAND_COUNTER(quorum_fence_leader) \
|
||||
|
||||
584
kmod/src/cwskip.c
Normal file
584
kmod/src/cwskip.c
Normal file
@@ -0,0 +1,584 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "cwskip.h"
|
||||
|
||||
/*
|
||||
* This skip list is built to allow concurrent modification and limit
|
||||
* contention to the region of the list around the modification. All
|
||||
* node references are protected by RCU. Each node has a write_seq
|
||||
* that works like a seqlock, the big differences are that we nest them
|
||||
* and use trylock to acquire them.
|
||||
*
|
||||
* Readers sample the write_seqs of nodes containing links as they
|
||||
* traverse them, verifying that the node hasn't been modified before
|
||||
* traversing to the node referenced by the link.
|
||||
*
|
||||
* Writers remember the seqs of all the nodes they traversed to end up
|
||||
* at their final node. They try to acquire the lock of all the nodes
|
||||
* needed to modify the list at a given height. Their trylocks will
|
||||
* fail if any of the nodes have changed since their traversal.
|
||||
*
|
||||
* The interface is built around references to adjacent pairs of nodes
|
||||
* and their sequence numbers. This lets readers and writers traverse
|
||||
* through their local region of the list until they hit contention and
|
||||
* must start over with a full search.
|
||||
*
|
||||
* The caller is responsible for allocating and freeing nodes. The
|
||||
* interface is built around caller's objects which each have embedded
|
||||
* nodes.
|
||||
*/
|
||||
|
||||
/*
|
||||
* node_off is the positive offset of the cwskip node within the
|
||||
* container structs stored in the list. The node_off is subtracted
|
||||
* from node pointers to give the caller a pointer to their stored
|
||||
* container struct.
|
||||
*/
|
||||
/*
 * Initialize an empty list root.  cmp_fn is used by searches to compare
 * a key with a stored container and node_off is the offset of the
 * embedded cwskip node within each container.
 *
 * NOTE(review): the root's node height is left at 0 here while
 * writer_trylock() asserts that every locked prev is taller than its
 * level -- confirm whether the root node's height is set elsewhere.
 */
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
			      unsigned long node_off)
{
	/* clear the whole root struct, sizeof(&root) only covered a pointer */
	memset(root, 0, sizeof(*root));
	root->cmp_fn = cmp_fn;
	root->node_off = node_off;
}
|
||||
|
||||
/* This is completely racey and should be used accordingly. */
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < SCOUTFS_CWSKIP_MAX_HEIGHT; i++) {
|
||||
if (root->node.links[i] != NULL)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a random height between 1 and max height, inclusive. Using
|
||||
* ffs means that each greater height relies on all lower height bits
|
||||
* being clear and we get the height distribution we want: 1 = 1/2,
|
||||
* 2 = 1/4, 3 = 1/8, etc.
|
||||
*/
|
||||
int scoutfs_cwskip_rand_height(void)
|
||||
{
|
||||
return ffs(prandom_u32() | (1 << (SCOUTFS_CWSKIP_MAX_HEIGHT - 1)));
|
||||
}
|
||||
|
||||
static void *node_container(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
return node ? (void *)((unsigned long)node - root->node_off) : NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the caller's containers for the given nodes. There isn't a
|
||||
* previous container when the previous node is the root's static
|
||||
* full-height node.
|
||||
*/
|
||||
static void set_containers(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *prev,
|
||||
struct scoutfs_cwskip_node *node, void **prev_cont, void **node_cont)
|
||||
{
|
||||
if (prev_cont)
|
||||
*prev_cont = (prev != &root->node) ? node_container(root, prev) : NULL;
|
||||
if (node_cont)
|
||||
*node_cont = node_container(root, node);
|
||||
}
|
||||
|
||||
static struct scoutfs_cwskip_node *node_read_begin(struct scoutfs_cwskip_node *node,
|
||||
unsigned int *seq)
|
||||
{
|
||||
if (node) {
|
||||
*seq = READ_ONCE(node->write_seq) & ~1U;
|
||||
smp_rmb();
|
||||
} else {
|
||||
*seq = 1; /* caller shouldn't use if we return null, being careful */
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
static bool node_read_retry(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (node) {
|
||||
smp_rmb();
|
||||
return READ_ONCE(node->write_seq) != seq;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* write_seq is only an int to reduce the size of nodes and full-height
|
||||
* seq arrays, it could be a long if archs have trouble with int
|
||||
* cmpxchg.
|
||||
*/
|
||||
static bool __node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (seq & 1)
|
||||
return false;
|
||||
|
||||
return cmpxchg(&node->write_seq, seq, seq + 1) == seq;
|
||||
}
|
||||
|
||||
/* Try to lock the node and issue a write barrier if the lock was acquired. */
static bool node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
{
	if (!__node_trylock(node, seq))
		return false;

	smp_wmb();
	return true;
}
|
||||
|
||||
static void __node_unlock(struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
node->write_seq++;
|
||||
}
|
||||
|
||||
static void node_unlock(struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
__node_unlock(node);
|
||||
smp_wmb();
|
||||
}
|
||||
|
||||
/* return -1/1 to go left/right, never 0 */
|
||||
/* one random bit picks the direction: 1 goes right, -1 goes left, never 0 */
static int random_cmp(void *K, void *C)
{
	return (prandom_u32() & 2) ? 1 : -1;
}
|
||||
|
||||
static void cwskip_search(struct scoutfs_cwskip_root *root, void *key, int *node_cmp,
|
||||
struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_writer *wr,
|
||||
unsigned int *prev_seqs)
|
||||
{
|
||||
struct scoutfs_cwskip_node *prev;
|
||||
struct scoutfs_cwskip_node *node;
|
||||
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
unsigned int prev_seq;
|
||||
unsigned int node_seq;
|
||||
int level;
|
||||
int cmp;
|
||||
|
||||
if (key == NULL)
|
||||
cmp_fn = random_cmp;
|
||||
|
||||
restart:
|
||||
prev = node_read_begin(&root->node, &prev_seq);
|
||||
node = NULL;
|
||||
node_seq = 1;
|
||||
cmp = -1;
|
||||
|
||||
level = SCOUTFS_CWSKIP_MAX_HEIGHT - 1;
|
||||
while (prev && level >= 0) {
|
||||
node = node_read_begin(prev->links[level], &node_seq);
|
||||
if (!node) {
|
||||
cmp = -1;
|
||||
level--;
|
||||
continue;
|
||||
}
|
||||
|
||||
cmp = cmp_fn(key, node_container(root, node));
|
||||
if (cmp > 0) {
|
||||
if (node_read_retry(prev, prev_seq))
|
||||
goto restart;
|
||||
prev = node;
|
||||
prev_seq = node_seq;
|
||||
node = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wr) {
|
||||
wr->prevs[level] = prev;
|
||||
prev_seqs[level] = prev_seq;
|
||||
}
|
||||
|
||||
level--;
|
||||
}
|
||||
|
||||
rd->prev = prev;
|
||||
rd->prev_seq = prev_seq;
|
||||
rd->node = node;
|
||||
rd->node_seq = node_seq;
|
||||
*node_cmp = cmp;
|
||||
}
|
||||
|
||||
static void init_reader(struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(rd, 0, sizeof(struct scoutfs_cwskip_reader));
|
||||
rd->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and returns nodes that surround the search key.
|
||||
*
|
||||
* Either prev or node can be null if there are no nodes before or after
|
||||
* the search key. *node_cmp is set to the final comparison of the key
|
||||
* and the returned node's container key, it will be 0 if an exact match
|
||||
* is found.
|
||||
*
|
||||
* This starts an RCU read critical section and is fully concurrent with
|
||||
* both other readers and writers. The nodes won't be freed until
|
||||
* after the section so it's always safe to reference them but their
|
||||
* contents might be nonsense if they're modified during the read.
|
||||
* Nothing learned from the list during the read section should have an
|
||||
* effect until after _read_valid has said it was OK.
|
||||
*
|
||||
* _read_valid can be called after referencing the nodes to see if they
|
||||
* were stable during the read. _read_next can be used to iterate
|
||||
* forward through the list without repeating the search. The caller
|
||||
* must always call a matching _read_end once they're done.
|
||||
*/
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
init_reader(rd, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, rd, NULL, NULL);
|
||||
set_containers(root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the nodes referenced by the reader haven't been
|
||||
* modified and any references of them were consistent. This does not
|
||||
* end the reader critical section and can be called multiple times.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd)
|
||||
{
|
||||
return !(node_read_retry(rd->prev, rd->prev_seq) &&
|
||||
node_read_retry(rd->node, rd->node_seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance from the current prev/node to the next pair of nodes in the
|
||||
* list. prev_cont is set to what node_cont was before the call.
|
||||
* node_cont is set to the next node after the current node_cont.
|
||||
*
|
||||
* This returns true if it found a next node and that its load of the
|
||||
* next pointer from node was valid and stable. Returning false means
|
||||
* that the caller should retry. There could be more items in the list.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
unsigned int next_seq;
|
||||
bool valid_next;
|
||||
|
||||
next = rd->node ? node_read_begin(rd->node->links[0], &next_seq) : NULL;
|
||||
valid_next = scoutfs_cwskip_read_valid(rd) && next;
|
||||
if (valid_next) {
|
||||
rd->prev = rd->node;
|
||||
rd->prev_seq = rd->node_seq;
|
||||
rd->node = next;
|
||||
rd->node_seq = next_seq;
|
||||
|
||||
set_containers(rd->root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
return valid_next;
|
||||
}
|
||||
|
||||
/*
|
||||
* End the critical section started with _read_begin.
|
||||
*/
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
/* pairs with the rcu_read_lock() taken in scoutfs_cwskip_read_begin(); rd itself isn't used */
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Higher locks are more likely to cause contention so we unlock them
|
||||
* first.
|
||||
*/
|
||||
static void writer_unlock(struct scoutfs_cwskip_writer *wr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = wr->locked_height - 1; i >= 0; i--) {
|
||||
if (i == 0 || (wr->prevs[i - 1] != wr->prevs[i]))
|
||||
__node_unlock(wr->prevs[i]);
|
||||
}
|
||||
|
||||
if (wr->node_locked)
|
||||
__node_unlock(wr->node);
|
||||
|
||||
smp_wmb();
|
||||
|
||||
wr->locked_height = 0;
|
||||
wr->node_locked = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* A search traversal has saved all the previous nodes at each level.
|
||||
*
|
||||
* We try to acquire the write_seq locks for all the prevs up to height
|
||||
* from the seqs that we read during the search. The search was
|
||||
* protected by read sections so the prevs represent a consistent
|
||||
* version of the list at some point in the past. If nodes have been
|
||||
* locked since we read them we won't be able to acquire the locks.
|
||||
* Nodes aren't re-inserted after removal so we shouldn't see nodes in
|
||||
* multiple places (which would deadlock).
|
||||
*
|
||||
* The same node can be in multiple prev slots. We're careful to only
|
||||
* try locking the lowest duplicate slot.
|
||||
*
|
||||
* We lock from the highest level down. This only matters when there's
|
||||
* contention. The higher nodes are more likely to see contention so
|
||||
* we want trylock to fail early to avoid useless locking churn on lower
|
||||
* nodes.
|
||||
*/
|
||||
static bool writer_trylock(struct scoutfs_cwskip_writer *wr, unsigned int *prev_seqs, int height)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(wr->locked_height != 0) ||
|
||||
WARN_ON_ONCE(height < 1 || height > ARRAY_SIZE(wr->prevs)))
|
||||
return false;
|
||||
|
||||
for (i = height - 1; i >= 0; i--) {
|
||||
if ((i == 0 || wr->prevs[i - 1] != wr->prevs[i]) &&
|
||||
!__node_trylock(wr->prevs[i], prev_seqs[i]))
|
||||
break;
|
||||
wr->locked_height++;
|
||||
}
|
||||
|
||||
if (i < height) {
|
||||
writer_unlock(wr);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* paranoid debugging verification */
|
||||
for (i = 0; i < wr->locked_height; i++) {
|
||||
BUG_ON(wr->prevs[i]->height <= i);
|
||||
BUG_ON(wr->node && i < wr->node->height && wr->prevs[i]->links[i] != wr->node);
|
||||
}
|
||||
|
||||
smp_mb();
|
||||
return true;
|
||||
}
|
||||
|
||||
static void init_writer(struct scoutfs_cwskip_writer *wr, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(wr, 0, sizeof(struct scoutfs_cwskip_writer));
|
||||
wr->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Search for and return references to the two nodes that surround the
|
||||
* search key, with the nodes locked.
|
||||
*
|
||||
* Either node can be null if there are no nodes before or after the
|
||||
* search key. We still hold a lock on the static root node if the
|
||||
* search key falls before the first node in the list.
|
||||
*
|
||||
* If lock_height is 0 then the caller is saying that they just want to
|
||||
* lock the surrounding nodes and not modify their position in the list.
|
||||
* We only lock those two nodes. Any greater lock_height represents a
|
||||
* height that we need to lock so the caller can insert an allocated
|
||||
* node with that height.
|
||||
*
|
||||
* The caller can use the writer context to iterate through locked nodes
|
||||
* via the lowest level list that contains all nodes. If they hit a
|
||||
* node that's higher than the locked height in the writer then they
|
||||
* have to unlock and restart because we don't have the previous node
|
||||
* for that height. We set a min level that we lock to reduce the
|
||||
* possibility of hitting higher nodes and retrying.
|
||||
*/
|
||||
#define MIN_LOCKED_HEIGHT 4
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
unsigned int prev_seqs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
struct scoutfs_cwskip_reader rd;
|
||||
int node_height;
|
||||
int use_height;
|
||||
bool locked;
|
||||
|
||||
BUG_ON(WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT));
|
||||
|
||||
do {
|
||||
init_reader(&rd, root);
|
||||
init_writer(wr, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, &rd, wr, NULL);
|
||||
|
||||
wr->node = rd.node;
|
||||
if (wr->node) {
|
||||
/* _trylock of prevs will issue barrier on success */
|
||||
if (!__node_trylock(wr->node, rd.node_seq)) {
|
||||
locked = false;
|
||||
continue;
|
||||
}
|
||||
wr->node_locked = true;
|
||||
node_height = wr->node->height;
|
||||
} else {
|
||||
node_height = 0;
|
||||
}
|
||||
|
||||
if (lock_height > 0)
|
||||
use_height = max3(MIN_LOCKED_HEIGHT, node_height, lock_height);
|
||||
else
|
||||
use_height = 1;
|
||||
|
||||
locked = writer_trylock(wr, prev_seqs, use_height);
|
||||
if (!locked)
|
||||
rcu_read_unlock();
|
||||
} while (!locked);
|
||||
|
||||
set_containers(root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new node between the writer's two locked nodes. The
|
||||
* inserting node is locked and replaces the existing node in the writer
|
||||
* which is unlocked.
|
||||
*
|
||||
* The next node may not exist. The previous nodes will always exist
|
||||
* though they may be the static root node.
|
||||
*
|
||||
* The inserting node is visible to readers the moment we store the
|
||||
* first link to it in previous nodes. We first lock it with a write
|
||||
* barrier so that any readers will retry if they visit it before all
|
||||
* its links are updated and it's unlocked.
|
||||
*
|
||||
* We don't unlock prevs that are higher than the inserting node. This
|
||||
* lets the caller continue iterating through nodes that are higher than
|
||||
* insertion but still under the locked height.
|
||||
*/
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins)
|
||||
{
|
||||
/* remember the node the writer currently holds so it can be unlocked after the splice */
struct scoutfs_cwskip_node *node = wr->node;
|
||||
int i;
|
||||
|
||||
/* every level that ins links into must be covered by the writer's locked prevs */
BUG_ON(ins->height > wr->locked_height);
|
||||
/*
 * Lock ins before any prev points at it.  The return value is ignored.
 * NOTE(review): presumably this can't fail because ins isn't reachable
 * by anyone else yet -- confirm against node_trylock().
 */
node_trylock(ins, ins->write_seq);
|
||||
|
||||
/* splice ins in after prevs[i] at each of its levels, lowest level first */
for (i = 0; i < ins->height; i++) {
|
||||
ins->links[i] = wr->prevs[i]->links[i];
|
||||
wr->prevs[i]->links[i] = ins;
|
||||
}
|
||||
|
||||
/*
 * The writer's previous node (if any) is unlocked and the still-locked
 * ins takes its place.
 * NOTE(review): wr->node_locked is not touched here, even when wr->node
 * was NULL on entry -- confirm writer_unlock() copes with that.
 */
if (node)
|
||||
node_unlock(node);
|
||||
wr->node = ins;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the node in the writer from the list. The writer's node
|
||||
* pointer is not advanced because we don't want this to be able to fail
|
||||
* if trylock on the next node fails. The caller can call _write_next
|
||||
* on this writer and it will try and iterate from prevs[0].
|
||||
*
|
||||
* The caller's removal argument must be the node pointer in the writer.
|
||||
* This is redundant but meant to communicate to the caller that they're
|
||||
* responsible for the node after removing it (presumably queueing it
|
||||
* for freeing before _write_end leaves rcu).
|
||||
*
|
||||
* Readers can be traversing our node as we modify its pointers and can
|
||||
* read a temporarily inconsistent state. We have the node locked so
|
||||
* the reader will immediately retry once they check the seqs after
|
||||
* hitting our node that's being removed.
|
||||
*/
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* the caller may only remove the node that the writer currently holds */
BUG_ON(node != wr->node);
|
||||
/* every level the node is linked at must be covered by the locked prevs */
BUG_ON(node->height > wr->locked_height);
|
||||
|
||||
/* point each prev past the node and clear the node's own link at that level */
for (i = 0; i < node->height; i++) {
|
||||
wr->prevs[i]->links[i] = node->links[i];
|
||||
node->links[i] = NULL;
|
||||
}
|
||||
|
||||
/*
 * The node is unlocked and forgotten by the writer; the caller owns it
 * from here on.
 * NOTE(review): wr->node_locked is left unchanged although wr->node
 * becomes NULL -- confirm writer_unlock() checks wr->node as well.
 */
node_unlock(node);
|
||||
wr->node = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance through the list by setting prevs to node and node to the
|
||||
* next node in the list after locking it. Returns true only if there
|
||||
* was a next node that we were able to lock. Returning false can mean
|
||||
* that we weren't able to lock the next node and the caller should
|
||||
* retry a full search.
|
||||
*
|
||||
* This may be called after _write_remove clears node so we try to
|
||||
* iterate from prev if there is no node.
|
||||
*
|
||||
* If lock_height is greater than zero then the caller needs at least
|
||||
* that lock_height to insert a node of that height. If locked_height
|
||||
* doesn't cover it then we return false so the caller can retry
|
||||
* _write_begin with the needed height.
|
||||
*
|
||||
* Like insertion, we don't unlock prevs higher than the height of the
|
||||
* next node. They're not strictly needed to modify the next node but
|
||||
* we want to keep them locked so the caller can continue to iterate
|
||||
* through nodes up to the locked height.
|
||||
*/
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
int i;
|
||||
|
||||
/* an out-of-range lock_height warns once and is reported as a failed advance */
if (WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT))
|
||||
return false;
|
||||
|
||||
/* step from the current node, or from prevs[0] if _write_remove cleared the node */
if (wr->node)
|
||||
next = rcu_dereference(wr->node->links[0]);
|
||||
else
|
||||
next = rcu_dereference(wr->prevs[0]->links[0]);
|
||||
|
||||
/*
 * Give up without changing the writer when: there is no next node, the
 * caller needs more height than is locked, the caller intends to insert
 * (lock_height > 0) and the next node is taller than the locked height,
 * or the next node can't be locked.
 */
if (!next ||
|
||||
(lock_height > wr->locked_height) ||
|
||||
(lock_height > 0 && next->height > wr->locked_height) ||
|
||||
!__node_trylock(next, next->write_seq))
|
||||
return false;
|
||||
|
||||
if (!wr->node) {
|
||||
/* set next as missing node */
|
||||
wr->node = next;
|
||||
wr->node_locked = true;
|
||||
|
||||
} else {
|
||||
/* existing node becomes prevs for its height */
|
||||
__node_unlock(wr->prevs[0]);
|
||||
/*
 * NOTE(review): the loop below never uses i and stores to prevs[0] on
 * every pass, while the comment above says the node becomes prevs "for
 * its height".  It looks like prevs[i] was intended -- confirm, and
 * confirm how the replaced higher prevs are meant to be unlocked, since
 * only prevs[0] is unlocked here.
 */
for (i = 0; i < wr->node->height; i++)
|
||||
wr->prevs[0] = wr->node;
|
||||
wr->node = next;
|
||||
}
|
||||
|
||||
smp_wmb(); /* next locked and prev unlocked */
|
||||
|
||||
/* hand the caller the containers for the new prev/node pair */
set_containers(wr->root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Finish a write section: drop the writer's node locks and then leave
 * the RCU read-side section that _write_begin left held on success.
 */
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
/* unlock before leaving RCU so the nodes are still protected while they're touched */
writer_unlock(wr);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
68
kmod/src/cwskip.h
Normal file
68
kmod/src/cwskip.h
Normal file
@@ -0,0 +1,68 @@
|
||||
#ifndef _SCOUTFS_CWSKIP_H_
|
||||
#define _SCOUTFS_CWSKIP_H_
|
||||
|
||||
/* A billion seems like a lot. */
|
||||
#define SCOUTFS_CWSKIP_MAX_HEIGHT 30
|
||||
|
||||
/*
 * A skip list node.  The links[] array is a flexible array member, so
 * the allocation must provide room for the node's link pointers.
 */
struct scoutfs_cwskip_node {
|
||||
/* number of levels the node is linked at; insert/remove walk links[0..height-1] */
int height;
|
||||
/*
 * Sequence value passed to the node trylock helpers as the expected
 * seq.  The exact lock encoding isn't visible in this header.
 */
unsigned int write_seq;
|
||||
/* per-level next pointers; links[0] is the level used to step to the next node */
struct scoutfs_cwskip_node *links[];
|
||||
};
|
||||
|
||||
#define SCOUTFS_CWSKIP_FULL_NODE_BYTES \
|
||||
offsetof(struct scoutfs_cwskip_node, links[SCOUTFS_CWSKIP_MAX_HEIGHT + 1])
|
||||
|
||||
typedef int (*scoutfs_cwskip_cmp_t)(void *K, void *C);
|
||||
|
||||
/*
 * The root of a skip list.  It embeds its own node, padded so that the
 * node's flexible links[] array has backing storage inside the root.
 */
struct scoutfs_cwskip_root {
|
||||
/* comparison callback; presumably compares a search key against a container -- confirm */
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
/*
 * Set from the node_off argument of scoutfs_cwskip_init_root().
 * Presumably the offset of the embedded node within the caller's
 * container struct -- confirm against set_containers().
 */
unsigned long node_off;
|
||||
union {
|
||||
/* the root's node, used as the starting prev for searches */
struct scoutfs_cwskip_node node;
|
||||
/* pads the union to SCOUTFS_CWSKIP_FULL_NODE_BYTES so node.links[] has room */
__u8 __full_root_node[SCOUTFS_CWSKIP_FULL_NODE_BYTES];
|
||||
};
|
||||
};
|
||||
|
||||
/*
 * Reader state filled in by a search and carried between the
 * scoutfs_cwskip_read_*() calls.
 */
struct scoutfs_cwskip_reader {
|
||||
/* the list being read */
struct scoutfs_cwskip_root *root;
|
||||
/* node before the search position; presumably may be the root's node -- confirm */
struct scoutfs_cwskip_node *prev;
|
||||
/* node at the search position, NULL if there is none */
struct scoutfs_cwskip_node *node;
|
||||
/* sequence sampled for prev; presumably rechecked to detect concurrent writers -- confirm */
unsigned int prev_seq;
|
||||
/* sequence sampled for node; _write_begin passes it to __node_trylock() as the expected seq */
unsigned int node_seq;
|
||||
};
|
||||
|
||||
/*
|
||||
* The full height prevs array makes these pretty enormous :/.
|
||||
*/
|
||||
/*
 * Writer state carried from scoutfs_cwskip_write_begin() through to
 * scoutfs_cwskip_write_end().
 */
struct scoutfs_cwskip_writer {
|
||||
/* the list being modified */
struct scoutfs_cwskip_root *root;
|
||||
/* set to true when the writer locks a node it found (_write_begin, _write_next) */
bool node_locked;
|
||||
/* insert and remove BUG if the node's height exceeds this */
int locked_height;
|
||||
/* current node at the write position; NULL if none or after _write_remove */
struct scoutfs_cwskip_node *node;
|
||||
/* prevs[i] is the node whose links[i] insert and remove rewrite */
struct scoutfs_cwskip_node *prevs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
};
|
||||
|
||||
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
|
||||
unsigned long node_off);
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root);
|
||||
int scoutfs_cwskip_rand_height(void);
|
||||
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd);
|
||||
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr);
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins);
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node);
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr);
|
||||
|
||||
#endif
|
||||
@@ -983,6 +983,9 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
u64 last;
|
||||
s64 ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
/* XXX support more flags */
|
||||
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
@@ -1000,22 +1003,18 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret)
|
||||
goto out_mutex;
|
||||
goto out;
|
||||
|
||||
inode_dio_wait(inode);
|
||||
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
||||
(offset + len > i_size_read(inode))) {
|
||||
ret = inode_newsize_ok(inode, offset + len);
|
||||
if (ret)
|
||||
goto out_extent;
|
||||
goto out;
|
||||
}
|
||||
|
||||
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
@@ -1025,7 +1024,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
|
||||
if (ret)
|
||||
goto out_extent;
|
||||
goto out;
|
||||
|
||||
ret = fallocate_extents(sb, inode, iblock, last, lock);
|
||||
|
||||
@@ -1051,19 +1050,17 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
}
|
||||
|
||||
if (ret <= 0)
|
||||
goto out_extent;
|
||||
goto out;
|
||||
|
||||
iblock += ret;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
out_extent:
|
||||
up_write(&si->extent_sem);
|
||||
out_mutex:
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
up_write(&si->extent_sem);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
out:
|
||||
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -511,7 +511,7 @@ out:
|
||||
else if (ino == 0)
|
||||
inode = NULL;
|
||||
else
|
||||
inode = scoutfs_iget(sb, ino, 0, 0);
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
|
||||
/*
|
||||
* We can't splice dir aliases into the dcache. dir entries
|
||||
@@ -720,7 +720,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
|
||||
struct list_head *ind_locks)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode = NULL;
|
||||
struct inode *inode;
|
||||
u64 ind_seq;
|
||||
int ret = 0;
|
||||
u64 ino;
|
||||
@@ -765,9 +765,11 @@ retry:
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode);
|
||||
if (ret < 0)
|
||||
inode = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_dirty_inode_item(dir, *dir_lock);
|
||||
out:
|
||||
@@ -785,8 +787,6 @@ out_unlock:
|
||||
*orph_lock = NULL;
|
||||
}
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
@@ -1319,11 +1319,11 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
insert_inode_hash(inode);
|
||||
/* XXX need to set i_op/fop before here for sec callbacks */
|
||||
d_instantiate(dentry, inode);
|
||||
inode = NULL;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0) {
|
||||
/* XXX remove inode items */
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock,
|
||||
NULL, name_len);
|
||||
@@ -1334,9 +1334,6 @@ out:
|
||||
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1926,8 +1923,10 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
iput(inode);
|
||||
goto out; /* XXX returning error but items created */
|
||||
}
|
||||
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
si->crtime = inode->i_mtime;
|
||||
@@ -1940,6 +1939,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
|
||||
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
iput(inode);
|
||||
|
||||
out:
|
||||
scoutfs_release_trans(sb);
|
||||
@@ -1948,9 +1948,6 @@ out:
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb,
|
||||
trace_scoutfs_fh_to_dentry(sb, fh_type, sfid);
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type))
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb,
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type) &&
|
||||
fh_type == FILEID_SCOUTFS_WITH_PARENT)
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
|
||||
scoutfs_dir_free_backref_path(sb, &list);
|
||||
trace_scoutfs_get_parent(sb, inode, ino);
|
||||
|
||||
inode = scoutfs_iget(sb, ino, 0, SCOUTFS_IGF_LINKED);
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
|
||||
@@ -395,13 +395,12 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
|
||||
int scoutfs_fence_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_mount_options opts;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct fence_info *fi;
|
||||
int ret;
|
||||
|
||||
/* can only fence if we can be elected by quorum */
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr == -1) {
|
||||
if (opts->quorum_slot_nr == -1) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -494,13 +494,13 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg)
|
||||
{
|
||||
DECLARE_FOREST_INFO(sb, finf);
|
||||
|
||||
return scoutfs_btree_insert_list(sb, finf->alloc, finf->wri,
|
||||
&finf->our_log.item_root, lst);
|
||||
&finf->our_log.item_root, cb, pos, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -29,8 +29,8 @@ void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
|
||||
int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
struct scoutfs_super_block *super,
|
||||
u64 *seq);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg);
|
||||
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
|
||||
|
||||
void scoutfs_forest_inc_inode_count(struct super_block *sb);
|
||||
|
||||
533
kmod/src/inode.c
533
kmod/src/inode.c
@@ -66,6 +66,10 @@ struct inode_sb_info {
|
||||
|
||||
struct delayed_work orphan_scan_dwork;
|
||||
|
||||
/* serialize multiple inode ->evict trying to delete same ino's items */
|
||||
spinlock_t deleting_items_lock;
|
||||
struct list_head deleting_items_list;
|
||||
|
||||
struct work_struct iput_work;
|
||||
struct llist_head iput_llist;
|
||||
};
|
||||
@@ -272,7 +276,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
set_item_info(si, cinode);
|
||||
}
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
|
||||
static void init_inode_key(struct scoutfs_key *key, u64 ino)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
.sk_zone = SCOUTFS_FS_ZONE,
|
||||
@@ -292,7 +296,8 @@ void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
|
||||
* fields because they should have already had a locked refreshed inode
|
||||
* to be dereferencing its contents.
|
||||
*/
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
@@ -312,7 +317,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
if (atomic64_read(&si->last_refreshed) == refresh_gen)
|
||||
return 0;
|
||||
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
mutex_lock(&si->item_mutex);
|
||||
if (atomic64_read(&si->last_refreshed) < refresh_gen) {
|
||||
@@ -658,12 +663,22 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
|
||||
} while (read_seqcount_retry(&si->seqcount, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* We have inversions between getting cluster locks while performing
|
||||
* final deletion on a freeing inode and waiting on a freeing inode
|
||||
* while holding a cluster lock.
|
||||
*
|
||||
* We can avoid these deadlocks by hiding freeing inodes in our hash
|
||||
* lookup function. We're fine with either returning null or populating
|
||||
* a new inode overlapping with eviction freeing a previous instance of
|
||||
* the inode.
|
||||
*/
|
||||
static int scoutfs_iget_test(struct inode *inode, void *arg)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
u64 *ino = arg;
|
||||
|
||||
return si->ino == *ino;
|
||||
return (si->ino == *ino) && !(inode->i_state & I_FREEING);
|
||||
}
|
||||
|
||||
static int scoutfs_iget_set(struct inode *inode, void *arg)
|
||||
@@ -677,93 +692,49 @@ static int scoutfs_iget_set(struct inode *inode, void *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* There's a risk of a deadlock between lock invalidation and eviction.
|
||||
* Invalidation blocks locks while looking up inodes. Eviction blocks
|
||||
* inode lookups while trying to get a lock.
|
||||
*
|
||||
* We have an inode lookup variant which will never block waiting for an
|
||||
* inode. This is more aggressive than base ilookup5_nowait() which
|
||||
* will, you know, wait for inodes that are being freed. We have our
|
||||
* test function hide those inodes from find_inode so that it won't wait
|
||||
* on them.
|
||||
*
|
||||
* These semantics are sufficiently weird that we use a big giant scary
|
||||
* looking function name to deter use.
|
||||
*/
|
||||
static int ilookup_test_nonewfree(struct inode *inode, void *arg)
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
|
||||
{
|
||||
return scoutfs_iget_test(inode, arg) &&
|
||||
!(inode->i_state & (I_NEW | I_WILL_FREE | I_FREEING));
|
||||
}
|
||||
struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino)
|
||||
{
|
||||
return ilookup5_nowait(sb, ino, ilookup_test_nonewfree, &ino);
|
||||
return ilookup5(sb, ino, scoutfs_iget_test, &ino);
|
||||
}
|
||||
|
||||
/*
|
||||
* Final iput can delete an unused inode's items which can take multiple
|
||||
* locked transactions. iget (which can call iput in error cases) and
|
||||
* iput must not be called with locks or transactions held.
|
||||
*/
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf)
|
||||
{
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode = NULL;
|
||||
struct inode *inode;
|
||||
int ret;
|
||||
|
||||
/* wait for vfs inode (I_FREEING in particular) before acquiring cluster lock */
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino);
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
|
||||
&ino);
|
||||
if (!inode) {
|
||||
ret = -ENOMEM;
|
||||
inode = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (inode->i_state & I_NEW) {
|
||||
/* XXX ensure refresh, instead clear in drop_inode? */
|
||||
si = SCOUTFS_I(inode);
|
||||
atomic64_set(&si->last_refreshed, 0);
|
||||
inode->i_version = 0;
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock, 0);
|
||||
if (ret == 0)
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret) {
|
||||
iget_failed(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
} else {
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
}
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* check nlink both for new and after refreshing */
|
||||
if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (inode->i_state & I_NEW) {
|
||||
ret = scoutfs_omap_set(sb, ino);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (ret < 0) {
|
||||
if (inode) {
|
||||
if (inode->i_state & I_NEW)
|
||||
iget_failed(inode);
|
||||
else
|
||||
iput(inode);
|
||||
}
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
@@ -832,7 +803,7 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (!ret)
|
||||
@@ -1051,7 +1022,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list);
|
||||
BUG_ON(ret);
|
||||
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
init_inode_key(&key, ino);
|
||||
|
||||
err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (err) {
|
||||
@@ -1411,14 +1382,10 @@ out:
|
||||
/*
|
||||
* Allocate and initialize a new inode. The caller is responsible for
|
||||
* creating links to it and updating it. @dir can be null.
|
||||
*
|
||||
* This is called with locks and a transaction because it creates the
|
||||
* inode item. We can't call iput on the new inode on error. We
|
||||
* return the inode to the caller *including on error* for them to put
|
||||
* once they've released the transaction.
|
||||
*/
|
||||
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
|
||||
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret)
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev, u64 ino,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_inode_info *si;
|
||||
struct scoutfs_key key;
|
||||
@@ -1427,10 +1394,8 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
|
||||
int ret;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
if (!inode)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
si = SCOUTFS_I(inode);
|
||||
si->ino = ino;
|
||||
@@ -1456,19 +1421,22 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
|
||||
set_inode_ops(inode);
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_omap_set(sb, ino);
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (ret < 0)
|
||||
scoutfs_omap_clear(sb, ino);
|
||||
scoutfs_omap_dec(sb, ino);
|
||||
out:
|
||||
*inode_ret = inode;
|
||||
if (ret) {
|
||||
iput(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return inode;
|
||||
}
|
||||
|
||||
static void init_orphan_key(struct scoutfs_key *key, u64 ino)
|
||||
@@ -1503,6 +1471,44 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
|
||||
return scoutfs_item_delete_force(sb, &key, lock);
|
||||
}
|
||||
|
||||
/*
 * Tracks one in-progress item deletion for an inode number.  Entries
 * are linked on inode_sb_info's deleting_items_list under
 * deleting_items_lock.
 */
struct deleting_ino_entry {
|
||||
/* list linkage on deleting_items_list */
struct list_head head;
|
||||
/* inode number being deleted; left 0 when the entry was never added to the list */
u64 ino;
|
||||
};
|
||||
|
||||
/*
 * Claim the right to delete the items for @ino.  If no entry for @ino
 * is on the deleting list then @del is filled in and appended and we
 * return true.  If another entry already covers @ino then @del is left
 * untouched and we return false.
 */
static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
{
	struct deleting_ino_entry *pos;
	bool found = false;

	spin_lock(&inf->deleting_items_lock);

	/* look for an existing deletion of the same inode number */
	list_for_each_entry(pos, &inf->deleting_items_list, head) {
		found = (pos->ino == ino);
		if (found)
			break;
	}

	/* nobody else is deleting it, record our attempt */
	if (!found) {
		del->ino = ino;
		list_add_tail(&del->head, &inf->deleting_items_list);
	}

	spin_unlock(&inf->deleting_items_lock);

	return !found;
}
|
||||
|
||||
static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
|
||||
{
|
||||
if (del->ino) {
|
||||
spin_lock(&inf->deleting_items_lock);
|
||||
list_del_init(&del->head);
|
||||
spin_unlock(&inf->deleting_items_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all the items associated with a given inode. This is only
|
||||
* called once nlink has dropped to zero and nothing has the inode open
|
||||
@@ -1511,10 +1517,22 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
|
||||
* orphan item will continue triggering attempts to finish previous
|
||||
* partial deletion until all deletion is complete and the orphan item
|
||||
* is removed.
|
||||
*
|
||||
* Currently this can be called multiple times for multiple cached
|
||||
* inodes for a given ino number (ilookup avoids freeing inodes to avoid
|
||||
* cluster lock<->inode flag waiting inversions). Some items are not
|
||||
* safe to delete concurrently, for example concurrent data truncation
|
||||
* could free extents multiple times. We use a very silly list of inos
|
||||
* being deleted. Duplicates just return success. If the first
|
||||
* deletion ends up failing orphan deletion will come back around later
|
||||
* and retry.
|
||||
*/
|
||||
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
|
||||
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
|
||||
struct scoutfs_lock *orph_lock)
|
||||
{
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
struct deleting_ino_entry del = {{NULL, }};
|
||||
struct scoutfs_inode sinode;
|
||||
struct scoutfs_key key;
|
||||
LIST_HEAD(ind_locks);
|
||||
bool release = false;
|
||||
@@ -1523,10 +1541,30 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in
|
||||
u64 size;
|
||||
int ret;
|
||||
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
if (!added_deleting_ino(inf, &del, ino)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mode = le32_to_cpu(sinode->mode);
|
||||
size = le64_to_cpu(sinode->size);
|
||||
init_inode_key(&key, ino);
|
||||
|
||||
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
|
||||
lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* XXX corruption, inode probably won't be freed without repair */
|
||||
if (le32_to_cpu(sinode.nlink)) {
|
||||
scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mode = le32_to_cpu(sinode.mode);
|
||||
size = le64_to_cpu(sinode.size);
|
||||
trace_scoutfs_delete_inode(sb, ino, mode, size);
|
||||
|
||||
/* remove data items in their own transactions */
|
||||
@@ -1544,7 +1582,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in
|
||||
/* then delete the small known number of remaining inode items */
|
||||
retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
prepare_index_deletion(sb, &ind_locks, ino, mode, sinode) ?:
|
||||
prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
@@ -1553,7 +1591,7 @@ retry:
|
||||
|
||||
release = true;
|
||||
|
||||
ret = remove_index_items(sb, ino, sinode, &ind_locks);
|
||||
ret = remove_index_items(sb, ino, &sinode, &ind_locks);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -1563,21 +1601,15 @@ retry:
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* make sure inode item and orphan are deleted together */
|
||||
ret = scoutfs_item_dirty(sb, &key, lock);
|
||||
if (ret < 0)
|
||||
ret = scoutfs_item_delete(sb, &key, lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_item_delete(sb, &key, lock);
|
||||
BUG_ON(ret != 0); /* dirtying should have guaranteed success */
|
||||
|
||||
scoutfs_forest_dec_inode_count(sb);
|
||||
|
||||
if (ret == 0)
|
||||
scoutfs_forest_dec_inode_count(sb);
|
||||
out:
|
||||
del_deleting_ino(inf, &del);
|
||||
if (release)
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
@@ -1585,192 +1617,48 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Per-lock state used to decide whether inodes covered by a cluster
 * lock can be deleted.  It is allocated on demand and stored in the
 * lock's inode_deletion_data pointer.
 */
struct inode_deletion_lock_data {
|
||||
/* tasks wait here while another task has an omap request in flight */
wait_queue_head_t waitq;
|
||||
/* lock write_seq that map is current for; U64_MAX while a request is in flight */
atomic64_t seq;
|
||||
/* open inode map filled in by scoutfs_client_open_ino_map() */
struct scoutfs_open_ino_map map;
|
||||
/* one bit per inode in the group, set while a local deletion attempt is running */
unsigned long trying[DIV_ROUND_UP(SCOUTFS_OPEN_INO_MAP_BITS, BITS_PER_LONG)];
|
||||
};
|
||||
|
||||
/*
|
||||
* Get a lock data struct that has the current omap from this hold of
|
||||
* the lock. The lock data is saved on the lock so it can be used
|
||||
* multiple times until the lock is refreshed. Only one task will send
|
||||
* an omap request at a time, and errors are only returned by each task
|
||||
* as it gets a response to its send.
|
||||
*/
|
||||
static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct inode_deletion_lock_data **ldata_ret, u64 group_nr)
|
||||
{
|
||||
struct inode_deletion_lock_data *ldata;
|
||||
u64 seq;
|
||||
int ret;
|
||||
|
||||
/* we're storing omap maps in locks, they need to cover the same number of inodes */
|
||||
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
|
||||
|
||||
/* allocate a new lock data struct as needed */
|
||||
/* cmpxchg(ptr, NULL, NULL) never changes the pointer, it is only used to read it */
while ((ldata = cmpxchg(&lock->inode_deletion_data, NULL, NULL)) == NULL) {
|
||||
ldata = kzalloc(sizeof(struct inode_deletion_lock_data), GFP_NOFS);
|
||||
if (!ldata) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
atomic64_set(&ldata->seq, lock->write_seq - 1); /* ensure refresh */
|
||||
init_waitqueue_head(&ldata->waitq);
|
||||
|
||||
/* the lock kfrees the inode_deletion_data pointer along with the lock */
|
||||
/* if another task installed its struct first, free ours and loop to pick theirs up */
if (cmpxchg(&lock->inode_deletion_data, NULL, ldata) == NULL)
|
||||
break;
|
||||
else
|
||||
kfree(ldata);
|
||||
}
|
||||
|
||||
/* make sure that the lock's data is current */
|
||||
while ((seq = atomic64_read(&ldata->seq)) != lock->write_seq) {
|
||||
/* whoever swaps seq to U64_MAX owns sending the request; everyone else waits */
if (seq != U64_MAX && atomic64_cmpxchg(&ldata->seq, seq, U64_MAX) == seq) {
|
||||
/* ask the server for current omap */
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
|
||||
/* on failure leave seq stale so that a later caller sends a new request */
if (ret == 0)
|
||||
atomic64_set(&ldata->seq, lock->write_seq);
|
||||
else
|
||||
atomic64_set(&ldata->seq, lock->write_seq - 1);
|
||||
wake_up(&ldata->waitq);
|
||||
/* only the task that sent the request sees its error */
if (ret < 0)
|
||||
goto out;
|
||||
} else {
|
||||
/* wait for someone else who's sent a request */
|
||||
wait_event(ldata->waitq, atomic64_read(&ldata->seq) != U64_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
/* callers get a NULL lock data pointer whenever an error is returned */
if (ret < 0)
|
||||
ldata = NULL;
|
||||
*ldata_ret = ldata;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to delete all the items for an unused inode number. This is the
|
||||
* relatively slow path that uses cluster locks, network requests, and
|
||||
* IO to ensure correctness. Callers should try hard to avoid calling
|
||||
* when there's no work to do.
|
||||
* iput_final has already written out the dirty pages to the inode
|
||||
* before we get here. We're left with a clean inode that we have to
|
||||
* tear down. We use locking and open inode number bitmaps to decide if
|
||||
* we should finally destroy an inode that is no longer open nor
|
||||
* reachable through directory entries.
|
||||
*
|
||||
* Inode references are added under cluster locks. In-memory vfs cache
|
||||
* references are added under read cluster locks and are visible in omap
|
||||
* bitmaps. Directory entry references are added under write cluster
|
||||
* locks and are visible in the inode's nlink. Orphan items exist
|
||||
* whenever nlink == 0 and are maintained under write cluster locks.
|
||||
* Directory entries can be added to an inode with nlink == 0 to
|
||||
* instantiate tmpfile inodes into the name space. Cached inodes will
|
||||
* not be created for inodes with an nlink of 0.
|
||||
*
|
||||
* Combining all this we know that it's safe to delete an inode's items
|
||||
* when we hold an exclusive write cluster lock, the inode has nlink ==
|
||||
* 0, and an omap request protected by the lock doesn't have the inode's
|
||||
* bit set.
|
||||
*
|
||||
* This is called by orphan scanning and vfs inode cache eviction after
|
||||
* they've checked that the inode could really be deleted. We serialize
|
||||
* on a bit in the lock data so that we only have one deletion attempt
|
||||
* per inode under this mount's cluster lock.
|
||||
*/
|
||||
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
	struct inode_deletion_lock_data *ldata = NULL;
	struct scoutfs_lock *orph_lock = NULL;
	struct scoutfs_lock *lock = NULL;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	/* true only once this call has set the inode's bit in ldata->trying */
	bool clear_trying = false;
	u64 group_nr;
	int bit_nr;
	int ret;

	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
	if (ret < 0)
		goto out;

	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);

	ret = get_current_lock_data(sb, lock, &ldata, group_nr);
	if (ret < 0)
		goto out;

	/*
	 * Only one local attempt per inode at a time.  If the bit was
	 * already set then another attempt owns it and we must not
	 * clear it on our way out, otherwise a third caller could start
	 * a concurrent attempt while the owner is still working.
	 */
	if (test_and_set_bit(bit_nr, ldata->trying)) {
		ret = 0;
		goto out;
	}
	clear_trying = true;

	/* can't delete if it's cached in local or remote mounts */
	if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
		ret = 0;
		goto out;
	}

	/* a missing inode item means there's nothing left to delete */
	scoutfs_inode_init_key(&key, ino);
	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock);
	if (ret < 0) {
		if (ret == -ENOENT)
			ret = 0;
		goto out;
	}

	/* a nonzero nlink means the inode is still referenced, leave it alone */
	if (le32_to_cpu(sinode.nlink) > 0) {
		ret = 0;
		goto out;
	}

	ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
	if (ret < 0)
		goto out;

	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
out:
	/* only release the trying bit if this call was the one that set it */
	if (clear_trying)
		clear_bit(bit_nr, ldata->trying);

	/* either lock pointer may still be NULL if we bailed out early */
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

	return ret;
}
|
||||
|
||||
/*
|
||||
* As we drop an inode we need to decide to try and delete its items or
|
||||
* not, which is expensive. The two common cases we want to get right
|
||||
* both have cluster lock coverage and don't want to delete. Dropping
|
||||
* unused inodes during read lock invalidation has the current lock and
|
||||
* sees a nonzero nlink and knows not to delete. Final iput after a
|
||||
* local unlink also has a lock, sees a zero nlink, and tries to perform
|
||||
* item deletion in the task that dropped the last link, as users
|
||||
* expect.
|
||||
*
|
||||
* Evicting an inode outside of cluster locking is the odd slow path
|
||||
* that involves lock contention during the worst cross-mount
|
||||
* open-unlink/delete case.
|
||||
* Because lookup ignores freeing inodes we can get here from multiple
|
||||
* instances of an inode that is being deleted. Orphan scanning in
|
||||
* particular can race with deletion. delete_inode_items() resolves
|
||||
* concurrent attempts.
|
||||
*/
|
||||
void scoutfs_evict_inode(struct inode *inode)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_lock *orph_lock;
|
||||
struct scoutfs_lock *lock;
|
||||
int ret;
|
||||
|
||||
trace_scoutfs_evict_inode(sb, ino, inode->i_nlink, is_bad_inode(inode));
|
||||
trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode),
|
||||
inode->i_nlink, is_bad_inode(inode));
|
||||
|
||||
if (!is_bad_inode(inode)) {
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
if (is_bad_inode(inode))
|
||||
goto clear;
|
||||
|
||||
/* clear before trying to delete tests */
|
||||
scoutfs_omap_clear(sb, ino);
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
|
||||
if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0)
|
||||
try_delete_inode_items(sb, scoutfs_ino(inode));
|
||||
ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
|
||||
if (ret > 0) {
|
||||
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
}
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
|
||||
ret, ino);
|
||||
}
|
||||
|
||||
scoutfs_omap_dec(sb, ino);
|
||||
|
||||
clear:
|
||||
clear_inode(inode);
|
||||
}
|
||||
|
||||
@@ -1846,26 +1734,18 @@ void scoutfs_inode_queue_iput(struct inode *inode)
|
||||
/*
|
||||
* All mounts are performing this work concurrently. We introduce
|
||||
* significant jitter between them to try and keep them from all
|
||||
* bunching up and working on the same inodes. We always try to delay
|
||||
* for at least one jiffy if precision tricks us into calculating no
|
||||
* delay.
|
||||
* bunching up and working on the same inodes.
|
||||
*/
|
||||
void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
|
||||
static void schedule_orphan_dwork(struct inode_sb_info *inf)
|
||||
{
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
struct scoutfs_mount_options opts;
|
||||
unsigned long low;
|
||||
unsigned long high;
|
||||
unsigned long delay;
|
||||
|
||||
#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
|
||||
#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
|
||||
unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
|
||||
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
|
||||
if (!inf->stopped) {
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
low = (opts.orphan_scan_delay_ms * 80) / 100;
|
||||
high = (opts.orphan_scan_delay_ms * 120) / 100;
|
||||
delay = msecs_to_jiffies(low + prandom_u32_max(high - low)) ?: 1;
|
||||
|
||||
mod_delayed_work(system_wq, &inf->orphan_scan_dwork, delay);
|
||||
delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
|
||||
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
|
||||
schedule_delayed_work(&inf->orphan_scan_dwork, delay);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1873,10 +1753,11 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
|
||||
* Find and delete inodes whose only remaining reference is the
|
||||
* persistent orphan item that was created as they were unlinked.
|
||||
*
|
||||
* Orphan items are maintained for inodes that have an nlink of 0.
|
||||
* Typically this is from unlink, but tmpfiles are created with orphans.
|
||||
* They're deleted as the final cached inode is evicted and the inode
|
||||
* items are destroyed.
|
||||
* Orphan items are created as the final directory entry referring to an
|
||||
* inode is deleted. They're deleted as the final cached inode is
|
||||
* evicted and the inode items are destroyed. They can linger if all
|
||||
* the cached inodes pinning the inode fail to delete as they are
|
||||
* evicted from the cache -- either through crashing or errors.
|
||||
*
|
||||
* This work runs in all mounts in the background looking for those
|
||||
* orphaned inodes that weren't fully deleted.
|
||||
@@ -1885,16 +1766,20 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
|
||||
* only find orphan items that made it to the fs root after being merged
|
||||
* from a mount's log btree. This naturally avoids orphan items that
|
||||
* exist while inodes have been unlinked but are still cached, including
|
||||
* tmpfile inodes that are actively used during normal operations.
|
||||
* O_TMPFILE inodes that are actively used during normal operations.
|
||||
* Scanning the read-only persistent fs root uses cached blocks and
|
||||
* avoids the lock contention we'd cause if we tried to use the
|
||||
* consistent item cache. The downside is that it adds a bit of
|
||||
* latency.
|
||||
* latency. If an orphan was created in error it'll take until the
|
||||
* mount's log btree is finalized and merged. A crash will have the log
|
||||
* btree merged after it is fenced.
|
||||
*
|
||||
* Once we find candidate orphan items we first check our local omap for
|
||||
* a locally cached inode. Then we ask the server for the open map
|
||||
* containing the inode. Only if we don't see any cached users do we do
|
||||
* the expensive work of acquiring locks to try and delete the items.
|
||||
* Once we find candidate orphan items we can first check our local
|
||||
* inode cache for inodes that are already on their way to eviction and
|
||||
* can be skipped. Then we ask the server for the open map containing
|
||||
* the inode. Only if we don't have it cached, and no one else does, do
|
||||
* we try and read it into our cache and evict it to trigger the final
|
||||
* inode deletion process.
|
||||
*/
|
||||
static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
{
|
||||
@@ -1906,6 +1791,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_key last;
|
||||
struct scoutfs_key key;
|
||||
struct inode *inode;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
u64 ino;
|
||||
@@ -1944,14 +1830,17 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
scoutfs_inc_counter(sb, orphan_scan_item);
|
||||
ino = le64_to_cpu(key.sko_ino);
|
||||
|
||||
/* locally cached inodes will try to delete as they evict */
|
||||
if (scoutfs_omap_test(sb, ino)) {
|
||||
/* locally cached inodes will already be deleted */
|
||||
inode = scoutfs_ilookup(sb, ino);
|
||||
if (inode) {
|
||||
scoutfs_inc_counter(sb, orphan_scan_cached);
|
||||
iput(inode);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* get an omap that covers the orphaned ino */
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
|
||||
bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
|
||||
|
||||
if (le64_to_cpu(omap.args.group_nr) != group_nr) {
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
|
||||
@@ -1959,15 +1848,25 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* remote cached inodes will also try to delete */
|
||||
/* don't need to evict if someone else has it open (cached) */
|
||||
if (test_bit_le(bit_nr, omap.bits)) {
|
||||
scoutfs_inc_counter(sb, orphan_scan_omap_set);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* seemingly orphaned and unused, get locks and check for sure */
|
||||
scoutfs_inc_counter(sb, orphan_scan_attempts);
|
||||
ret = try_delete_inode_items(sb, ino);
|
||||
/* try to cache and evict unused inode to delete, can be racing */
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ENOENT)
|
||||
continue;
|
||||
else
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inc_counter(sb, orphan_scan_read);
|
||||
SCOUTFS_I(inode)->drop_invalidated = true;
|
||||
iput(inode);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
@@ -1976,7 +1875,7 @@ out:
|
||||
if (ret < 0)
|
||||
scoutfs_inc_counter(sb, orphan_scan_error);
|
||||
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
schedule_orphan_dwork(inf);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2084,6 +1983,8 @@ int scoutfs_inode_setup(struct super_block *sb)
|
||||
spin_lock_init(&inf->dir_ino_alloc.lock);
|
||||
spin_lock_init(&inf->ino_alloc.lock);
|
||||
INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
|
||||
spin_lock_init(&inf->deleting_items_lock);
|
||||
INIT_LIST_HEAD(&inf->deleting_items_list);
|
||||
INIT_WORK(&inf->iput_work, iput_worker);
|
||||
init_llist_head(&inf->iput_llist);
|
||||
|
||||
@@ -2099,7 +2000,9 @@ int scoutfs_inode_setup(struct super_block *sb)
|
||||
*/
|
||||
void scoutfs_inode_start(struct super_block *sb)
|
||||
{
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
|
||||
schedule_orphan_dwork(inf);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -80,13 +80,9 @@ int scoutfs_drop_inode(struct inode *inode);
|
||||
void scoutfs_evict_inode(struct inode *inode);
|
||||
void scoutfs_inode_queue_iput(struct inode *inode);
|
||||
|
||||
#define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
|
||||
struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino);
|
||||
struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino);
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf);
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
|
||||
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino);
|
||||
void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
|
||||
u32 minor, u64 ino);
|
||||
int scoutfs_inode_index_start(struct super_block *sb, u64 *seq);
|
||||
@@ -106,8 +102,9 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
struct list_head *ind_locks);
|
||||
|
||||
int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
|
||||
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
|
||||
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret);
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev, u64 ino,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
void scoutfs_inode_set_meta_seq(struct inode *inode);
|
||||
void scoutfs_inode_set_data_seq(struct inode *inode);
|
||||
@@ -120,14 +117,14 @@ u64 scoutfs_inode_data_version(struct inode *inode);
|
||||
void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off);
|
||||
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags);
|
||||
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
|
||||
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb);
|
||||
|
||||
void scoutfs_inode_queue_writeback(struct inode *inode);
|
||||
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
|
||||
|
||||
@@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
|
||||
if (sblock > eblock)
|
||||
return -EINVAL;
|
||||
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino);
|
||||
inode = scoutfs_ilookup(sb, args.ino);
|
||||
if (!inode) {
|
||||
ret = -ESTALE;
|
||||
goto out;
|
||||
@@ -1320,84 +1320,6 @@ out:
|
||||
return ret ?: count;
|
||||
}
|
||||
|
||||
static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct super_block *sb = file_inode(file)->i_sb;
|
||||
struct scoutfs_ioctl_get_allocated_inos __user *ugai = (void __user *)arg;
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_key key;
|
||||
struct scoutfs_key end;
|
||||
u64 __user *uinos;
|
||||
u64 bytes;
|
||||
u64 ino;
|
||||
int nr;
|
||||
int ret;
|
||||
|
||||
if (!(file->f_mode & FMODE_READ)) {
|
||||
ret = -EBADF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (copy_from_user(&gai, ugai, sizeof(gai))) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((gai.inos_ptr & (sizeof(__u64) - 1)) || (gai.inos_bytes < sizeof(__u64))) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inode_init_key(&key, gai.start_ino);
|
||||
scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
|
||||
uinos = (void __user *)gai.inos_ptr;
|
||||
bytes = gai.inos_bytes;
|
||||
nr = 0;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
while (bytes >= sizeof(*uinos)) {
|
||||
|
||||
ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (key.sk_zone != SCOUTFS_FS_ZONE) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* all fs items are owned by allocated inodes, and _first is always ino */
|
||||
ino = le64_to_cpu(key._sk_first);
|
||||
if (put_user(ino, uinos)) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
uinos++;
|
||||
bytes -= sizeof(*uinos);
|
||||
if (++nr == INT_MAX)
|
||||
break;
|
||||
|
||||
scoutfs_inode_init_key(&key, ino + 1);
|
||||
}
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
out:
|
||||
return ret ?: nr;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -1431,8 +1353,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_resize_devices(file, arg);
|
||||
case SCOUTFS_IOC_READ_XATTR_TOTALS:
|
||||
return scoutfs_ioc_read_xattr_totals(file, arg);
|
||||
case SCOUTFS_IOC_GET_ALLOCATED_INOS:
|
||||
return scoutfs_ioc_get_allocated_inos(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
@@ -520,43 +520,4 @@ struct scoutfs_ioctl_xattr_total {
|
||||
#define SCOUTFS_IOC_READ_XATTR_TOTALS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 15, struct scoutfs_ioctl_read_xattr_totals)
|
||||
|
||||
/*
|
||||
* This fills the caller's inos array with inode numbers that are in use
|
||||
* after the start ino, within an internal inode group.
|
||||
*
|
||||
* This only makes a promise about the state of the inode numbers within
|
||||
* the first and last numbers returned by one call. At one time, all of
|
||||
* those inodes were still allocated. They could have changed before
|
||||
* the call returned. And any numbers outside of the first and last
|
||||
* (or single) are undefined.
|
||||
*
|
||||
* This doesn't iterate over all allocated inodes, it only probes a
|
||||
* single group that the start inode is within. This interface was
|
||||
* first introduced to support tests that needed to find out about a
|
||||
* specific inode, while having some other similarly niche uses. It is
|
||||
* unsuitable for a consistent iteration over all the inode numbers in
|
||||
* use.
|
||||
*
|
||||
* This test of inode items doesn't serialize with the inode lifetime
|
||||
* mechanism. It only tells you the numbers of inodes that were once
|
||||
* active in the system and haven't yet been fully deleted. The inode
|
||||
* numbers returned could have been in the process of being deleted and
|
||||
* were already unreachable even before the call started.
|
||||
*
|
||||
* @start_ino: the first inode number that could be returned
|
||||
* @inos_ptr: pointer to an aligned array of 64bit inode numbers
|
||||
* @inos_bytes: the number of bytes available in the inos_ptr array
|
||||
*
|
||||
* Returns errors or the count of inode numbers returned, quite possibly
|
||||
* including 0.
|
||||
*/
|
||||
struct scoutfs_ioctl_get_allocated_inos {
|
||||
__u64 start_ino;
|
||||
__u64 inos_ptr;
|
||||
__u64 inos_bytes;
|
||||
};
|
||||
|
||||
#define SCOUTFS_IOC_GET_ALLOCATED_INOS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 16, struct scoutfs_ioctl_get_allocated_inos)
|
||||
|
||||
#endif
|
||||
|
||||
2996
kmod/src/item.c
2996
kmod/src/item.c
File diff suppressed because it is too large
Load Diff
@@ -26,7 +26,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
u64 scoutfs_item_dirty_pages(struct super_block *sb);
|
||||
u64 scoutfs_item_dirty_bytes(struct super_block *sb);
|
||||
int scoutfs_item_write_dirty(struct super_block *sb);
|
||||
int scoutfs_item_write_done(struct super_block *sb);
|
||||
bool scoutfs_item_range_cached(struct super_block *sb,
|
||||
|
||||
@@ -142,7 +142,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode;
|
||||
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, ino);
|
||||
inode = scoutfs_ilookup(sb, ino);
|
||||
if (inode) {
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
@@ -255,7 +255,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
|
||||
BUG_ON(!list_empty(&lock->shrink_head));
|
||||
BUG_ON(!list_empty(&lock->cov_list));
|
||||
|
||||
kfree(lock->inode_deletion_data);
|
||||
scoutfs_omap_free_lock_data(lock->omap_data);
|
||||
kfree(lock);
|
||||
}
|
||||
|
||||
@@ -291,6 +291,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
|
||||
lock->mode = SCOUTFS_LOCK_NULL;
|
||||
|
||||
atomic64_set(&lock->forest_bloom_nr, 0);
|
||||
spin_lock_init(&lock->omap_spinlock);
|
||||
|
||||
trace_scoutfs_lock_alloc(sb, lock);
|
||||
|
||||
@@ -1049,7 +1050,7 @@ int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int
|
||||
goto out;
|
||||
|
||||
if (flags & SCOUTFS_LKF_REFRESH_INODE) {
|
||||
ret = scoutfs_inode_refresh(inode, *lock);
|
||||
ret = scoutfs_inode_refresh(inode, *lock, flags);
|
||||
if (ret < 0) {
|
||||
scoutfs_unlock(sb, *lock, mode);
|
||||
*lock = NULL;
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID
|
||||
|
||||
struct inode_deletion_lock_data;
|
||||
struct scoutfs_omap_lock;
|
||||
|
||||
/*
|
||||
* A few fields (start, end, refresh_gen, write_seq, granted_mode)
|
||||
@@ -47,8 +47,9 @@ struct scoutfs_lock {
|
||||
/* the forest tracks which log tree last saw bloom bit updates */
|
||||
atomic64_t forest_bloom_nr;
|
||||
|
||||
/* inode deletion tracks some state per lock */
|
||||
struct inode_deletion_lock_data *inode_deletion_data;
|
||||
/* open ino mapping has a valid map for a held write lock */
|
||||
spinlock_t omap_spinlock;
|
||||
struct scoutfs_omap_lock_data *omap_data;
|
||||
};
|
||||
|
||||
struct scoutfs_lock_coverage {
|
||||
|
||||
@@ -153,30 +153,30 @@ enum {
|
||||
*/
|
||||
static void add_client_entry(struct server_lock_node *snode,
|
||||
struct list_head *list,
|
||||
struct client_lock_entry *c_ent)
|
||||
struct client_lock_entry *clent)
|
||||
{
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
if (list_empty(&c_ent->head))
|
||||
list_add_tail(&c_ent->head, list);
|
||||
if (list_empty(&clent->head))
|
||||
list_add_tail(&clent->head, list);
|
||||
else
|
||||
list_move_tail(&c_ent->head, list);
|
||||
list_move_tail(&clent->head, list);
|
||||
|
||||
c_ent->on_list = list == &snode->granted ? OL_GRANTED :
|
||||
clent->on_list = list == &snode->granted ? OL_GRANTED :
|
||||
list == &snode->requested ? OL_REQUESTED :
|
||||
OL_INVALIDATED;
|
||||
}
|
||||
|
||||
static void free_client_entry(struct lock_server_info *inf,
|
||||
struct server_lock_node *snode,
|
||||
struct client_lock_entry *c_ent)
|
||||
struct client_lock_entry *clent)
|
||||
{
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
if (!list_empty(&c_ent->head))
|
||||
list_del_init(&c_ent->head);
|
||||
scoutfs_tseq_del(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
kfree(c_ent);
|
||||
if (!list_empty(&clent->head))
|
||||
list_del_init(&clent->head);
|
||||
scoutfs_tseq_del(&inf->tseq_tree, &clent->tseq_entry);
|
||||
kfree(clent);
|
||||
}
|
||||
|
||||
static bool invalid_mode(u8 mode)
|
||||
@@ -339,13 +339,13 @@ static struct client_lock_entry *find_entry(struct server_lock_node *snode,
|
||||
struct list_head *list,
|
||||
u64 rid)
|
||||
{
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
|
||||
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
|
||||
|
||||
list_for_each_entry(c_ent, list, head) {
|
||||
if (c_ent->rid == rid)
|
||||
return c_ent;
|
||||
list_for_each_entry(clent, list, head) {
|
||||
if (clent->rid == rid)
|
||||
return clent;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@@ -364,7 +364,7 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
|
||||
u64 net_id, struct scoutfs_net_lock *nl)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
int ret;
|
||||
|
||||
@@ -376,29 +376,29 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
|
||||
goto out;
|
||||
}
|
||||
|
||||
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!c_ent) {
|
||||
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!clent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&c_ent->head);
|
||||
c_ent->rid = rid;
|
||||
c_ent->net_id = net_id;
|
||||
c_ent->mode = nl->new_mode;
|
||||
INIT_LIST_HEAD(&clent->head);
|
||||
clent->rid = rid;
|
||||
clent->net_id = net_id;
|
||||
clent->mode = nl->new_mode;
|
||||
|
||||
snode = alloc_server_lock(inf, &nl->key);
|
||||
if (snode == NULL) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
snode->stats[SLT_REQUEST]++;
|
||||
|
||||
c_ent->snode = snode;
|
||||
add_client_entry(snode, &snode->requested, c_ent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
clent->snode = snode;
|
||||
add_client_entry(snode, &snode->requested, clent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
|
||||
|
||||
ret = process_waiting_requests(sb, snode);
|
||||
out:
|
||||
@@ -417,7 +417,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
|
||||
struct scoutfs_net_lock *nl)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
int ret;
|
||||
|
||||
@@ -438,18 +438,18 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
|
||||
|
||||
snode->stats[SLT_RESPONSE]++;
|
||||
|
||||
c_ent = find_entry(snode, &snode->invalidated, rid);
|
||||
if (!c_ent) {
|
||||
clent = find_entry(snode, &snode->invalidated, rid);
|
||||
if (!clent) {
|
||||
put_server_lock(inf, snode);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (nl->new_mode == SCOUTFS_LOCK_NULL) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
free_client_entry(inf, snode, clent);
|
||||
} else {
|
||||
c_ent->mode = nl->new_mode;
|
||||
add_client_entry(snode, &snode->granted, c_ent);
|
||||
clent->mode = nl->new_mode;
|
||||
add_client_entry(snode, &snode->granted, clent);
|
||||
}
|
||||
|
||||
ret = process_waiting_requests(sb, snode);
|
||||
@@ -632,7 +632,7 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *existing;
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct server_lock_node *snode;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
@@ -652,35 +652,35 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
|
||||
}
|
||||
|
||||
for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
|
||||
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!c_ent) {
|
||||
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
|
||||
if (!clent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&c_ent->head);
|
||||
c_ent->rid = rid;
|
||||
c_ent->net_id = 0;
|
||||
c_ent->mode = nlr->locks[i].new_mode;
|
||||
INIT_LIST_HEAD(&clent->head);
|
||||
clent->rid = rid;
|
||||
clent->net_id = 0;
|
||||
clent->mode = nlr->locks[i].new_mode;
|
||||
|
||||
snode = alloc_server_lock(inf, &nlr->locks[i].key);
|
||||
if (snode == NULL) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
existing = find_entry(snode, &snode->granted, rid);
|
||||
if (existing) {
|
||||
kfree(c_ent);
|
||||
kfree(clent);
|
||||
put_server_lock(inf, snode);
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
|
||||
c_ent->snode = snode;
|
||||
add_client_entry(snode, &snode->granted, c_ent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
|
||||
clent->snode = snode;
|
||||
add_client_entry(snode, &snode->granted, clent);
|
||||
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
|
||||
|
||||
put_server_lock(inf, snode);
|
||||
|
||||
@@ -707,7 +707,7 @@ out:
|
||||
int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct client_lock_entry *tmp;
|
||||
struct server_lock_node *snode;
|
||||
struct scoutfs_key key;
|
||||
@@ -724,9 +724,9 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
|
||||
(list == &snode->requested) ? &snode->invalidated :
|
||||
NULL) {
|
||||
|
||||
list_for_each_entry_safe(c_ent, tmp, list, head) {
|
||||
if (c_ent->rid == rid) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
list_for_each_entry_safe(clent, tmp, list, head) {
|
||||
if (clent->rid == rid) {
|
||||
free_client_entry(inf, snode, clent);
|
||||
freed = true;
|
||||
}
|
||||
}
|
||||
@@ -749,7 +749,7 @@ out:
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "lock server err %d during client rid %016llx farewell, shutting down",
|
||||
ret, rid);
|
||||
scoutfs_server_stop(sb);
|
||||
scoutfs_server_abort(sb);
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -787,15 +787,15 @@ static char *lock_on_list_string(u8 on_list)
|
||||
static void lock_server_tseq_show(struct seq_file *m,
|
||||
struct scoutfs_tseq_entry *ent)
|
||||
{
|
||||
struct client_lock_entry *c_ent = container_of(ent,
|
||||
struct client_lock_entry *clent = container_of(ent,
|
||||
struct client_lock_entry,
|
||||
tseq_entry);
|
||||
struct server_lock_node *snode = c_ent->snode;
|
||||
struct server_lock_node *snode = clent->snode;
|
||||
|
||||
seq_printf(m, SK_FMT" %s %s rid %016llx net_id %llu\n",
|
||||
SK_ARG(&snode->key), lock_mode_string(c_ent->mode),
|
||||
lock_on_list_string(c_ent->on_list), c_ent->rid,
|
||||
c_ent->net_id);
|
||||
SK_ARG(&snode->key), lock_mode_string(clent->mode),
|
||||
lock_on_list_string(clent->on_list), clent->rid,
|
||||
clent->net_id);
|
||||
}
|
||||
|
||||
static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
|
||||
@@ -857,7 +857,7 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
|
||||
DECLARE_LOCK_SERVER_INFO(sb, inf);
|
||||
struct server_lock_node *snode;
|
||||
struct server_lock_node *stmp;
|
||||
struct client_lock_entry *c_ent;
|
||||
struct client_lock_entry *clent;
|
||||
struct client_lock_entry *ctmp;
|
||||
LIST_HEAD(list);
|
||||
|
||||
@@ -873,8 +873,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
|
||||
list_splice_init(&snode->invalidated, &list);
|
||||
|
||||
mutex_lock(&snode->mutex);
|
||||
list_for_each_entry_safe(c_ent, ctmp, &list, head) {
|
||||
free_client_entry(inf, snode, c_ent);
|
||||
list_for_each_entry_safe(clent, ctmp, &list, head) {
|
||||
free_client_entry(inf, snode, clent);
|
||||
}
|
||||
mutex_unlock(&snode->mutex);
|
||||
|
||||
|
||||
@@ -1292,7 +1292,7 @@ restart:
|
||||
if (ret) {
|
||||
scoutfs_err(sb, "client fence returned err %d, shutting down server",
|
||||
ret);
|
||||
scoutfs_server_stop(sb);
|
||||
scoutfs_server_abort(sb);
|
||||
}
|
||||
}
|
||||
destroy_conn(acc);
|
||||
@@ -1772,6 +1772,23 @@ int scoutfs_net_response_node(struct super_block *sb,
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* The response function that was submitted with the request is not
|
||||
* called if the request is canceled here.
|
||||
*/
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id)
|
||||
{
|
||||
struct message_send *msend;
|
||||
|
||||
spin_lock(&conn->lock);
|
||||
msend = find_request(conn, cmd, id);
|
||||
if (msend)
|
||||
complete_send(conn, msend);
|
||||
spin_unlock(&conn->lock);
|
||||
}
|
||||
|
||||
struct sync_request_completion {
|
||||
struct completion comp;
|
||||
void *resp;
|
||||
|
||||
@@ -134,6 +134,9 @@ int scoutfs_net_submit_request_node(struct super_block *sb,
|
||||
u64 rid, u8 cmd, void *arg, u16 arg_len,
|
||||
scoutfs_net_response_t resp_func,
|
||||
void *resp_data, u64 *id_ret);
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id);
|
||||
int scoutfs_net_sync_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, void *arg, unsigned arg_len,
|
||||
|
||||
296
kmod/src/omap.c
296
kmod/src/omap.c
@@ -30,22 +30,27 @@
|
||||
/*
|
||||
* As a client removes an inode from its cache with an nlink of 0 it
|
||||
* needs to decide if it is the last client using the inode and should
|
||||
* fully delete all the inode's items. It needs to know if other mounts
|
||||
* still have the inode in use.
|
||||
* fully delete all its items. It needs to know if other mounts still
|
||||
* have the inode in use.
|
||||
*
|
||||
* We need a way to communicate between mounts that an inode is in use.
|
||||
* We need a way to communicate between mounts that an inode is open.
|
||||
* We don't want to pay the synchronous per-file locking round trip
|
||||
* costs associated with per-inode open locks that you'd typically see
|
||||
* in systems to solve this problem. The first prototypes of this
|
||||
* tracked open file handles so this was coined the open map, though it
|
||||
* now tracks cached inodes.
|
||||
* in systems to solve this problem.
|
||||
*
|
||||
* Clients maintain bitmaps that cover groups of inodes. As inodes
|
||||
* enter the cache their bit is set and as the inode is evicted the bit
|
||||
* is cleared. As deletion is attempted, either by scanning orphans or
|
||||
* evicting an inode with an nlink of 0, messages are sent around the
|
||||
* cluster to get the current bitmaps for that inode's group from all
|
||||
* active mounts. If the inode's bit is clear then it can be deleted.
|
||||
* Instead clients maintain open bitmaps that cover groups of inodes.
|
||||
* As inodes enter the cache their bit is set, and as the inode is
|
||||
* evicted the bit is cleared. As an inode is evicted messages are sent
|
||||
* around the cluster to get the current bitmaps for that inode's group
|
||||
* from all active mounts. If the inode's bit is clear then it can be
|
||||
* deleted.
|
||||
*
|
||||
* We associate the open bitmaps with our cluster locking of inode
|
||||
* groups to cache these open bitmaps. As long as we have the lock then
|
||||
* nlink can't be changed on any remote mounts. Specifically, it can't
|
||||
* increase from 0 so any clear bits can't gain references on remote
|
||||
* mounts. As long as we have the lock, all clear bits in the group for
|
||||
* inodes with 0 nlink can be deleted.
|
||||
*
|
||||
* This layer maintains a list of client rids to send messages to. The
|
||||
* server calls us as clients enter and leave the cluster. We can't
|
||||
@@ -80,12 +85,14 @@ struct omap_info {
|
||||
struct omap_info *name = SCOUTFS_SB(sb)->omap_info
|
||||
|
||||
/*
|
||||
* The presence of an inode in the inode sets its bit in the lock
|
||||
* group's bitmap.
|
||||
* The presence of an inode in the inode cache increases the count of
|
||||
* its inode number's position within its lock group. These structs
|
||||
* track the counts for all the inodes in a lock group and maintain a
|
||||
* bitmap whose bits are set for each non-zero count.
|
||||
*
|
||||
* We don't want to add additional global synchronization of inode cache
|
||||
* maintenance so these are tracked in an rcu hash table. Once their
|
||||
* total reaches zero they're removed from the hash and queued for
|
||||
* total count reaches zero they're removed from the hash and queued for
|
||||
* freeing and readers should ignore them.
|
||||
*/
|
||||
struct omap_group {
|
||||
@@ -95,6 +102,7 @@ struct omap_group {
|
||||
u64 nr;
|
||||
spinlock_t lock;
|
||||
unsigned int total;
|
||||
unsigned int *counts;
|
||||
__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
|
||||
};
|
||||
|
||||
@@ -103,7 +111,8 @@ do { \
|
||||
__typeof__(group) _grp = (group); \
|
||||
__typeof__(bit_nr) _nr = (bit_nr); \
|
||||
\
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr); \
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr, \
|
||||
_nr < 0 ? -1 : _grp->counts[_nr]); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
@@ -125,6 +134,18 @@ struct omap_request {
|
||||
struct scoutfs_open_ino_map map;
|
||||
};
|
||||
|
||||
/*
|
||||
* In each inode group cluster lock we store data to track the open ino
|
||||
* map which tracks all the inodes that the cluster lock covers. When
|
||||
* the seq shows that the map is stale we send a request to update it.
|
||||
*/
|
||||
struct scoutfs_omap_lock_data {
|
||||
u64 seq; /* compared with the cluster lock's write_seq; a mismatch means the map must be refreshed */
|
||||
bool req_in_flight; /* set while one task is requesting a fresh map so only one request is sent at a time */
|
||||
wait_queue_head_t waitq; /* other tasks wait here until req_in_flight is cleared */
|
||||
struct scoutfs_open_ino_map map; /* open ino map filled in by scoutfs_client_open_ino_map() */
|
||||
};
|
||||
|
||||
static inline void init_rid_list(struct omap_rid_list *list)
|
||||
{
|
||||
INIT_LIST_HEAD(&list->head);
|
||||
@@ -211,7 +232,7 @@ static void free_rids(struct omap_rid_list *list)
|
||||
}
|
||||
}
|
||||
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
static void calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
{
|
||||
*group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
|
||||
*bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
|
||||
@@ -221,13 +242,21 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
{
|
||||
struct omap_group *group;
|
||||
|
||||
BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE);
|
||||
|
||||
group = kzalloc(sizeof(struct omap_group), GFP_NOFS);
|
||||
if (group) {
|
||||
group->sb = sb;
|
||||
group->nr = group_nr;
|
||||
spin_lock_init(&group->lock);
|
||||
|
||||
trace_group(sb, alloc, group, -1);
|
||||
group->counts = (void *)get_zeroed_page(GFP_NOFS);
|
||||
if (!group->counts) {
|
||||
kfree(group);
|
||||
group = NULL;
|
||||
} else {
|
||||
trace_group(sb, alloc, group, -1);
|
||||
}
|
||||
}
|
||||
|
||||
return group;
|
||||
@@ -236,6 +265,7 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
static void free_group(struct super_block *sb, struct omap_group *group)
|
||||
{
|
||||
trace_group(sb, free, group, -1);
|
||||
free_page((unsigned long)group->counts);
|
||||
kfree(group);
|
||||
}
|
||||
|
||||
@@ -253,16 +283,13 @@ static const struct rhashtable_params group_ht_params = {
|
||||
};
|
||||
|
||||
/*
|
||||
* Track a cached inode in its group.  Our set can be racing with a
|
||||
* final clear that removes the group from the hash, sets total to
|
||||
* Track a cached inode in its group.  Our increment can be racing with
|
||||
* a final decrement that removes the group from the hash, sets total to
|
||||
* UINT_MAX, and calls rcu free. We can retry until the dead group is
|
||||
* no longer visible in the hash table and we can insert a new allocated
|
||||
* group.
|
||||
*
|
||||
* The caller must ensure that the bit is clear, -EEXIST will be
|
||||
* returned otherwise.
|
||||
*/
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino)
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
@@ -271,7 +298,7 @@ int scoutfs_omap_set(struct super_block *sb, u64 ino)
|
||||
bool found;
|
||||
int ret = 0;
|
||||
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
retry:
|
||||
found = false;
|
||||
@@ -281,10 +308,10 @@ retry:
|
||||
spin_lock(&group->lock);
|
||||
if (group->total < UINT_MAX) {
|
||||
found = true;
|
||||
if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits)))
|
||||
ret = -EEXIST;
|
||||
else
|
||||
if (group->counts[bit_nr]++ == 0) {
|
||||
set_bit_le(bit_nr, group->bits);
|
||||
group->total++;
|
||||
}
|
||||
}
|
||||
trace_group(sb, inc, group, bit_nr);
|
||||
spin_unlock(&group->lock);
|
||||
@@ -315,50 +342,29 @@ retry:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Report whether the bit for the given inode number is currently set in
 * its group's bitmap.  Returns false when no group for the inode is
 * found in the hash table.  The lookup runs under rcu_read_lock() and
 * the bit is sampled under the group's spinlock.
 */
bool scoutfs_omap_test(struct super_block *sb, u64 ino)
{
	DECLARE_OMAP_INFO(sb, ominf);
	struct omap_group *grp;
	bool is_set = false;
	u64 group_nr;
	int bit_nr;

	scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);

	rcu_read_lock();

	grp = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
	if (grp != NULL) {
		spin_lock(&grp->lock);
		is_set = test_bit_le(bit_nr, grp->bits) != 0;
		spin_unlock(&grp->lock);
	}

	rcu_read_unlock();

	return is_set;
}
|
||||
|
||||
/*
|
||||
* Clear a previously set ino bit. Trying to clear a bit that's already
|
||||
* clear implies imbalanced set/clear or bugs freeing groups. We only
|
||||
* free groups here as the last clear drops the group's total to 0.
|
||||
* Decrement a previously incremented ino count. Not finding a count
|
||||
* implies imbalanced inc/dec or bugs freeing groups. We only free
|
||||
* groups here as the last dec drops the group's total count to 0.
|
||||
*/
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino)
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
rcu_read_lock();
|
||||
group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
|
||||
if (group) {
|
||||
spin_lock(&group->lock);
|
||||
WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits));
|
||||
WARN_ON_ONCE(group->counts[bit_nr] == 0);
|
||||
WARN_ON_ONCE(group->total == 0);
|
||||
WARN_ON_ONCE(group->total == UINT_MAX);
|
||||
if (test_and_clear_bit_le(bit_nr, group->bits)) {
|
||||
if (--group->counts[bit_nr] == 0) {
|
||||
clear_bit_le(bit_nr, group->bits);
|
||||
if (--group->total == 0) {
|
||||
group->total = UINT_MAX;
|
||||
rhashtable_remove_fast(&ominf->group_ht, &group->ht_head,
|
||||
@@ -658,7 +664,8 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id,
|
||||
|
||||
/*
|
||||
* The client is receiving a request from the server for its map for the
|
||||
* given group. Look up the group and copy the bits to the map.
|
||||
* given group. Look up the group and copy the bits to the map for
|
||||
* non-zero open counts.
|
||||
*
|
||||
* The mount originating the request for this bitmap has the inode group
|
||||
* write locked. We can't be adding links to any inodes in the group
|
||||
@@ -807,6 +814,179 @@ void scoutfs_omap_server_shutdown(struct super_block *sb)
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
bool in_flight;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
in_flight = ldata->req_in_flight;
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
return in_flight;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure the map covered by the cluster lock is current. The caller
|
||||
* holds the cluster lock so once we store lock_data on the cluster lock
|
||||
* it won't be freed and the write_seq in the cluster lock won't change.
|
||||
*
|
||||
* The omap_spinlock protects the omap_data in the cluster lock. We
|
||||
* have to drop it if we have to block to allocate lock_data, send a
|
||||
* request for a new map, or wait for a request in flight to finish.
|
||||
*/
|
||||
static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr)
|
||||
{
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
bool send_req;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
ldata = lock->omap_data;
|
||||
if (ldata == NULL) {
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS);
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
if (!ldata) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (lock->omap_data == NULL) {
|
||||
ldata->seq = lock->write_seq - 1; /* ensure refresh */
|
||||
init_waitqueue_head(&ldata->waitq);
|
||||
|
||||
lock->omap_data = ldata;
|
||||
} else {
|
||||
kfree(ldata);
|
||||
ldata = lock->omap_data;
|
||||
}
|
||||
}
|
||||
|
||||
while (ldata->seq != lock->write_seq) {
|
||||
/* only one waiter sends a request at a time */
|
||||
if (!ldata->req_in_flight) {
|
||||
ldata->req_in_flight = true;
|
||||
send_req = true;
|
||||
} else {
|
||||
send_req = false;
|
||||
}
|
||||
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
if (send_req)
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
|
||||
else
|
||||
wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata));
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
/* only sender can return error, other waiters retry */
|
||||
if (send_req) {
|
||||
ldata->req_in_flight = false;
|
||||
if (ret == 0)
|
||||
ldata->seq = lock->write_seq;
|
||||
wake_up(&ldata->waitq);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
if (ret == 0)
|
||||
*ldata_ret = ldata;
|
||||
else
|
||||
*ldata_ret = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return 1 and give the caller their locks when they should delete the
|
||||
* inode items. It's safe to delete the inode items when it is no
|
||||
* longer reachable and nothing is referencing it.
|
||||
*
|
||||
* The inode is unreachable when nlink hits zero. Cluster locks protect
|
||||
* modification and testing of nlink. We use the ino_lock_cov covrage
|
||||
* to short circuit the common case of having a locked inode that hasn't
|
||||
* been deleted. If it isn't locked, we have to acquire the lock to
|
||||
* refresh the inode to see its current nlink.
|
||||
*
|
||||
* Then we use an open inode bitmap that covers all the inodes in the
|
||||
* lock group to determine if the inode is present in any other mount's
|
||||
* caches. We refresh it by asking the server for all clients' maps and
|
||||
* then store it in the lock. As long as we hold the lock nothing can
|
||||
* increase nlink from zero and let people get a reference to the inode.
|
||||
*/
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
/* lock group and omap constants are defined independently */
|
||||
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
|
||||
|
||||
if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
/* only one request to refresh the map at a time */
|
||||
ret = get_current_lock_data(sb, lock, &ldata, group_nr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* can delete caller's zero nlink inode if it's not cached in other mounts */
|
||||
ret = !test_bit_le(bit_nr, ldata->map.bits);
|
||||
out:
|
||||
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
|
||||
|
||||
if (ret > 0) {
|
||||
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
|
||||
if (err < 0)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
if (ret <= 0) {
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
lock = NULL;
|
||||
}
|
||||
|
||||
*lock_ret = lock;
|
||||
*orph_lock_ret = orph_lock;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
if (ldata) {
|
||||
WARN_ON_ONCE(ldata->req_in_flight);
|
||||
WARN_ON_ONCE(waitqueue_active(&ldata->waitq));
|
||||
kfree(ldata);
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_omap_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
#ifndef _SCOUTFS_OMAP_H_
|
||||
#define _SCOUTFS_OMAP_H_
|
||||
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino);
|
||||
bool scoutfs_omap_test(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
|
||||
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
|
||||
struct scoutfs_open_ino_map_args *args);
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr);
|
||||
|
||||
int scoutfs_omap_add_rid(struct super_block *sb, u64 rid);
|
||||
int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid);
|
||||
|
||||
@@ -26,30 +26,22 @@
|
||||
#include "msg.h"
|
||||
#include "options.h"
|
||||
#include "super.h"
|
||||
#include "inode.h"
|
||||
|
||||
enum {
|
||||
Opt_metadev_path,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
static const match_table_t tokens = {
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
|
||||
struct options_info {
|
||||
seqlock_t seqlock;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_sysfs_attrs sysfs_attrs;
|
||||
struct options_sb_info {
|
||||
struct dentry *debugfs_dir;
|
||||
};
|
||||
|
||||
#define DECLARE_OPTIONS_INFO(sb, name) \
|
||||
struct options_info *name = SCOUTFS_SB(sb)->options_info
|
||||
/*
 * Always triggers a one-time warning and returns 0; neither sb nor
 * token is consulted.
 * NOTE(review): looks like a stub kept so existing callers of
 * scoutfs_option_u32()/scoutfs_option_bool() still link -- confirm.
 */
u32 scoutfs_option_u32(struct super_block *sb, int token)
|
||||
{
|
||||
WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_bdev_path(struct super_block *sb, substring_t *substr,
|
||||
char **bdev_path_ret)
|
||||
@@ -97,29 +89,8 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Release the memory referenced by an options struct, which is the
 * metadev_path string.  The struct itself is not freed.
 */
static void free_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
kfree(opts->metadev_path);
|
||||
}
|
||||
|
||||
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
|
||||
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
/*
 * Reset an options struct to its defaults: everything zeroed, then the
 * quorum slot number set to -1 and the orphan scan delay set to
 * DEFAULT_ORPHAN_SCAN_DELAY_MS.
 */
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
memset(opts, 0, sizeof(*opts));
|
||||
opts->quorum_slot_nr = -1; /* presumably "no slot configured"; option output only prints values >= 0 */
|
||||
opts->orphan_scan_delay_ms = DEFAULT_ORPHAN_SCAN_DELAY_MS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse the option string into our options struct. This can allocate
|
||||
* memory in the struct. The caller is responsible for always calling
|
||||
* free_options() when the struct is destroyed, including when we return
|
||||
* an error.
|
||||
*/
|
||||
static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts)
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed)
|
||||
{
|
||||
substring_t args[MAX_OPT_ARGS];
|
||||
int nr;
|
||||
@@ -127,61 +98,49 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
char *p;
|
||||
int ret;
|
||||
|
||||
/* Set defaults */
|
||||
memset(parsed, 0, sizeof(*parsed));
|
||||
parsed->quorum_slot_nr = -1;
|
||||
|
||||
while ((p = strsep(&options, ",")) != NULL) {
|
||||
if (!*p)
|
||||
continue;
|
||||
|
||||
token = match_token(p, tokens, args);
|
||||
switch (token) {
|
||||
|
||||
case Opt_metadev_path:
|
||||
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case Opt_orphan_scan_delay_ms:
|
||||
if (opts->orphan_scan_delay_ms != -1) {
|
||||
scoutfs_err(sb, "multiple orphan_scan_delay_ms options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 ||
|
||||
nr < MIN_ORPHAN_SCAN_DELAY_MS || nr > MAX_ORPHAN_SCAN_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid orphan_scan_delay_ms option, must be between %lu and %lu",
|
||||
MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->orphan_scan_delay_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_quorum_slot_nr:
|
||||
if (opts->quorum_slot_nr != -1) {
|
||||
|
||||
if (parsed->quorum_slot_nr != -1) {
|
||||
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 || nr < 0 || nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
if (ret < 0 || nr < 0 ||
|
||||
nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u",
|
||||
SCOUTFS_QUORUM_MAX_SLOTS - 1);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->quorum_slot_nr = nr;
|
||||
parsed->quorum_slot_nr = nr;
|
||||
break;
|
||||
case Opt_metadev_path:
|
||||
|
||||
ret = parse_bdev_path(sb, &args[0],
|
||||
&parsed->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
default:
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
|
||||
return -EINVAL;
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
|
||||
p);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!opts->metadev_path) {
|
||||
if (!parsed->metadev_path) {
|
||||
scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -189,181 +148,40 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
return 0;
|
||||
}
|
||||
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
unsigned int seq;
|
||||
|
||||
if (WARN_ON_ONCE(optinf == NULL)) {
|
||||
/* trying to use options before early setup or after destroy */
|
||||
init_default_options(opts);
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&optinf->seqlock);
|
||||
memcpy(opts, &optinf->opts, sizeof(struct scoutfs_mount_options));
|
||||
} while (read_seqretry(&optinf->seqlock, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Early setup that parses and stores the options so that the rest of
|
||||
* setup can use them. Full options setup that relies on other
|
||||
* components will be done later.
|
||||
*/
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options)
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_mount_options opts;
|
||||
struct options_info *optinf;
|
||||
struct options_sb_info *osi;
|
||||
int ret;
|
||||
|
||||
init_default_options(&opts);
|
||||
osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL);
|
||||
if (!osi)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = parse_options(sb, options, &opts);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
sbi->options = osi;
|
||||
|
||||
optinf = kzalloc(sizeof(struct options_info), GFP_KERNEL);
|
||||
if (!optinf) {
|
||||
osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root);
|
||||
if (!osi->debugfs_dir) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
seqlock_init(&optinf->seqlock);
|
||||
scoutfs_sysfs_init_attrs(sb, &optinf->sysfs_attrs);
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts = opts;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
sbi->options_info = optinf;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
free_options(&opts);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
|
||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||
if (opts.quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts.metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t orphan_scan_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.orphan_scan_delay_ms);
|
||||
}
|
||||
/*
 * sysfs store: parse a new orphan_scan_delay_ms value, publish it in
 * the mount options under the options seqlock, and reschedule the
 * orphan scan work.
 *
 * Returns count on success.  Returns -EINVAL and logs an error when the
 * written text can't be parsed or the value lies outside
 * [MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS].
 */
static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	DECLARE_OPTIONS_INFO(sb, optinf);
	char nullterm[20]; /* more than enough for octal -U32_MAX */
	long val;
	int len;
	int ret;

	/* the sysfs buffer is copied and explicitly terminated before parsing */
	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	ret = kstrtol(nullterm, 0, &val);

	/*
	 * Compare as signed longs.  The minimum is an unsigned long
	 * constant, so comparing it directly with val would convert a
	 * negative val to a huge unsigned value that slips past the lower
	 * bound and is then stored in the unsigned option.
	 */
	if (ret < 0 ||
	    val < (long)MIN_ORPHAN_SCAN_DELAY_MS ||
	    val > (long)MAX_ORPHAN_SCAN_DELAY_MS) {
		scoutfs_err(sb, "invalid orphan_scan_delay_ms value written to options sysfs file, must be between %lu and %lu",
			    (unsigned long)MIN_ORPHAN_SCAN_DELAY_MS,
			    (unsigned long)MAX_ORPHAN_SCAN_DELAY_MS);
		return -EINVAL;
	}

	write_seqlock(&optinf->seqlock);
	optinf->opts.orphan_scan_delay_ms = val;
	write_sequnlock(&optinf->seqlock);

	/* let the orphan scanner pick up the new delay */
	scoutfs_inode_schedule_orphan_dwork(sb);

	return count;
}
|
||||
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
|
||||
|
||||
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts.quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &optinf->sysfs_attrs, options_attrs, "mount_options");
|
||||
if (ret < 0)
|
||||
if (ret)
|
||||
scoutfs_options_destroy(sb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We remove the sysfs files early in unmount so that they can't try to call other subsystems
|
||||
* as they're being destroyed.
|
||||
*/
|
||||
void scoutfs_options_stop(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
|
||||
if (optinf)
|
||||
scoutfs_sysfs_destroy_attrs(sb, &optinf->sysfs_attrs);
|
||||
}
|
||||
|
||||
void scoutfs_options_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
struct options_sb_info *osi = sbi->options;
|
||||
|
||||
scoutfs_options_stop(sb);
|
||||
|
||||
if (optinf) {
|
||||
free_options(&optinf->opts);
|
||||
kfree(optinf);
|
||||
sbi->options_info = NULL;
|
||||
if (osi) {
|
||||
if (osi->debugfs_dir)
|
||||
debugfs_remove_recursive(osi->debugfs_dir);
|
||||
kfree(osi);
|
||||
sbi->options = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,19 +5,23 @@
|
||||
#include <linux/in.h>
|
||||
#include "format.h"
|
||||
|
||||
struct scoutfs_mount_options {
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
enum scoutfs_mount_options {
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_metadev_path,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
||||
struct mount_options {
|
||||
int quorum_slot_nr;
|
||||
char *metadev_path;
|
||||
};
|
||||
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options);
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed);
|
||||
int scoutfs_options_setup(struct super_block *sb);
|
||||
void scoutfs_options_stop(struct super_block *sb);
|
||||
void scoutfs_options_destroy(struct super_block *sb);
|
||||
|
||||
u32 scoutfs_option_u32(struct super_block *sb, int token);
|
||||
#define scoutfs_option_bool scoutfs_option_u32
|
||||
|
||||
#endif /* _SCOUTFS_OPTIONS_H_ */
|
||||
|
||||
@@ -105,8 +105,6 @@ enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
|
||||
struct quorum_status {
|
||||
enum quorum_role role;
|
||||
u64 term;
|
||||
u64 server_start_term;
|
||||
int server_event;
|
||||
int vote_for;
|
||||
unsigned long vote_bits;
|
||||
ktime_t timeout;
|
||||
@@ -118,7 +116,7 @@ struct quorum_info {
|
||||
struct socket *sock;
|
||||
bool shutdown;
|
||||
|
||||
int our_quorum_slot_nr;
|
||||
unsigned long flags;
|
||||
int votes_needed;
|
||||
|
||||
spinlock_t show_lock;
|
||||
@@ -129,6 +127,8 @@ struct quorum_info {
|
||||
struct scoutfs_sysfs_attrs ssa;
|
||||
};
|
||||
|
||||
#define QINF_FLAG_SERVER 0
|
||||
|
||||
#define DECLARE_QUORUM_INFO(sb, name) \
|
||||
struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info
|
||||
#define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
|
||||
@@ -160,7 +160,9 @@ static ktime_t heartbeat_timeout(void)
|
||||
static int create_socket(struct super_block *sb)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct socket *sock = NULL;
|
||||
struct sockaddr_in sin;
|
||||
int addrlen;
|
||||
@@ -174,7 +176,7 @@ static int create_socket(struct super_block *sb)
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
|
||||
addrlen = sizeof(sin);
|
||||
ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
|
||||
@@ -205,6 +207,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
int only)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
ktime_t now;
|
||||
int i;
|
||||
@@ -213,7 +216,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
.fsid = super->hdr.fsid,
|
||||
.term = cpu_to_le64(term),
|
||||
.type = type,
|
||||
.from = qinf->our_quorum_slot_nr,
|
||||
.from = opts->quorum_slot_nr,
|
||||
};
|
||||
struct kvec kv = {
|
||||
.iov_base = &qmes,
|
||||
@@ -235,7 +238,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i) ||
|
||||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
|
||||
(only >= 0 && i != only) || i == opts->quorum_slot_nr)
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
@@ -473,8 +476,8 @@ static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_
|
||||
*/
|
||||
static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
|
||||
struct scoutfs_quorum_block blk;
|
||||
int ret;
|
||||
|
||||
@@ -493,6 +496,16 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has fenced previous leaders and reclaimed their
|
||||
* resources. We can now update our fence event with a greater term to
|
||||
* stop future leaders from doing the same.
|
||||
*/
|
||||
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
|
||||
{
|
||||
return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has been elected and has started running but can't
|
||||
* yet assume that it has exclusive access to the metadata device. We
|
||||
@@ -582,9 +595,15 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
|
||||
}
|
||||
|
||||
out:
|
||||
err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
if (fence_started) {
|
||||
err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
} else {
|
||||
err = scoutfs_quorum_fence_complete(sb, term);
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
if (ret < 0)
|
||||
scoutfs_inc_counter(sb, quorum_fence_error);
|
||||
@@ -592,34 +611,21 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The main quorum task maintains its private status. It seemed cleaner
|
||||
* to occasionally copy the status for showing in sysfs/debugfs files
|
||||
* than to have the two lock access to shared status. The show copy is
|
||||
* updated after being modified before the quorum task sleeps for a
|
||||
* significant amount of time, either waiting on timeouts or interacting
|
||||
* with the server.
|
||||
*/
|
||||
static void update_show_status(struct quorum_info *qinf, struct quorum_status *qst)
|
||||
{
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->show_status = *qst;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The quorum work always runs in the background of quorum member
|
||||
* mounts. It's responsible for starting and stopping the server if
|
||||
* it's elected leader. While it's leader it sends heartbeats to
|
||||
* suppress other quorum work from standing for election.
|
||||
* it's elected leader, and the server can call back into it to let it
|
||||
* know that it has shut itself down (perhaps due to error) so that the
|
||||
* work should stop sending heartbeats.
|
||||
*/
|
||||
static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
{
|
||||
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
|
||||
struct super_block *sb = qinf->sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
struct quorum_status qst = {0,};
|
||||
struct quorum_status qst;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
@@ -628,7 +634,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
/* start out as a follower */
|
||||
qst.role = FOLLOWER;
|
||||
qst.term = 0;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
|
||||
/* read our starting term from greatest in all events in all slots */
|
||||
read_greatest_term(sb, &qst.term);
|
||||
@@ -646,8 +654,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
ret = recv_msg(sb, &msg, qst.timeout);
|
||||
if (ret < 0) {
|
||||
if (ret != -ETIMEDOUT && ret != -EAGAIN) {
|
||||
@@ -664,6 +670,24 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
msg.term < qst.term)
|
||||
msg.type = SCOUTFS_QUORUM_MSG_INVALID;
|
||||
|
||||
/* if the server has shutdown we become follower */
|
||||
if (!test_bit(QINF_FLAG_SERVER, &qinf->flags) &&
|
||||
qst.role == LEADER) {
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_server_shutdown);
|
||||
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.term);
|
||||
scoutfs_inc_counter(sb, quorum_send_resignation);
|
||||
}
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->show_status = qst;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
|
||||
trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
|
||||
qst.vote_bits,
|
||||
ktime_to_timespec64(qst.timeout));
|
||||
@@ -674,6 +698,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
if (qst.role == LEADER) {
|
||||
scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
|
||||
msg.type, msg.from, msg.term, qst.term);
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
qst.role = FOLLOWER;
|
||||
qst.term = msg.term;
|
||||
@@ -695,18 +720,11 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
/* followers and candidates start new election on timeout */
|
||||
if (qst.role != LEADER &&
|
||||
ktime_after(ktime_get(), qst.timeout)) {
|
||||
/* .. but only if their server has stopped */
|
||||
if (!scoutfs_server_is_down(sb)) {
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
|
||||
continue;
|
||||
}
|
||||
|
||||
qst.role = CANDIDATE;
|
||||
qst.term++;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits);
|
||||
set_bit(opts->quorum_slot_nr, &qst.vote_bits);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE,
|
||||
qst.term);
|
||||
qst.timeout = election_timeout();
|
||||
@@ -743,69 +761,29 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
qst.term);
|
||||
qst.timeout = heartbeat_interval();
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
/* record that we've been elected before starting up server */
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
qst.server_start_term = qst.term;
|
||||
qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
|
||||
scoutfs_server_start(sb, qst.term);
|
||||
}
|
||||
|
||||
/*
|
||||
* This leader's server is up, having finished fencing
|
||||
* previous leaders. We update the fence event with the
|
||||
* current term to let future leaders know that previous
|
||||
* servers have been fenced.
|
||||
*/
|
||||
if (qst.role == LEADER && qst.server_event != SCOUTFS_QUORUM_EVENT_FENCE &&
|
||||
scoutfs_server_is_up(sb)) {
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, qst.term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
qst.server_event = SCOUTFS_QUORUM_EVENT_FENCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Stop a running server if we're no longer leader in
|
||||
* its term.
|
||||
*/
|
||||
if (!(qst.role == LEADER && qst.term == qst.server_start_term) &&
|
||||
scoutfs_server_is_running(sb)) {
|
||||
/* make very sure server is fully shut down */
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
/* set server bit before server shutdown could clear */
|
||||
set_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
|
||||
/*
|
||||
* A previously running server has stopped. The quorum
|
||||
* protocol might have shut it down by changing roles or
|
||||
* it might have stopped on its own, perhaps on errors.
|
||||
* If we're still a leader then we become a follower and
|
||||
* send resignations to encourage the next election.
|
||||
* Always update the _STOP event to stop connections and
|
||||
* fencing.
|
||||
*/
|
||||
if (qst.server_start_term > 0 && scoutfs_server_is_down(sb)) {
|
||||
if (qst.role == LEADER) {
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_server_shutdown);
|
||||
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.server_start_term);
|
||||
scoutfs_inc_counter(sb, quorum_send_resignation);
|
||||
ret = scoutfs_server_start(sb, qst.term);
|
||||
if (ret < 0) {
|
||||
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
/* store our increased term */
|
||||
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
|
||||
true);
|
||||
if (err < 0) {
|
||||
ret = err;
|
||||
goto out;
|
||||
}
|
||||
ret = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
|
||||
qst.server_start_term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
qst.server_start_term = 0;
|
||||
}
|
||||
|
||||
/* leaders regularly send heartbeats to delay elections */
|
||||
@@ -842,19 +820,12 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
/* always try to stop a running server as we stop */
|
||||
if (scoutfs_server_is_running(sb)) {
|
||||
scoutfs_server_stop_wait(sb);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term);
|
||||
|
||||
if (qst.server_start_term > 0) {
|
||||
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
|
||||
qst.server_start_term, true);
|
||||
if (err < 0 && ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
|
||||
scoutfs_server_stop(sb);
|
||||
scoutfs_fence_stop(sb);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.term);
|
||||
}
|
||||
|
||||
/* record that this slot no longer has an active quorum */
|
||||
@@ -866,6 +837,21 @@ out:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has shutdown and is no longer using shared
|
||||
* resources. Clear the bit so that we stop sending heartbeats and
|
||||
* allow the next server to be elected. Update the stop event so that
|
||||
* it won't be considered available by clients or fenced by the next
|
||||
* leader.
|
||||
*/
|
||||
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
|
||||
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clients read quorum blocks looking for the leader with a server whose
|
||||
* address it can try and connect to.
|
||||
@@ -968,6 +954,7 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts;
|
||||
struct quorum_status qst;
|
||||
struct last_msg last;
|
||||
struct timespec64 ts;
|
||||
@@ -984,11 +971,9 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
ret = 0;
|
||||
|
||||
snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n",
|
||||
qinf->our_quorum_slot_nr);
|
||||
opts->quorum_slot_nr);
|
||||
snprintf_ret(buf, size, &ret, "term %llu\n",
|
||||
qst.term);
|
||||
snprintf_ret(buf, size, &ret, "server_start_term %llu\n", qst.server_start_term);
|
||||
snprintf_ret(buf, size, &ret, "server_event %d\n", qst.server_event);
|
||||
snprintf_ret(buf, size, &ret, "role %d (%s)\n",
|
||||
qst.role, role_str(qst.role));
|
||||
snprintf_ret(buf, size, &ret, "vote_for %d\n",
|
||||
@@ -1063,6 +1048,7 @@ static inline bool valid_ipv4_port(__be16 port)
|
||||
static int verify_quorum_slots(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct sockaddr_in other;
|
||||
@@ -1113,7 +1099,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
|
||||
if (!quorum_slot_present(super, opts->quorum_slot_nr)) {
|
||||
char *str = slots;
|
||||
*str = '\0';
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
@@ -1128,7 +1114,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
}
|
||||
}
|
||||
scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s",
|
||||
qinf->our_quorum_slot_nr, slots);
|
||||
opts->quorum_slot_nr, slots);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -1151,12 +1137,11 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
int scoutfs_quorum_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_mount_options opts;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct quorum_info *qinf;
|
||||
int ret;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr < 0)
|
||||
if (opts->quorum_slot_nr < 0)
|
||||
return 0;
|
||||
|
||||
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
|
||||
@@ -1168,8 +1153,6 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
spin_lock_init(&qinf->show_lock);
|
||||
INIT_WORK(&qinf->work, scoutfs_quorum_worker);
|
||||
scoutfs_sysfs_init_attrs(sb, &qinf->ssa);
|
||||
/* static for the lifetime of the mount */
|
||||
qinf->our_quorum_slot_nr = opts.quorum_slot_nr;
|
||||
|
||||
sbi->quorum_info = qinf;
|
||||
qinf->sb = sb;
|
||||
|
||||
@@ -2,12 +2,14 @@
|
||||
#define _SCOUTFS_QUORUM_H_
|
||||
|
||||
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
|
||||
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);
|
||||
|
||||
u8 scoutfs_quorum_votes_needed(struct super_block *sb);
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
|
||||
struct sockaddr_in *sin);
|
||||
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
|
||||
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);
|
||||
|
||||
int scoutfs_quorum_setup(struct super_block *sb);
|
||||
void scoutfs_quorum_shutdown(struct super_block *sb);
|
||||
|
||||
@@ -262,7 +262,7 @@ void scoutfs_recov_shutdown(struct super_block *sb)
|
||||
recinf->timeout_fn = NULL;
|
||||
spin_unlock(&recinf->lock);
|
||||
|
||||
list_for_each_entry_safe(pend, tmp, &list, head) {
|
||||
list_for_each_entry_safe(pend, tmp, &recinf->pending, head) {
|
||||
list_del(&pend->head);
|
||||
kfree(pend);
|
||||
}
|
||||
|
||||
@@ -403,24 +403,24 @@ TRACE_EVENT(scoutfs_sync_fs,
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_write_func,
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages),
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_bytes),
|
||||
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_pages),
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_bytes),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, dirty_block_bytes)
|
||||
__field(__u64, dirty_item_pages)
|
||||
__field(__u64, dirty_item_bytes)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->dirty_block_bytes = dirty_block_bytes;
|
||||
__entry->dirty_item_pages = dirty_item_pages;
|
||||
__entry->dirty_item_bytes = dirty_item_bytes;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages)
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_bytes %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_bytes)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
|
||||
@@ -1843,53 +1843,6 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
|
||||
TP_ARGS(sb, rid, nr_clients)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, holding)
|
||||
__field(int, applying)
|
||||
__field(int, nr_holders)
|
||||
__field(__u32, avail_before)
|
||||
__field(__u32, freed_before)
|
||||
__field(int, exceeded)
|
||||
),
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->holding = !!holding;
|
||||
__entry->applying = !!applying;
|
||||
__entry->nr_holders = nr_holders;
|
||||
__entry->avail_before = avail_before;
|
||||
__entry->freed_before = freed_before;
|
||||
__entry->exceeded = !!exceeded;
|
||||
),
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u exceeded %u",
|
||||
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
|
||||
__entry->avail_before, __entry->freed_before, __entry->exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
|
||||
#define slt_symbolic(mode) \
|
||||
__print_symbolic(mode, \
|
||||
{ SLT_CLIENT, "client" }, \
|
||||
@@ -2667,9 +2620,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page,
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
int bit_nr, int bit_count),
|
||||
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
@@ -2677,6 +2630,7 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__field(__u64, group_nr)
|
||||
__field(unsigned int, group_total)
|
||||
__field(int, bit_nr)
|
||||
__field(int, bit_count)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -2685,42 +2639,43 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__entry->group_nr = group_nr;
|
||||
__entry->group_total = group_total;
|
||||
__entry->bit_nr = bit_nr;
|
||||
__entry->bit_count = bit_count;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d",
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d",
|
||||
SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total,
|
||||
__entry->bit_nr)
|
||||
__entry->bit_nr, __entry->bit_count)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_omap_should_delete,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -64,6 +64,8 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
|
||||
struct scoutfs_net_lock *nl);
|
||||
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
|
||||
struct scoutfs_key *key);
|
||||
void scoutfs_server_hold_commit(struct super_block *sb);
|
||||
int scoutfs_server_apply_commit(struct super_block *sb, int err);
|
||||
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);
|
||||
|
||||
int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
|
||||
@@ -75,12 +77,9 @@ u64 scoutfs_server_seq(struct super_block *sb);
|
||||
u64 scoutfs_server_next_seq(struct super_block *sb);
|
||||
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
|
||||
|
||||
void scoutfs_server_start(struct super_block *sb, u64 term);
|
||||
int scoutfs_server_start(struct super_block *sb, u64 term);
|
||||
void scoutfs_server_abort(struct super_block *sb);
|
||||
void scoutfs_server_stop(struct super_block *sb);
|
||||
void scoutfs_server_stop_wait(struct super_block *sb);
|
||||
bool scoutfs_server_is_running(struct super_block *sb);
|
||||
bool scoutfs_server_is_up(struct super_block *sb);
|
||||
bool scoutfs_server_is_down(struct super_block *sb);
|
||||
|
||||
int scoutfs_server_setup(struct super_block *sb);
|
||||
void scoutfs_server_destroy(struct super_block *sb);
|
||||
|
||||
@@ -132,6 +132,44 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
if (opts->quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr);
|
||||
seq_printf(seq, ",metadev_path=%s", opts->metadev_path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t quorum_server_nr_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_server_nr);
|
||||
|
||||
static struct attribute *mount_options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(quorum_server_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int scoutfs_sync_fs(struct super_block *sb, int wait)
|
||||
{
|
||||
trace_scoutfs_sync_fs(sb, wait);
|
||||
@@ -208,11 +246,13 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
scoutfs_destroy_triggers(sb);
|
||||
scoutfs_fence_destroy(sb);
|
||||
scoutfs_options_destroy(sb);
|
||||
scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
|
||||
debugfs_remove(sbi->debug_root);
|
||||
scoutfs_destroy_counters(sb);
|
||||
scoutfs_destroy_sysfs(sb);
|
||||
scoutfs_metadev_close(sb);
|
||||
|
||||
kfree(sbi->opts.metadev_path);
|
||||
kfree(sbi);
|
||||
|
||||
sb->s_fs_info = NULL;
|
||||
@@ -242,7 +282,7 @@ static const struct super_operations scoutfs_super_ops = {
|
||||
.destroy_inode = scoutfs_destroy_inode,
|
||||
.sync_fs = scoutfs_sync_fs,
|
||||
.statfs = scoutfs_statfs,
|
||||
.show_options = scoutfs_options_show,
|
||||
.show_options = scoutfs_show_options,
|
||||
.put_super = scoutfs_put_super,
|
||||
.umount_begin = scoutfs_umount_begin,
|
||||
};
|
||||
@@ -471,9 +511,9 @@ out:
|
||||
|
||||
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
{
|
||||
struct scoutfs_mount_options opts;
|
||||
struct block_device *meta_bdev;
|
||||
struct scoutfs_sb_info *sbi;
|
||||
struct mount_options opts;
|
||||
struct block_device *meta_bdev;
|
||||
struct inode *inode;
|
||||
int ret;
|
||||
|
||||
@@ -501,12 +541,13 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
spin_lock_init(&sbi->next_ino_lock);
|
||||
spin_lock_init(&sbi->data_wait_root.lock);
|
||||
sbi->data_wait_root.root = RB_ROOT;
|
||||
scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa);
|
||||
|
||||
/* parse options early for use during setup */
|
||||
ret = scoutfs_options_early_setup(sb, data);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
scoutfs_options_read(sb, &opts);
|
||||
ret = scoutfs_parse_options(sb, data, &opts);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sbi->opts = opts;
|
||||
|
||||
ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
|
||||
if (ret != SCOUTFS_BLOCK_SM_SIZE) {
|
||||
@@ -515,7 +556,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
}
|
||||
|
||||
meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb);
|
||||
meta_bdev =
|
||||
blkdev_get_by_path(sbi->opts.metadev_path,
|
||||
SCOUTFS_META_BDEV_MODE, sb);
|
||||
if (IS_ERR(meta_bdev)) {
|
||||
scoutfs_err(sb, "could not open metadev: error %ld",
|
||||
PTR_ERR(meta_bdev));
|
||||
@@ -535,6 +578,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
scoutfs_setup_sysfs(sb) ?:
|
||||
scoutfs_setup_counters(sb) ?:
|
||||
scoutfs_options_setup(sb) ?:
|
||||
scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
|
||||
mount_options_attrs, "mount_options") ?:
|
||||
scoutfs_setup_triggers(sb) ?:
|
||||
scoutfs_fence_setup(sb) ?:
|
||||
scoutfs_block_setup(sb) ?:
|
||||
@@ -556,7 +601,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
|
||||
/* this interruptible iget lets hung mount be aborted with ctl-c */
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE, 0);
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ERESTARTSYS)
|
||||
@@ -607,7 +652,6 @@ static void scoutfs_kill_sb(struct super_block *sb)
|
||||
}
|
||||
|
||||
if (SCOUTFS_HAS_SBI(sb)) {
|
||||
scoutfs_options_stop(sb);
|
||||
scoutfs_inode_orphan_stop(sb);
|
||||
scoutfs_lock_unmount_begin(sb);
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@ struct scoutfs_sb_info {
|
||||
|
||||
spinlock_t next_ino_lock;
|
||||
|
||||
struct options_info *options_info;
|
||||
struct data_info *data_info;
|
||||
struct inode_sb_info *inode_sb_info;
|
||||
struct btree_info *btree_info;
|
||||
@@ -75,6 +74,10 @@ struct scoutfs_sb_info {
|
||||
struct scoutfs_counters *counters;
|
||||
struct scoutfs_triggers *triggers;
|
||||
|
||||
struct mount_options opts;
|
||||
struct options_sb_info *options;
|
||||
struct scoutfs_sysfs_attrs mopts_ssa;
|
||||
|
||||
struct dentry *debug_root;
|
||||
|
||||
bool forced_unmount;
|
||||
|
||||
@@ -37,15 +37,6 @@ struct attr_funcs {
|
||||
#define ATTR_FUNCS_RO(_name) \
|
||||
static struct attr_funcs _name##_attr_funcs = __ATTR_RO(_name)
|
||||
|
||||
static ssize_t data_device_maj_min_show(struct kobject *kobj, struct attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = KOBJ_TO_SB(kobj, sb_id_kobj);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u:%u\n",
|
||||
MAJOR(sb->s_bdev->bd_dev), MINOR(sb->s_bdev->bd_dev));
|
||||
}
|
||||
ATTR_FUNCS_RO(data_device_maj_min);
|
||||
|
||||
static ssize_t format_version_show(struct kobject *kobj, struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
@@ -110,7 +101,6 @@ static ssize_t attr_funcs_show(struct kobject *kobj, struct attribute *attr,
|
||||
|
||||
|
||||
static struct attribute *sb_id_attrs[] = {
|
||||
&data_device_maj_min_attr_funcs.attr,
|
||||
&format_version_attr_funcs.attr,
|
||||
&fsid_attr_funcs.attr,
|
||||
&rid_attr_funcs.attr,
|
||||
|
||||
@@ -207,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
|
||||
}
|
||||
|
||||
trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri),
|
||||
scoutfs_item_dirty_pages(sb));
|
||||
scoutfs_item_dirty_bytes(sb));
|
||||
|
||||
if (tri->deadline_expired)
|
||||
scoutfs_inc_counter(sb, trans_commit_timer);
|
||||
@@ -422,16 +422,18 @@ static void release_holders(struct super_block *sb)
|
||||
*/
|
||||
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
|
||||
{
|
||||
u64 dirty_blocks = (scoutfs_item_dirty_bytes(sb) >> SCOUTFS_BLOCK_LG_SHIFT) + 1;
|
||||
|
||||
/*
|
||||
* In theory each dirty item page could be straddling two full
|
||||
* blocks, requiring 4 allocations for each item cache page.
|
||||
* That's much too conservative, typically many dirty item cache
|
||||
* pages that are near each other all land in one block. This
|
||||
* In theory each dirty item could be added to a full block that
|
||||
* has to split, requiring 2 meta block allocs for each dirty
|
||||
* item. That's much too conservative, typically many dirty
|
||||
* items that are near each other all land in one block. This
|
||||
* rough estimate is still so far beyond what typically happens
|
||||
* that it accounts for having to dirty parent blocks and
|
||||
* whatever dirtying is done during the transaction hold.
|
||||
*/
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, dirty_blocks * 4)) {
|
||||
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
|
||||
return true;
|
||||
}
|
||||
@@ -640,7 +642,6 @@ void scoutfs_shutdown_trans(struct super_block *sb)
|
||||
tri->write_workq = NULL;
|
||||
}
|
||||
|
||||
scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri);
|
||||
scoutfs_block_writer_forget_all(sb, &tri->wri);
|
||||
|
||||
kfree(tri);
|
||||
|
||||
289
kmod/src/xattr.c
289
kmod/src/xattr.c
@@ -57,6 +57,12 @@ static u32 xattr_names_equal(const char *a_name, unsigned int a_len,
|
||||
return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
|
||||
}
|
||||
|
||||
static unsigned int xattr_full_bytes(struct scoutfs_xattr *xat)
|
||||
{
|
||||
return offsetof(struct scoutfs_xattr,
|
||||
name[xat->name_len + le16_to_cpu(xat->val_len)]);
|
||||
}
|
||||
|
||||
static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
|
||||
{
|
||||
return SCOUTFS_XATTR_NR_PARTS(xat->name_len,
|
||||
@@ -131,29 +137,12 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
|
||||
}
|
||||
|
||||
/*
|
||||
* xattrs are stored in multiple items. The first item is a
|
||||
* concatenation of an initial header, the name, and then as much of the
|
||||
* value as fits in the remainder of the first item. This returns the
|
||||
* size of the first item that'd store an xattr with the given name
|
||||
* length and value payload size.
|
||||
*/
|
||||
static int first_item_bytes(int name_len, size_t size)
|
||||
{
|
||||
if (WARN_ON_ONCE(name_len <= 0) ||
|
||||
WARN_ON_ONCE(name_len > SCOUTFS_XATTR_MAX_NAME_LEN))
|
||||
return 0;
|
||||
|
||||
return min_t(int, sizeof(struct scoutfs_xattr) + name_len + size,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next xattr, set the caller's key, and copy as much of the
|
||||
* first item into the caller's buffer as we can. Returns the number of
|
||||
* bytes copied which can include the header, name, and start of the
|
||||
* value from the first item. The caller is responsible for comparing
|
||||
* their lengths, the header, and the returned length before safely
|
||||
* using the buffer.
|
||||
* Find the next xattr and copy the key, xattr header, and as much of
|
||||
* the name and value into the caller's buffer as we can. Returns the
|
||||
* number of bytes copied which include the header, name, and value and
|
||||
* can be limited by the xattr length or the caller's buffer. The caller
|
||||
* is responsible for comparing their lengths, the header, and the
|
||||
* returned length before safely using the xattr.
|
||||
*
|
||||
* If a name is provided then we'll iterate over items with a matching
|
||||
* name_hash until we find a matching name. If we don't find a matching
|
||||
@@ -165,17 +154,20 @@ static int first_item_bytes(int name_len, size_t size)
|
||||
* Returns -ENOENT if it didn't find a next item.
|
||||
*/
|
||||
static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
struct scoutfs_xattr *xat, unsigned int xat_bytes,
|
||||
struct scoutfs_xattr *xat, unsigned int bytes,
|
||||
const char *name, unsigned int name_len,
|
||||
u64 name_hash, u64 id, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key last;
|
||||
u8 last_part;
|
||||
int total;
|
||||
u8 part;
|
||||
int ret;
|
||||
|
||||
/* need to be able to see the name we're looking for */
|
||||
if (WARN_ON_ONCE(name_len > 0 &&
|
||||
xat_bytes < offsetof(struct scoutfs_xattr, name[name_len])))
|
||||
if (WARN_ON_ONCE(name_len > 0 && bytes < offsetof(struct scoutfs_xattr,
|
||||
name[name_len])))
|
||||
return -EINVAL;
|
||||
|
||||
if (name_len)
|
||||
@@ -184,15 +176,26 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
|
||||
init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
|
||||
|
||||
last_part = 0;
|
||||
part = 0;
|
||||
total = 0;
|
||||
|
||||
for (;;) {
|
||||
ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
|
||||
if (ret < 0)
|
||||
key->skx_part = part;
|
||||
ret = scoutfs_item_next(sb, key, &last,
|
||||
(void *)xat + total, bytes - total,
|
||||
lock);
|
||||
if (ret < 0) {
|
||||
/* XXX corruption, ran out of parts */
|
||||
if (ret == -ENOENT && part > 0)
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
trace_scoutfs_xattr_get_next_key(sb, key);
|
||||
|
||||
/* XXX corruption */
|
||||
if (key->skx_part != 0) {
|
||||
if (key->skx_part != part) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
@@ -202,7 +205,8 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
* the first part and if the next xattr name fits in our
|
||||
* buffer then the item must have included it.
|
||||
*/
|
||||
if ((ret < sizeof(struct scoutfs_xattr) ||
|
||||
if (part == 0 &&
|
||||
(ret < sizeof(struct scoutfs_xattr) ||
|
||||
(xat->name_len <= name_len &&
|
||||
ret < offsetof(struct scoutfs_xattr,
|
||||
name[xat->name_len])) ||
|
||||
@@ -212,7 +216,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
break;
|
||||
}
|
||||
|
||||
if (name_len > 0) {
|
||||
if (part == 0 && name_len) {
|
||||
/* ran out of names that could match */
|
||||
if (le64_to_cpu(key->skx_name_hash) != name_hash) {
|
||||
ret = -ENOENT;
|
||||
@@ -220,126 +224,64 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
}
|
||||
|
||||
/* keep looking for our name */
|
||||
if (!xattr_names_equal(name, name_len, xat->name, xat->name_len)) {
|
||||
if (!xattr_names_equal(name, name_len,
|
||||
xat->name, xat->name_len)) {
|
||||
part = 0;
|
||||
le64_add_cpu(&key->skx_id, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* use the matching name we found */
|
||||
last_part = xattr_nr_parts(xat) - 1;
|
||||
}
|
||||
|
||||
/* found next name */
|
||||
break;
|
||||
total += ret;
|
||||
if (total == bytes || part == last_part) {
|
||||
/* copied as much as we could */
|
||||
ret = total;
|
||||
break;
|
||||
}
|
||||
part++;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller has already read and verified the xattr's first item.
|
||||
* Copy the value from the tail of the first item and from any future
|
||||
* items into the destination buffer.
|
||||
*/
|
||||
static int copy_xattr_value(struct super_block *sb, struct scoutfs_key *xat_key,
|
||||
struct scoutfs_xattr *xat, int xat_bytes,
|
||||
char *buffer, size_t size,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_key key;
|
||||
size_t copied = 0;
|
||||
int val_tail;
|
||||
int bytes;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
/* must have first item up to value */
|
||||
if (WARN_ON_ONCE(xat_bytes < sizeof(struct scoutfs_xattr)) ||
|
||||
WARN_ON_ONCE(xat_bytes < offsetof(struct scoutfs_xattr, name[xat->name_len])))
|
||||
return -EINVAL;
|
||||
|
||||
/* only ever copy up to the full value */
|
||||
size = min_t(size_t, size, le16_to_cpu(xat->val_len));
|
||||
|
||||
/* must have full first item if caller needs value from second item */
|
||||
val_tail = SCOUTFS_XATTR_MAX_PART_SIZE -
|
||||
offsetof(struct scoutfs_xattr, name[xat->name_len]);
|
||||
if (WARN_ON_ONCE(size > val_tail && xat_bytes != SCOUTFS_XATTR_MAX_PART_SIZE))
|
||||
return -EINVAL;
|
||||
|
||||
/* copy from tail of first item */
|
||||
bytes = min_t(unsigned int, size, val_tail);
|
||||
if (bytes > 0) {
|
||||
memcpy(buffer, &xat->name[xat->name_len], bytes);
|
||||
copied += bytes;
|
||||
}
|
||||
|
||||
key = *xat_key;
|
||||
for (i = 1; copied < size; i++) {
|
||||
key.skx_part = i;
|
||||
bytes = min_t(unsigned int, size - copied, SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
ret = scoutfs_item_lookup(sb, &key, buffer + copied, bytes, lock);
|
||||
if (ret >= 0 && ret != bytes)
|
||||
ret = -EIO;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
copied += ret;
|
||||
}
|
||||
|
||||
return copied;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller is working with items that are either in the allocated
|
||||
* first compound item or further items that are offsets into a value
|
||||
* buffer. Give them a pointer and length of the start of the item.
|
||||
*/
|
||||
static void xattr_item_part_buffer(void **buf, int *len, int part,
|
||||
struct scoutfs_xattr *xat, unsigned int xat_bytes,
|
||||
const char *value, size_t size)
|
||||
{
|
||||
int off;
|
||||
|
||||
if (part == 0) {
|
||||
*buf = xat;
|
||||
*len = xat_bytes;
|
||||
} else {
|
||||
off = (part * SCOUTFS_XATTR_MAX_PART_SIZE) -
|
||||
offsetof(struct scoutfs_xattr, name[xat->name_len]);
|
||||
BUG_ON(off >= size); /* calls limited by number of parts */
|
||||
*buf = (void *)value + off;
|
||||
*len = min_t(size_t, size - off, SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Create all the items associated with the given xattr. If this
|
||||
* returns an error it will have already cleaned up any items it created
|
||||
* before seeing the error.
|
||||
*/
|
||||
static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr *xat,
|
||||
int xat_bytes, const char *value, size_t size, u8 new_parts,
|
||||
static int create_xattr_items(struct inode *inode, u64 id,
|
||||
struct scoutfs_xattr *xat, unsigned int bytes,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
void *buf;
|
||||
int len;
|
||||
int i;
|
||||
unsigned int part_bytes;
|
||||
unsigned int total;
|
||||
int ret;
|
||||
|
||||
init_xattr_key(&key, scoutfs_ino(inode),
|
||||
xattr_name_hash(xat->name, xat->name_len), id);
|
||||
|
||||
for (i = 0; i < new_parts; i++) {
|
||||
key.skx_part = i;
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
total = 0;
|
||||
ret = 0;
|
||||
while (total < bytes) {
|
||||
part_bytes = min_t(unsigned int, bytes - total,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
ret = scoutfs_item_create(sb, &key, buf, len, lock);
|
||||
if (ret < 0) {
|
||||
ret = scoutfs_item_create(sb, &key,
|
||||
(void *)xat + total, part_bytes,
|
||||
lock);
|
||||
if (ret) {
|
||||
while (key.skx_part-- > 0)
|
||||
scoutfs_item_delete(sb, &key, lock);
|
||||
break;
|
||||
}
|
||||
|
||||
total += part_bytes;
|
||||
key.skx_part++;
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -387,20 +329,20 @@ out:
|
||||
* deleted items.
|
||||
*/
|
||||
static int change_xattr_items(struct inode *inode, u64 id,
|
||||
struct scoutfs_xattr *xat, int xat_bytes,
|
||||
const char *value, size_t size,
|
||||
u8 new_parts, u8 old_parts, struct scoutfs_lock *lock)
|
||||
struct scoutfs_xattr *new_xat,
|
||||
unsigned int new_bytes, u8 new_parts,
|
||||
u8 old_parts, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key key;
|
||||
int last_created = -1;
|
||||
void *buf;
|
||||
int len;
|
||||
int bytes;
|
||||
int off;
|
||||
int i;
|
||||
int ret;
|
||||
|
||||
init_xattr_key(&key, scoutfs_ino(inode),
|
||||
xattr_name_hash(xat->name, xat->name_len), id);
|
||||
xattr_name_hash(new_xat->name, new_xat->name_len), id);
|
||||
|
||||
/* dirty existing old items */
|
||||
for (i = 0; i < old_parts; i++) {
|
||||
@@ -412,10 +354,13 @@ static int change_xattr_items(struct inode *inode, u64 id,
|
||||
|
||||
/* create any new items past the old */
|
||||
for (i = old_parts; i < new_parts; i++) {
|
||||
key.skx_part = i;
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
|
||||
bytes = min_t(unsigned int, new_bytes - off,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
ret = scoutfs_item_create(sb, &key, buf, len, lock);
|
||||
key.skx_part = i;
|
||||
ret = scoutfs_item_create(sb, &key, (void *)new_xat + off,
|
||||
bytes, lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -424,10 +369,13 @@ static int change_xattr_items(struct inode *inode, u64 id,
|
||||
|
||||
/* update dirtied overlapping existing items, last partial first */
|
||||
for (i = min(old_parts, new_parts) - 1; i >= 0; i--) {
|
||||
key.skx_part = i;
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
|
||||
bytes = min_t(unsigned int, new_bytes - off,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
ret = scoutfs_item_update(sb, &key, buf, len, lock);
|
||||
key.skx_part = i;
|
||||
ret = scoutfs_item_update(sb, &key, (void *)new_xat + off,
|
||||
bytes, lock);
|
||||
/* only last partial can fail, then we unwind created */
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
@@ -464,7 +412,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
|
||||
struct scoutfs_xattr *xat = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
struct scoutfs_key key;
|
||||
unsigned int xat_bytes;
|
||||
unsigned int bytes;
|
||||
size_t name_len;
|
||||
int ret;
|
||||
|
||||
@@ -475,8 +423,9 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
|
||||
if (name_len > SCOUTFS_XATTR_MAX_NAME_LEN)
|
||||
return -ENODATA;
|
||||
|
||||
xat_bytes = first_item_bytes(name_len, size);
|
||||
xat = kmalloc(xat_bytes, GFP_NOFS);
|
||||
/* only need enough for caller's name and value sizes */
|
||||
bytes = sizeof(struct scoutfs_xattr) + name_len + size;
|
||||
xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
|
||||
if (!xat)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -486,32 +435,40 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
|
||||
|
||||
down_read(&si->xattr_rwsem);
|
||||
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes, name, name_len, 0, 0, lck);
|
||||
ret = get_next_xattr(inode, &key, xat, bytes,
|
||||
name, name_len, 0, 0, lck);
|
||||
|
||||
up_read(&si->xattr_rwsem);
|
||||
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = -ENODATA;
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* the caller just wants to know the size */
|
||||
if (size == 0) {
|
||||
ret = le16_to_cpu(xat->val_len);
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* the caller's buffer wasn't big enough */
|
||||
if (size < le16_to_cpu(xat->val_len)) {
|
||||
ret = -ERANGE;
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = copy_xattr_value(sb, &key, xat, xat_bytes, buffer, size, lck);
|
||||
unlock:
|
||||
up_read(&si->xattr_rwsem);
|
||||
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_READ);
|
||||
/* XXX corruption, the items didn't match the header */
|
||||
if (ret < xattr_full_bytes(xat)) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = le16_to_cpu(xat->val_len);
|
||||
memcpy(buffer, &xat->name[xat->name_len], ret);
|
||||
out:
|
||||
kfree(xat);
|
||||
vfree(xat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -639,8 +596,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
bool undo_totl = false;
|
||||
LIST_HEAD(ind_locks);
|
||||
u8 found_parts;
|
||||
unsigned int xat_bytes_totl;
|
||||
unsigned int xat_bytes;
|
||||
unsigned int bytes;
|
||||
unsigned int val_len;
|
||||
u64 ind_seq;
|
||||
u64 total;
|
||||
@@ -673,12 +629,9 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
if (tgs.totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0))
|
||||
return ret;
|
||||
|
||||
/* allocate enough to always read an existing xattr's totl */
|
||||
xat_bytes_totl = first_item_bytes(name_len,
|
||||
max_t(size_t, size, SCOUTFS_XATTR_MAX_TOTL_U64));
|
||||
/* but store partial first item that only includes the new xattr's value */
|
||||
xat_bytes = first_item_bytes(name_len, size);
|
||||
xat = kmalloc(xat_bytes_totl, GFP_NOFS);
|
||||
bytes = sizeof(struct scoutfs_xattr) + name_len + size;
|
||||
/* alloc enough to read old totl value */
|
||||
xat = __vmalloc(bytes + SCOUTFS_XATTR_MAX_TOTL_U64, GFP_NOFS, PAGE_KERNEL);
|
||||
if (!xat) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
@@ -692,7 +645,9 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
down_write(&si->xattr_rwsem);
|
||||
|
||||
/* find an existing xattr to delete, including possible totl value */
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes_totl, name, name_len, 0, 0, lck);
|
||||
ret = get_next_xattr(inode, &key, xat,
|
||||
sizeof(struct scoutfs_xattr) + name_len + SCOUTFS_XATTR_MAX_TOTL_U64,
|
||||
name, name_len, 0, 0, lck);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto unlock;
|
||||
|
||||
@@ -728,7 +683,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
le64_add_cpu(&tval.total, -total);
|
||||
}
|
||||
|
||||
/* prepare the xattr header, name, and start of value in first item */
|
||||
/* prepare our xattr */
|
||||
if (value) {
|
||||
if (found_parts)
|
||||
id = le64_to_cpu(key.skx_id);
|
||||
@@ -738,9 +693,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
xat->val_len = cpu_to_le16(size);
|
||||
memset(xat->__pad, 0, sizeof(xat->__pad));
|
||||
memcpy(xat->name, name, name_len);
|
||||
memcpy(&xat->name[name_len], value,
|
||||
min(size, SCOUTFS_XATTR_MAX_PART_SIZE -
|
||||
offsetof(struct scoutfs_xattr, name[name_len])));
|
||||
memcpy(&xat->name[xat->name_len], value, size);
|
||||
|
||||
if (tgs.totl) {
|
||||
ret = parse_totl_u64(value, size, &total);
|
||||
@@ -788,15 +741,14 @@ retry:
|
||||
}
|
||||
|
||||
if (found_parts && value)
|
||||
ret = change_xattr_items(inode, id, xat, xat_bytes, value, size,
|
||||
ret = change_xattr_items(inode, id, xat, bytes,
|
||||
xattr_nr_parts(xat), found_parts, lck);
|
||||
else if (found_parts)
|
||||
ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
|
||||
le64_to_cpu(key.skx_id), found_parts,
|
||||
lck);
|
||||
else
|
||||
ret = create_xattr_items(inode, id, xat, xat_bytes, value, size,
|
||||
xattr_nr_parts(xat), lck);
|
||||
ret = create_xattr_items(inode, id, xat, bytes, lck);
|
||||
if (ret < 0)
|
||||
goto release;
|
||||
|
||||
@@ -826,7 +778,7 @@ unlock:
|
||||
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
out:
|
||||
kfree(xat);
|
||||
vfree(xat);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -855,7 +807,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
struct scoutfs_xattr *xat = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
struct scoutfs_key key;
|
||||
unsigned int xat_bytes;
|
||||
unsigned int bytes;
|
||||
ssize_t total = 0;
|
||||
u32 name_hash = 0;
|
||||
bool is_hidden;
|
||||
@@ -868,8 +820,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
id = *id_pos;
|
||||
|
||||
/* need a buffer large enough for all possible names */
|
||||
xat_bytes = first_item_bytes(SCOUTFS_XATTR_MAX_NAME_LEN, 0);
|
||||
xat = kmalloc(xat_bytes, GFP_NOFS);
|
||||
bytes = sizeof(struct scoutfs_xattr) + SCOUTFS_XATTR_MAX_NAME_LEN;
|
||||
xat = kmalloc(bytes, GFP_NOFS);
|
||||
if (!xat) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
@@ -882,7 +834,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
down_read(&si->xattr_rwsem);
|
||||
|
||||
for (;;) {
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes, NULL, 0, name_hash, id, lck);
|
||||
ret = get_next_xattr(inode, &key, xat, bytes,
|
||||
NULL, 0, name_hash, id, lck);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = total;
|
||||
|
||||
1
tests/.gitignore
vendored
1
tests/.gitignore
vendored
@@ -3,7 +3,6 @@ src/createmany
|
||||
src/dumb_renameat2
|
||||
src/dumb_setxattr
|
||||
src/handle_cat
|
||||
src/handle_fsetxattr
|
||||
src/bulk_create_paths
|
||||
src/find_xattrs
|
||||
src/stage_tmpfile
|
||||
|
||||
@@ -6,7 +6,6 @@ BIN := src/createmany \
|
||||
src/dumb_renameat2 \
|
||||
src/dumb_setxattr \
|
||||
src/handle_cat \
|
||||
src/handle_fsetxattr \
|
||||
src/bulk_create_paths \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
#!/usr/bin/bash
|
||||
|
||||
#
|
||||
# This fencing script is used for testing clusters of multiple mounts on
|
||||
# a single host. It finds mounts to fence by looking for their rids and
|
||||
# only knows how to "fence" by using forced unmount.
|
||||
#
|
||||
|
||||
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
|
||||
|
||||
# Write a message to stderr.  Unlike echo_fail this does not exit, so it
# is safe to use for informational messages.
log() {
	echo "$@" > /dev/stderr
}

# Write a message to stderr and exit the script with a failure status.
echo_fail() {
	log "$@"
	exit 1
}
|
||||
|
||||
# rid of the mount we were asked to fence
rid="$SCOUTFS_FENCED_REQ_RID"

# Walk every scoutfs sysfs directory looking for the mount with the
# requested rid, then force unmount every mount of its data device.
for fs in /sys/fs/scoutfs/*; do
	# the glob is returned unexpanded when there are no mounts
	[ ! -d "$fs" ] && continue

	fs_rid="$(cat "$fs/rid")" || \
		echo_fail "failed to get rid in $fs"
	if [ "$fs_rid" != "$rid" ]; then
		continue
	fi

	nr="$(cat "$fs/data_device_maj_min")" || \
		echo_fail "failed to get data device major:minor in $fs"

	mnts=$(findmnt -l -n -t scoutfs -o TARGET -S "$nr") || \
		echo_fail "findmnt -t scoutfs -S $nr failed"
	for mnt in $mnts; do
		umount -f "$mnt" || \
			echo_fail "umount -f $mnt failed"
	done
done

exit 0
|
||||
@@ -56,11 +56,8 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .*: all clients recovered"
|
||||
re="$re|scoutfs .* error: client rid.*lock recovery timed out"
|
||||
|
||||
# we test bad devices and options
|
||||
# some tests mount w/o options
|
||||
re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
|
||||
re="$re|scoutfs .* error: meta_super META flag not set"
|
||||
re="$re|scoutfs .* error: could not open metadev:.*"
|
||||
re="$re|scoutfs .* error: Unknown or malformed option,.*"
|
||||
|
||||
# in debugging kernels we can slow things down a bit
|
||||
re="$re|hrtimer: interrupt took .*"
|
||||
|
||||
@@ -75,20 +75,6 @@ t_fs_nrs()
|
||||
seq 0 $((T_NR_MOUNTS - 1))
|
||||
}
|
||||
|
||||
#
|
||||
# outputs "1" if the fs number has "1" in its quorum/is_leader file.
|
||||
# All other cases output 0, including the fs nr being a client which
|
||||
# won't have a quorum/ dir.
|
||||
#
|
||||
t_fs_is_leader()
{
	# use our own argument rather than relying on the caller's loop variable
	local nr="$1"

	# a missing quorum/is_leader file reads as empty and so outputs "0"
	if [ "$(cat "$(t_sysfs_path "$nr")/quorum/is_leader" 2>/dev/null)" == "1" ]; then
		echo "1"
	else
		echo "0"
	fi
}
|
||||
|
||||
#
|
||||
# Output the mount nr of the current server. This takes no steps to
|
||||
# ensure that the server doesn't shut down and have some other mount
|
||||
@@ -97,7 +83,7 @@ t_fs_is_leader()
|
||||
t_server_nr()
|
||||
{
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "$(t_fs_is_leader $i)" == "1" ]; then
|
||||
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
|
||||
echo $i
|
||||
return
|
||||
fi
|
||||
@@ -115,7 +101,7 @@ t_server_nr()
|
||||
t_first_client_nr()
|
||||
{
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "$(t_fs_is_leader $i)" == "0" ]; then
|
||||
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
|
||||
echo $i
|
||||
return
|
||||
fi
|
||||
@@ -376,49 +362,3 @@ t_wait_for_leader() {
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
# Write a value to the named file under mount_options/ in the sysfs
# directory of the given mount number.
#  $1: mount number, $2: option name, $3: value to write
t_set_sysfs_mount_option() {
	local nr="$1" name="$2" val="$3"

	echo "$val" > "$(t_sysfs_path $nr)/mount_options/$name"
}
|
||||
|
||||
# Set the named sysfs mount option to the same value in every mount.
#  $1: option name, $2: value to write
t_set_all_sysfs_mount_options() {
	local name="$1"
	local val="$2"
	local i

	for i in $(t_fs_nrs); do
		# quoted so that empty or multi-word values arrive as one argument
		t_set_sysfs_mount_option "$i" "$name" "$val"
	done
}
|
||||
|
||||
# Saved sysfs mount option values, keyed by "<option name>_<mount nr>".
declare -A _saved_opts

# Record the current value of the named sysfs mount option of every
# mount so that it can be restored later.
#  $1: option name
t_save_all_sysfs_mount_options() {
	local name="$1"
	local ind
	local opt
	local i

	for i in $(t_fs_nrs); do
		opt="$(t_sysfs_path $i)/mount_options/$name"
		# braces are required: "$name_" would expand a variable named name_
		ind="${name}_$i"

		_saved_opts[$ind]="$(cat "$opt")"
	done
}

# Write the values recorded by t_save_all_sysfs_mount_options back to
# the named sysfs mount option of every mount.
#  $1: option name
t_restore_all_sysfs_mount_options() {
	local name="$1"
	local ind
	local i

	for i in $(t_fs_nrs); do
		# must build the same key that the save function used
		ind="${name}_$i"

		t_set_sysfs_mount_option "$i" "$name" "${_saved_opts[$ind]}"
	done
}
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
== prepare devices, mount point, and logs
|
||||
== bad devices, bad options
|
||||
== swapped devices
|
||||
== both meta devices
|
||||
== both data devices
|
||||
== good volume, bad option and good options
|
||||
@@ -1 +0,0 @@
|
||||
== 60s of unmounting non-quorum clients during recovery
|
||||
@@ -1,3 +0,0 @@
|
||||
== creating reasonably large per-mount files
|
||||
== 10s of racing cold reads and fallocate nop
|
||||
== cleaning up files
|
||||
@@ -2,4 +2,3 @@
|
||||
== unlinked and opened inodes still exist
|
||||
== orphan from failed evict deletion is picked up
|
||||
== orphaned inos in all mounts all deleted
|
||||
== 30s of racing evict deletion, orphan scanning, and open by handle
|
||||
|
||||
@@ -227,9 +227,8 @@ test "$T_QUORUM" -le "$T_NR_MOUNTS" || \
|
||||
die "-q quorum mmembers must not be greater than -n mounts"
|
||||
|
||||
# top level paths
|
||||
T_TESTS=$(realpath "$(dirname $0)")
|
||||
T_KMOD=$(realpath "$T_TESTS/../kmod")
|
||||
T_UTILS=$(realpath "$T_TESTS/../utils")
|
||||
T_KMOD=$(realpath "$(dirname $0)/../kmod")
|
||||
T_UTILS=$(realpath "$T_KMOD/../utils")
|
||||
|
||||
test -d "$T_KMOD" || die "kmod/ repo dir $T_KMOD not directory"
|
||||
test -d "$T_UTILS" || die "utils/ repo dir $T_UTILS not directory"
|
||||
@@ -380,14 +379,13 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
|
||||
# Build a fenced config that runs scripts out of the repository rather
|
||||
# than the default system directory
|
||||
#
|
||||
conf="$T_RESULTS/scoutfs-fenced.conf"
|
||||
conf="$T_RESULTS/scoutfs-fencd.conf"
|
||||
cat > $conf << EOF
|
||||
SCOUTFS_FENCED_DELAY=1
|
||||
SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh
|
||||
SCOUTFS_FENCED_RUN_ARGS="ignored run args"
|
||||
SCOUTFS_FENCED_RUN=$T_UTILS/fenced/local-force-unmount
|
||||
SCOUTFS_FENCED_RUN_ARGS=""
|
||||
EOF
|
||||
export SCOUTFS_FENCED_CONFIG_FILE="$conf"
|
||||
T_FENCED_LOG="$T_RESULTS/fenced.log"
|
||||
|
||||
#
|
||||
# Run the agent in the background, log its output, and kill it if we
|
||||
@@ -395,7 +393,7 @@ T_FENCED_LOG="$T_RESULTS/fenced.log"
|
||||
#
|
||||
fenced_log()
|
||||
{
|
||||
echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"
|
||||
echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
|
||||
}
|
||||
fenced_pid=""
|
||||
kill_fenced()
|
||||
@@ -406,7 +404,7 @@ kill_fenced()
|
||||
fi
|
||||
}
|
||||
trap kill_fenced EXIT
|
||||
$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
|
||||
$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &
|
||||
fenced_pid=$!
|
||||
fenced_log "started fenced pid $fenced_pid in the background"
|
||||
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
export-get-name-parent.sh
|
||||
basic-block-counts.sh
|
||||
basic-bad-mounts.sh
|
||||
inode-items-updated.sh
|
||||
simple-inode-index.sh
|
||||
simple-staging.sh
|
||||
simple-release-extents.sh
|
||||
fallocate.sh
|
||||
setattr_more.sh
|
||||
offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
@@ -35,7 +33,6 @@ resize-devices.sh
|
||||
fence-and-reclaim.sh
|
||||
orphan-inodes.sh
|
||||
mount-unmount-race.sh
|
||||
client-unmount-recovery.sh
|
||||
createmany-parallel-mounts.sh
|
||||
archive-light-cycle.sh
|
||||
block-stale-reads.sh
|
||||
|
||||
@@ -1,189 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <inttypes.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <endian.h>
|
||||
#include <time.h>
|
||||
#include <linux/types.h>
|
||||
#include <sys/xattr.h>
|
||||
|
||||
#define FILEID_SCOUTFS 0x81
|
||||
#define FILEID_SCOUTFS_WITH_PARENT 0x82
|
||||
|
||||
struct our_handle {
|
||||
struct file_handle handle;
|
||||
/*
|
||||
* scoutfs file handle can be ino or ino/parent. The
|
||||
* handle_type field of struct file_handle denotes which
|
||||
* version is in use. We only use the ino variant here.
|
||||
*/
|
||||
__le64 scoutfs_ino;
|
||||
};
|
||||
|
||||
#define DEFAULT_NAME "user.handle_fsetxattr"
#define DEFAULT_VALUE "value"

/*
 * Print the option summary to stdout and exit with status 1.  Every
 * option is described on its own newline-terminated line.
 */
static void exit_usage(void)
{
	printf(" -h/-? output this usage message and exit\n"
	       " -e keep trying on enoent, consider success an error\n"
	       " -i <num> 64bit inode number for handle open, can be multiple\n"
	       " -m <string> scoutfs mount path string for ioctl fd\n"
	       " -n <string> optional xattr name string, defaults to \"" DEFAULT_NAME "\"\n"
	       " -s <num> loop for num seconds, defaults to 0 for one iteration\n"
	       " -v <string> optional xattr value string, defaults to \"" DEFAULT_VALUE "\"\n");
	exit(1);
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct our_handle handle;
|
||||
struct timespec ts;
|
||||
bool enoent_success_err = false;
|
||||
uint64_t seconds = 0;
|
||||
char *value = NULL;
|
||||
char *name = NULL;
|
||||
char *mnt = NULL;
|
||||
int nr_inos = 0;
|
||||
uint64_t *inos;
|
||||
uint64_t i;
|
||||
int *fds;
|
||||
int mntfd;
|
||||
int fd;
|
||||
int ret;
|
||||
char c;
|
||||
int j;
|
||||
|
||||
/* can't have more inos than args */
|
||||
inos = calloc(argc, sizeof(inos[0]));
|
||||
fds = calloc(argc, sizeof(fds[0]));
|
||||
if (!inos || !fds) {
|
||||
perror("calloc");
|
||||
exit(1);
|
||||
}
|
||||
for (i = 0; i < argc; i++)
|
||||
fds[i] = -1;
|
||||
|
||||
while ((c = getopt(argc, argv, "+ei:m:n:s:v:")) != -1) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
enoent_success_err = true;
|
||||
break;
|
||||
case 'i':
|
||||
inos[nr_inos] = strtoll(optarg, NULL, 0);
|
||||
nr_inos++;
|
||||
break;
|
||||
case 'm':
|
||||
mnt = strdup(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
name = strdup(optarg);
|
||||
break;
|
||||
case 's':
|
||||
seconds = strtoll(optarg, NULL, 0);
|
||||
break;
|
||||
case 'v':
|
||||
value = strdup(optarg);
|
||||
break;
|
||||
case '?':
|
||||
printf("unknown argument: %c\n", optind);
|
||||
case 'h':
|
||||
exit_usage();
|
||||
}
|
||||
}
|
||||
|
||||
if (nr_inos == 0) {
|
||||
printf("specify non-zero inode number with -i\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!mnt) {
|
||||
printf("specify scoutfs mount path for ioctl with -p\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (name == NULL)
|
||||
name = DEFAULT_NAME;
|
||||
if (value == NULL)
|
||||
value = DEFAULT_VALUE;
|
||||
|
||||
mntfd = open(mnt, O_RDONLY);
|
||||
if (mntfd == -1) {
|
||||
perror("opening mountpoint");
|
||||
return 1;
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
seconds += ts.tv_sec;
|
||||
|
||||
for (i = 0; ; i++) {
|
||||
for (j = 0; j < nr_inos; j++) {
|
||||
fd = fds[j];
|
||||
|
||||
if (fd < 0) {
|
||||
handle.handle.handle_bytes = sizeof(struct our_handle);
|
||||
handle.handle.handle_type = FILEID_SCOUTFS;
|
||||
handle.scoutfs_ino = htole64(inos[j]);
|
||||
|
||||
fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR);
|
||||
if (fd == -1) {
|
||||
if (!enoent_success_err || errno != ENOENT) {
|
||||
perror("open_by_handle_at");
|
||||
return 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fds[j] = fd;
|
||||
}
|
||||
|
||||
ret = fsetxattr(fd, name, value, strlen(value), 0);
|
||||
if (ret < 0) {
|
||||
perror("fsetxattr");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ((i % 10) == 0) {
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
if (ts.tv_sec >= seconds)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (enoent_success_err) {
|
||||
bool able = false;
|
||||
for (i = 0; i < nr_inos; i++) {
|
||||
if (fds[i] >= 0) {
|
||||
printf("was able to open ino %"PRIu64"\n", inos[i]);
|
||||
able = true;
|
||||
}
|
||||
}
|
||||
if (able)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* not bothering to close or free */
|
||||
return 0;
|
||||
}
|
||||
@@ -1,36 +0,0 @@
|
||||
|
||||
mount_fail()
|
||||
{
|
||||
local mnt=${!#}
|
||||
|
||||
echo "mounting $@" >> $T_TMP.mount.out
|
||||
mount -t scoutfs "$@" >> $T_TMP.mount.out 2>&1
|
||||
if [ $? == 0 ]; then
|
||||
umount "$mnt" || t_fail "couldn't unmount"
|
||||
t_fail "bad mount succeeded"
|
||||
fi
|
||||
}
|
||||
|
||||
echo "== prepare devices, mount point, and logs"
|
||||
SCR="/mnt/scoutfs.extra"
|
||||
mkdir -p "$SCR"
|
||||
> $T_TMP.mount.out
|
||||
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|
||||
|| t_fail "mkfs failed"
|
||||
|
||||
echo "== bad devices, bad options"
|
||||
mount_fail -o _bad /dev/null /dev/null "$SCR"
|
||||
|
||||
echo "== swapped devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both meta devices"
|
||||
mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both data devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
echo "== good volume, bad option and good options"
|
||||
mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
t_pass
|
||||
@@ -1,61 +0,0 @@
|
||||
#
|
||||
# Unmount Server and unmount a client as it's replaying to a remaining server
|
||||
#
|
||||
|
||||
majority_nr=$(t_majority_count)
|
||||
quorum_nr=$T_QUORUM
|
||||
|
||||
test "$quorum_nr" == "$majority_nr" && \
|
||||
t_skip "all quorum members make up majority, need more mounts to unmount"
|
||||
|
||||
test "$T_NR_MOUNTS" -lt "$T_QUORUM" && \
|
||||
t_skip "Need enough non-quorum clients to unmount"
|
||||
|
||||
for i in $(t_fs_nrs); do
|
||||
mounted[$i]=1
|
||||
done
|
||||
|
||||
LENGTH=60
|
||||
echo "== ${LENGTH}s of unmounting non-quorum clients during recovery"
|
||||
END=$((SECONDS + LENGTH))
|
||||
while [ "$SECONDS" -lt "$END" ]; do
|
||||
sv=$(t_server_nr)
|
||||
rid=$(t_mount_rid $sv)
|
||||
echo "sv $sv rid $rid" >> "$T_TMP.log"
|
||||
sync
|
||||
t_umount $sv &
|
||||
|
||||
# unmount every mount numbered at or above the quorum count in the background
for i in $(t_fs_nrs); do
	if [ "$i" -ge "$quorum_nr" ]; then
		t_umount $i &
		# record the background job's pid so the log line has a real value
		pid=$!
		echo "umount $i pid $pid quo $quorum_nr" \
			>> $T_TMP.log
		mounted[$i]=0
	fi
done
|
||||
|
||||
wait
|
||||
|
||||
t_mount $sv &
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "${mounted[$i]}" == 0 ]; then
|
||||
t_mount $i &
|
||||
fi
|
||||
done
|
||||
|
||||
wait
|
||||
|
||||
declare RID_LIST=$(cat /sys/fs/scoutfs/*/rid | sort -u)
|
||||
read -a rid_arr <<< $RID_LIST
|
||||
|
||||
declare LOCK_LIST=$(cut -d' ' -f 5 /sys/kernel/debug/scoutfs/*/server_locks | sort -u)
|
||||
read -a lock_arr <<< $LOCK_LIST
|
||||
|
||||
for i in "${lock_arr[@]}"; do
|
||||
if [[ ! " ${rid_arr[*]} " =~ " $i " ]]; then
|
||||
t_fail "RID($i): exists when not mounted"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
t_pass
|
||||
@@ -1,38 +0,0 @@
|
||||
|
||||
t_require_commands fallocate cat
|
||||
|
||||
echo "== creating reasonably large per-mount files"
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/file-$n"
|
||||
|
||||
LC_ALL=C fallocate -l 128MiB "$path" || \
|
||||
t_fail "initial creating fallocate failed"
|
||||
done
|
||||
|
||||
#
|
||||
# we had lock inversions between read and fallocate, dropping
|
||||
# the cache each time forces waiting for IO during the calls
|
||||
# with the inverted locks held so we have a better chance
|
||||
# of the deadlock happening.
|
||||
#
|
||||
DURATION=10
|
||||
echo "== ${DURATION}s of racing cold reads and fallocate nop"
|
||||
END=$((SECONDS + DURATION))
|
||||
while [ $SECONDS -le $END ]; do

	echo 3 > /proc/sys/vm/drop_caches

	# remember each background job so its exit status can be checked
	pids=""
	for n in $(t_fs_nrs); do
		eval path="\$T_D${n}/file-$n"

		LC_ALL=C fallocate -o 0 -l 4KiB "$path" &
		pids="$pids $!"
		cat "$path" > /dev/null &
		pids="$pids $!"
	done

	# a bare "wait" always returns 0, so wait on each pid to see failures
	for pid in $pids; do
		wait $pid || t_fail "fallocate or cat failed"
	done
done
|
||||
|
||||
echo "== cleaning up files"
|
||||
rm -f "$T_D0"/file-*
|
||||
|
||||
t_pass
|
||||
@@ -45,18 +45,6 @@ check_read_write()
|
||||
fi
|
||||
}
|
||||
|
||||
# verify that fenced ran our testing fence script
|
||||
verify_fenced_run()
|
||||
{
|
||||
local rids="$@"
|
||||
local rid
|
||||
|
||||
for rid in $rids; do
|
||||
grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \
|
||||
t_fail "fenced didn't execute RUN script for rid $rid"
|
||||
done
|
||||
}
|
||||
|
||||
echo "== make sure all mounts can see each other"
|
||||
check_read_write
|
||||
|
||||
@@ -74,14 +62,12 @@ done
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rid
|
||||
t_mount $cl
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount all non-server, connection timeout, fence nop, mount"
|
||||
sv=$(t_server_nr)
|
||||
pattern="nonsense"
|
||||
rids=""
|
||||
sync
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
@@ -89,7 +75,6 @@ for cl in $(t_fs_nrs); do
|
||||
fi
|
||||
|
||||
rid=$(t_mount_rid $cl)
|
||||
rids="$rids $rid"
|
||||
pattern="$pattern|$rid"
|
||||
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
|
||||
|
||||
@@ -104,7 +89,6 @@ done
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rids
|
||||
# remount all the clients
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
@@ -125,17 +109,11 @@ t_wait_for_leader
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rid
|
||||
t_mount $sv
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount everything, new server fences all previous"
|
||||
sync
|
||||
rids=""
|
||||
# get rids before forced unmount breaks scoutfs statfs
|
||||
for nr in $(t_fs_nrs); do
|
||||
rids="$rids $(t_mount_rid $nr)"
|
||||
done
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_force_umount $nr
|
||||
done
|
||||
@@ -144,7 +122,6 @@ t_mount_all
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rids
|
||||
check_read_write
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -26,17 +26,9 @@ inode_exists()
|
||||
{
|
||||
local ino="$1"
|
||||
|
||||
scoutfs get-allocated-inos -i "$ino" -s -p "$T_M0" > $T_TMP.inos.log 2>&1
|
||||
test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino"
|
||||
handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
|
||||
}
|
||||
|
||||
t_save_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
restore_delays()
|
||||
{
|
||||
t_restore_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
}
|
||||
trap restore_delays EXIT
|
||||
|
||||
echo "== test our inode existance function"
|
||||
path="$T_D0/file"
|
||||
touch "$path"
|
||||
@@ -45,7 +37,6 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
|
||||
echo "== unlinked and opened inodes still exist"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pid="$!"
|
||||
rm -f "$path"
|
||||
inode_exists $ino || echo "$ino didn't exist"
|
||||
@@ -53,8 +44,7 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
echo "== orphan from failed evict deletion is picked up"
|
||||
# pending kill signal stops evict from getting locks and deleting
|
||||
silent_kill $pid
|
||||
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
|
||||
sleep 5
|
||||
sleep 55
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
|
||||
echo "== orphaned inos in all mounts all deleted"
|
||||
@@ -65,7 +55,6 @@ for nr in $(t_fs_nrs); do
|
||||
touch "$path"
|
||||
inos="$inos $(stat -c %i $path)"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
@@ -80,63 +69,9 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
# wait for orphan scans to run
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
|
||||
# also have to wait for delayed log merge work from mount
|
||||
sleep 15
|
||||
sleep 55
|
||||
for ino in $inos; do
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
done
|
||||
|
||||
RUNTIME=30
|
||||
echo "== ${RUNTIME}s of racing evict deletion, orphan scanning, and open by handle"
|
||||
|
||||
# exclude last client mount
|
||||
last=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
last=$nr
|
||||
done
|
||||
|
||||
END=$((SECONDS + RUNTIME))
|
||||
while [ $SECONDS -lt $END ]; do
|
||||
# hold open per-mount unlinked files
|
||||
pids=""
|
||||
ino_args=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_D${nr}/racing-$nr"
|
||||
touch "$path"
|
||||
ino_args="$ino_args -i $(stat -c %i $path)"
|
||||
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for sleep to start and open input :/
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
|
||||
# remount excluded last client to force log merging and make orphan visible
|
||||
sync
|
||||
t_umount $last
|
||||
t_mount $last
|
||||
|
||||
# get all mounts scanning orphans at high frequency
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 100
|
||||
|
||||
# spin having tasks in each mount trying to open/fsetxattr all inos
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_M${nr}"
|
||||
handle_fsetxattr -e $ino_args -m "$path" -s 2 &
|
||||
done
|
||||
|
||||
# trigger eviction deletion of each file in each mount
|
||||
silent_kill $pids
|
||||
|
||||
wait || t_fail "handle_fsetxattr failed"
|
||||
|
||||
# slow down orphan scanning for the next iteration
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms $(((RUNTIME * 2) * 1000))
|
||||
done
|
||||
|
||||
t_pass
|
||||
|
||||
35
utils/fenced/local-force-unmount
Executable file
35
utils/fenced/local-force-unmount
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/bash
|
||||
|
||||
echo_fail() {
|
||||
echo "$@" > /dev/stderr
|
||||
exit 1
|
||||
}
|
||||
|
||||
rid="$SCOUTFS_FENCED_REQ_RID"
|
||||
|
||||
#
|
||||
# Look for a local mount with the rid to fence. Typically we'll at
|
||||
# least find the mount with the server that requested the fence that
|
||||
# we're processing. But it's possible that mounts are unmounted
|
||||
# before, or while, we're running.
|
||||
#
|
||||
mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
|
||||
echo_fail "findmnt -t scoutfs failed" > /dev/stderr
|
||||
|
||||
# force unmount the first local scoutfs mount whose rid matches the request
for mnt in $mnts; do
	mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
		echo_fail "scoutfs statfs $mnt failed"

	if [ "$mnt_rid" == "$rid" ]; then
		umount -f "$mnt" || \
			echo_fail "umount -f $mnt failed"

		exit 0
	fi
done
|
||||
|
||||
#
|
||||
# If the mount doesn't exist on this host then it can't access the
|
||||
# devices by definition and can be considered fenced.
|
||||
#
|
||||
exit 0
|
||||
@@ -55,21 +55,9 @@ test -x "$SCOUTFS_FENCED_RUN" || \
|
||||
error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"
|
||||
|
||||
#
|
||||
# Main loop watching for fence request across all filesystems. The
|
||||
# server can shut down without waiting for pending fence requests to
|
||||
# finish. All of the interaction with the fence directory and files can
|
||||
# fail at any moment. We will generate log messages when the dir or
|
||||
# files disappear.
|
||||
# main loop watching for fence request across all filesystems
|
||||
#
|
||||
|
||||
# generate failure messages to stderr while still echoing 0 for the caller
|
||||
careful_cat()
|
||||
{
|
||||
local path="$@"
|
||||
|
||||
cat "$@" || echo 0
|
||||
}
|
||||
|
||||
while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
for fence in /sys/fs/scoutfs/*/fence/*; do
|
||||
# catches the unmatched glob pattern when there are no fence dirs
|
||||
@@ -78,8 +66,7 @@ while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
fi
|
||||
|
||||
# skip requests that have been handled
|
||||
if [ "$(careful_cat $fence/fenced)" == 1 -o \
|
||||
"$(careful_cat $fence/error)" == 1 ]; then
|
||||
if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -94,10 +81,10 @@ while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
export SCOUTFS_FENCED_REQ_RID="$rid"
|
||||
export SCOUTFS_FENCED_REQ_IP="$ip"
|
||||
|
||||
$SCOUTFS_FENCED_RUN $SCOUTFS_FENCED_RUN_ARGS
|
||||
$run $SCOUTFS_FENCED_RUN_ARGS
|
||||
rc=$?
|
||||
if [ "$rc" != 0 ]; then
|
||||
log_message "server $srv fencing rid $rid saw error status $rc"
|
||||
log_message "server $srv fencing rid $rid saw error status $rc from $run"
|
||||
echo 1 > "$fence/error"
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
# delay, in seconds, between each check for pending fence requests.
|
||||
SCOUTFS_FENCED_DELAY=1
|
||||
# path to executable to run to service fence request
|
||||
#SCOUTFS_FENCED_RUN=
|
||||
# arguments to pass to binary
|
||||
SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount
|
||||
SCOUTFS_FENCED_RUN_ARGS=""
|
||||
|
||||
@@ -21,21 +21,6 @@ contains the filesystem's metadata.
|
||||
.sp
|
||||
This option is required.
|
||||
.TP
|
||||
.B orphan_scan_delay_ms=<number>
|
||||
This option sets the average expected delay, in milliseconds, between
|
||||
each mount's scan of the global orphaned inode list. Jitter is added to
|
||||
avoid contention so each individual delay between scans is a random
|
||||
value up to 20% less than or greater than this average expected delay.
|
||||
.sp
|
||||
The minimum value for this option is 100ms which is very short and is
|
||||
only reasonable for testing or experiments. The default is 10000ms (10
|
||||
seconds) and the maximum is 60000ms (1 minute).
|
||||
.sp
|
||||
This option can be changed in an active mount by writing to its file in
|
||||
the options directory in the mount's sysfs directory. Writing a new
|
||||
value will cause the next pending orphan scan to be rescheduled
|
||||
with the newly written delay time.
|
||||
.TP
|
||||
.B quorum_slot_nr=<number>
|
||||
The quorum_slot_nr option assigns a quorum member slot to the mount.
|
||||
The mount will use the slot assignment to claim exclusive ownership of
|
||||
|
||||
@@ -15,7 +15,7 @@ environment variable. If that variable is also absent the current working
|
||||
directory will be used.
|
||||
|
||||
.TP
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline] META-DEVICE DATA-DEVICE"
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.sp
|
||||
Change the format version of an existing file system. The maximum
|
||||
supported version is used by default. A specific version in the range
|
||||
@@ -25,7 +25,7 @@ output of --help.
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-F, --offline"
|
||||
.B "-F, --offline META-DEVICE DATA-DEVICE"
|
||||
Change the format version by writing directly to the metadata and data
|
||||
devices. Like mkfs, this writes directly to the devices without
|
||||
protection and must only be used on completely unmounted devices. The
|
||||
@@ -43,7 +43,7 @@ the super blocks on both devices.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "change-quorum-config {-Q|--quorum-slot NR,ADDR,PORT} [-F|--offline] META-DEVICE"
|
||||
.BI "change-quorum-config {-Q|--quorum-slot} NR,ADDR,PORT [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.sp
|
||||
Change the quorum configuration for an existing file system. The new
|
||||
configuration completely replaces the old configuration. Any slots
|
||||
@@ -61,7 +61,7 @@ multiple arguments as described in the
|
||||
.B mkfs
|
||||
command.
|
||||
.TP
|
||||
.B "-F, --offline"
|
||||
.B "-F, --offline META-DEVICE"
|
||||
Perform the change offline by updating the superblock in the metadata
|
||||
device. The command will read the super block and refuse to make the
|
||||
change if it sees any evidence that the metadata device is currently in
|
||||
@@ -617,33 +617,6 @@ command is used first.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]"
|
||||
.sp
|
||||
This debugging command prints allocated inode numbers. It only prints
|
||||
inodes
|
||||
found in the group that contains the starting inode. The printed inode
|
||||
numbers aren't necessarily reachable. They could be anywhere in the
|
||||
process from being unlinked to finally deleted when their items
|
||||
were found.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-i, --ino INO"
|
||||
The first 64bit inode number which could be printed.
|
||||
.TP
|
||||
.B "-s, --single"
|
||||
Only print the single starting inode when it is allocated, all other allocated
|
||||
inode numbers will be ignored.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
|
||||
.SH SEE ALSO
|
||||
.BR scoutfs (5),
|
||||
.BR xattr (7),
|
||||
|
||||
@@ -55,6 +55,7 @@ install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
|
||||
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
|
||||
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
|
||||
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
|
||||
install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount
|
||||
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
|
||||
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
|
||||
|
||||
|
||||
@@ -222,7 +222,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"",
|
||||
"Change format version of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE",
|
||||
"",
|
||||
"Change quorum slots and addresses of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "cmd.h"
|
||||
|
||||
struct get_allocated_inos_args {
|
||||
char *path;
|
||||
u64 ino;
|
||||
bool have_ino;
|
||||
bool single;
|
||||
};
|
||||
|
||||
static int do_get_allocated_inos(struct get_allocated_inos_args *args)
|
||||
{
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
u64 *inos = NULL;
|
||||
int fd = -1;
|
||||
u64 bytes;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (args->single)
|
||||
bytes = sizeof(*inos);
|
||||
else
|
||||
bytes = SCOUTFS_LOCK_INODE_GROUP_NR * sizeof(*inos);
|
||||
|
||||
inos = malloc(bytes);
|
||||
if (!inos) {
|
||||
fprintf(stderr, "inode number array allocation failed\n");
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
fd = get_path(args->path, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
memset(&gai, 0, sizeof(gai));
|
||||
gai.start_ino = args->ino;
|
||||
gai.inos_ptr = (unsigned long)inos;
|
||||
gai.inos_bytes = bytes;
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_GET_ALLOCATED_INOS, &gai);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "get_allocated_inos ioctl failed: "
|
||||
"%s (%d)\n", strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->single && ret > 0 && inos[0] != args->ino)
|
||||
ret = 0;
|
||||
|
||||
for (i = 0; i < ret; i++)
|
||||
printf("%llu\n", inos[i]);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
free(inos);
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct get_allocated_inos_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'i':
|
||||
ret = parse_u64(arg, &args->ino);
|
||||
if (ret)
|
||||
return ret;
|
||||
args->have_ino = true;
|
||||
case 'p':
|
||||
args->path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case 's':
|
||||
args->single = true;
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->have_ino)
|
||||
argp_error(state, "must provide --ino starting inode number option");
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "ino", 'i', "NUMBER", 0, "Start from 64bit inode number (required)"},
|
||||
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
|
||||
{ "single", 's', NULL, 0, "Only print single specific inode number argument"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
NULL,
|
||||
"Print allocated inode numbers from starting inode number"
|
||||
};
|
||||
|
||||
static int get_allocated_inos_cmd(int argc, char **argv)
|
||||
{
|
||||
|
||||
struct get_allocated_inos_args get_allocated_inos_args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &get_allocated_inos_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_get_allocated_inos(&get_allocated_inos_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) get_allocated_inos_ctor(void)
|
||||
{
|
||||
cmd_register_argp("get-allocated-inos", &argp, GROUP_DEBUG, get_allocated_inos_cmd);
|
||||
}
|
||||
@@ -278,8 +278,6 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void
|
||||
" data_freed: "ALCROOT_F"\n"
|
||||
" srch_file: "SRF_FMT"\n"
|
||||
" inode_count_delta: %lld\n"
|
||||
" get_trans_seq: %lld\n"
|
||||
" commit_trans_seq: %lld\n"
|
||||
" max_item_seq: %llu\n"
|
||||
" finalize_seq: %llu\n"
|
||||
" rid: %016llx\n"
|
||||
@@ -298,8 +296,6 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void
|
||||
ALCROOT_A(<->data_freed),
|
||||
SRF_A(<->srch_file),
|
||||
le64_to_cpu(lt->inode_count_delta),
|
||||
le64_to_cpu(lt->get_trans_seq),
|
||||
le64_to_cpu(lt->commit_trans_seq),
|
||||
le64_to_cpu(lt->max_item_seq),
|
||||
le64_to_cpu(lt->finalize_seq),
|
||||
le64_to_cpu(lt->rid),
|
||||
|
||||
Reference in New Issue
Block a user