mirror of
https://github.com/versity/scoutfs.git
synced 2026-06-09 21:22:36 +00:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5bea29a168 | |||
| 7a999f2657 | |||
| 166ab58b99 | |||
| 8bc1ee8346 | |||
| 285b68879a | |||
| 1ac3efe701 | |||
| ce76682db7 | |||
| 686f8515bc | |||
| 93bc52cc54 | |||
| 1108d1288a | |||
| 0abcd5a004 | |||
| 888ad8ec5c | |||
| 16ea0ef671 | |||
| 1b8e3f7c05 | |||
| 3ae0ebd0d8 | |||
| 714b7f2a84 | |||
| 945f8b4828 | |||
| 95f2a87864 | |||
| 38ee2defd5 | |||
| 0fc8ccb122 | |||
| e4a3c2b95d | |||
| cf4e6611d3 | |||
| 65429a9cc4 | |||
| 83a6bbb640 |
@@ -1,6 +1,19 @@
|
||||
Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.x
|
||||
\
|
||||
*TBD*
|
||||
|
||||
|
||||
* **Add scoutfs(1) change-quorum-config command**
|
||||
\
|
||||
Add a change-quorum-config command to scoutfs(1) to change the quorum
|
||||
configuration stored in the metadata device while the file system is
|
||||
unmounted. This can be used to change the mounts that will
|
||||
participate in quorum and the IP addresses they use.
|
||||
|
||||
---
|
||||
v1.0
|
||||
\
|
||||
|
||||
@@ -13,6 +13,7 @@ scoutfs-y += \
|
||||
block.o \
|
||||
btree.o \
|
||||
client.o \
|
||||
cwskip.o \
|
||||
counters.o \
|
||||
data.o \
|
||||
dir.o \
|
||||
|
||||
+20
-23
@@ -1233,10 +1233,6 @@ static int btree_walk(struct super_block *sb,
|
||||
WARN_ON_ONCE((flags & (BTW_GET_PAR|BTW_SET_PAR)) && !par_root))
|
||||
return -EINVAL;
|
||||
|
||||
/* all ops come through walk and walk calls all reads */
|
||||
if (scoutfs_forcing_unmount(sb))
|
||||
return -EIO;
|
||||
|
||||
scoutfs_inc_counter(sb, btree_walk);
|
||||
|
||||
restart:
|
||||
@@ -1879,12 +1875,11 @@ out:
|
||||
* set in btree items. They're only used for fs items written through
|
||||
* the item cache and forest of log btrees.
|
||||
*/
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg)
|
||||
{
|
||||
struct scoutfs_btree_item_desc desc;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_btree_block *bt;
|
||||
@@ -1893,44 +1888,46 @@ int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
int cmp;
|
||||
int ret = 0;
|
||||
|
||||
while (lst) {
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
|
||||
while (pos) {
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
|
||||
&lst->key, lst->val_len, &bl, &kr, NULL);
|
||||
desc.key, desc.val_len, &bl, &kr, NULL);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
bt = bl->data;
|
||||
|
||||
do {
|
||||
item = leaf_item_hash_search(sb, bt, &lst->key);
|
||||
item = leaf_item_hash_search(sb, bt, desc.key);
|
||||
if (item) {
|
||||
/* try to merge delta values, _NULL not deleted; merge will */
|
||||
ret = scoutfs_forest_combine_deltas(&lst->key,
|
||||
ret = scoutfs_forest_combine_deltas(desc.key,
|
||||
item_val(bt, item),
|
||||
item_val_len(item),
|
||||
lst->val, lst->val_len);
|
||||
desc.val, desc.val_len);
|
||||
if (ret < 0) {
|
||||
scoutfs_block_put(sb, bl);
|
||||
goto out;
|
||||
}
|
||||
|
||||
item->seq = cpu_to_le64(lst->seq);
|
||||
item->flags = lst->flags;
|
||||
item->seq = cpu_to_le64(desc.seq);
|
||||
item->flags = desc.flags;
|
||||
|
||||
if (ret == 0)
|
||||
update_item_value(bt, item, lst->val, lst->val_len);
|
||||
update_item_value(bt, item, desc.val, desc.val_len);
|
||||
else
|
||||
ret = 0;
|
||||
} else {
|
||||
scoutfs_avl_search(&bt->item_root,
|
||||
cmp_key_item, &lst->key,
|
||||
cmp_key_item, desc.key,
|
||||
&cmp, &par, NULL, NULL);
|
||||
create_item(bt, &lst->key, lst->seq, lst->flags, lst->val,
|
||||
lst->val_len, par, cmp);
|
||||
create_item(bt, desc.key, desc.seq, desc.flags, desc.val,
|
||||
desc.val_len, par, cmp);
|
||||
}
|
||||
|
||||
lst = lst->next;
|
||||
} while (lst && scoutfs_key_compare(&lst->key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, lst->val_len));
|
||||
pos = iter_cb(sb, &desc, pos, arg);
|
||||
} while (pos && scoutfs_key_compare(desc.key, &kr.end) <= 0 &&
|
||||
mid_free_item_room(bt, desc.val_len));
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
}
|
||||
|
||||
+17
-6
@@ -18,11 +18,24 @@ struct scoutfs_btree_item_ref {
|
||||
#define SCOUTFS_BTREE_ITEM_REF(name) \
|
||||
struct scoutfs_btree_item_ref name = {NULL,}
|
||||
|
||||
/* caller gives an item to the callback */
|
||||
/* btree gives an item to caller */
|
||||
typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
|
||||
struct scoutfs_key *key, u64 seq, u8 flags,
|
||||
void *val, int val_len, void *arg);
|
||||
|
||||
struct scoutfs_btree_item_desc {
|
||||
struct scoutfs_key *key;
|
||||
void *val;
|
||||
u64 seq;
|
||||
u8 flags;
|
||||
unsigned val_len;
|
||||
};
|
||||
|
||||
/* btree iterates through items from caller */
|
||||
typedef void *(*scoutfs_btree_item_iter_cb)(struct super_block *sb,
|
||||
struct scoutfs_btree_item_desc *desc,
|
||||
void *pos, void *arg);
|
||||
|
||||
/* simple singly-linked list of items */
|
||||
struct scoutfs_btree_item_list {
|
||||
struct scoutfs_btree_item_list *next;
|
||||
@@ -78,11 +91,9 @@ int scoutfs_btree_read_items(struct super_block *sb,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end,
|
||||
scoutfs_btree_item_cb cb, void *arg);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_btree_insert_list(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_btree_root *root,
|
||||
scoutfs_btree_item_iter_cb iter_cb, void *pos, void *arg);
|
||||
|
||||
int scoutfs_btree_parent_range(struct super_block *sb,
|
||||
struct scoutfs_btree_root *root,
|
||||
|
||||
@@ -668,3 +668,11 @@ void scoutfs_client_destroy(struct super_block *sb)
|
||||
kfree(client);
|
||||
sbi->client_info = NULL;
|
||||
}
|
||||
|
||||
void scoutfs_client_net_shutdown(struct super_block *sb)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
|
||||
if (client && client->conn)
|
||||
scoutfs_net_shutdown(sb, client->conn);
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_op
|
||||
int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd);
|
||||
int scoutfs_client_statfs(struct super_block *sb, struct scoutfs_net_statfs *nst);
|
||||
|
||||
void scoutfs_client_net_shutdown(struct super_block *sb);
|
||||
int scoutfs_client_setup(struct super_block *sb);
|
||||
void scoutfs_client_destroy(struct super_block *sb);
|
||||
|
||||
|
||||
+11
-20
@@ -90,36 +90,27 @@
|
||||
EXPAND_COUNTER(forest_read_items) \
|
||||
EXPAND_COUNTER(forest_roots_next_hint) \
|
||||
EXPAND_COUNTER(forest_set_bloom_bits) \
|
||||
EXPAND_COUNTER(item_alloc_bytes) \
|
||||
EXPAND_COUNTER(item_clear_dirty) \
|
||||
EXPAND_COUNTER(item_create) \
|
||||
EXPAND_COUNTER(item_delete) \
|
||||
EXPAND_COUNTER(item_delta) \
|
||||
EXPAND_COUNTER(item_delta_written) \
|
||||
EXPAND_COUNTER(item_dirty) \
|
||||
EXPAND_COUNTER(item_free_bytes) \
|
||||
EXPAND_COUNTER(item_invalidate) \
|
||||
EXPAND_COUNTER(item_invalidate_page) \
|
||||
EXPAND_COUNTER(item_invalidate_item) \
|
||||
EXPAND_COUNTER(item_lookup) \
|
||||
EXPAND_COUNTER(item_mark_dirty) \
|
||||
EXPAND_COUNTER(item_next) \
|
||||
EXPAND_COUNTER(item_page_accessed) \
|
||||
EXPAND_COUNTER(item_page_alloc) \
|
||||
EXPAND_COUNTER(item_page_clear_dirty) \
|
||||
EXPAND_COUNTER(item_page_compact) \
|
||||
EXPAND_COUNTER(item_page_free) \
|
||||
EXPAND_COUNTER(item_page_lru_add) \
|
||||
EXPAND_COUNTER(item_page_lru_remove) \
|
||||
EXPAND_COUNTER(item_page_mark_dirty) \
|
||||
EXPAND_COUNTER(item_page_rbtree_walk) \
|
||||
EXPAND_COUNTER(item_page_split) \
|
||||
EXPAND_COUNTER(item_pcpu_add_replaced) \
|
||||
EXPAND_COUNTER(item_pcpu_page_hit) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss) \
|
||||
EXPAND_COUNTER(item_pcpu_page_miss_keys) \
|
||||
EXPAND_COUNTER(item_read_pages_split) \
|
||||
EXPAND_COUNTER(item_shrink_page) \
|
||||
EXPAND_COUNTER(item_shrink_page_dirty) \
|
||||
EXPAND_COUNTER(item_shrink_page_reader) \
|
||||
EXPAND_COUNTER(item_shrink_page_trylock) \
|
||||
EXPAND_COUNTER(item_shrink) \
|
||||
EXPAND_COUNTER(item_shrink_all) \
|
||||
EXPAND_COUNTER(item_shrink_exhausted) \
|
||||
EXPAND_COUNTER(item_shrink_read_search) \
|
||||
EXPAND_COUNTER(item_shrink_removed) \
|
||||
EXPAND_COUNTER(item_shrink_searched) \
|
||||
EXPAND_COUNTER(item_shrink_skipped) \
|
||||
EXPAND_COUNTER(item_shrink_write_search) \
|
||||
EXPAND_COUNTER(item_update) \
|
||||
EXPAND_COUNTER(item_write_dirty) \
|
||||
EXPAND_COUNTER(lock_alloc) \
|
||||
|
||||
@@ -0,0 +1,584 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include "cwskip.h"
|
||||
|
||||
/*
|
||||
* This skip list is built to allow concurrent modification and limit
|
||||
* contention to the region of the list around the modification. All
|
||||
* node references are protected by RCU. Each node has a write_seq
|
||||
* that works like a seqlock, the big differences are that we nest them
|
||||
* and use trylock to acquire them.
|
||||
*
|
||||
* Readers sample the write_seqs of nodes containing links as they
|
||||
* traverse them, verifying that the node hasn't been modified before
|
||||
* traversing to the node referenced by the link.
|
||||
*
|
||||
* Writers remember the seqs of all the nodes they traversed to end up
|
||||
* at their final node. They try to acquire the lock of all the nodes
|
||||
* needed to modify the list at a given height. Their trylocks will
|
||||
* fail if any of the nodes have changed since their traversal.
|
||||
*
|
||||
* The interface is built around references to adjacent pairs of nodes
|
||||
* and their sequence numbers. This lets readers and writers traverse
|
||||
* through their local region of the list until they hit contention and
|
||||
* must start over with a full search.
|
||||
*
|
||||
* The caller is responsible for allocating and freeing nodes. The
|
||||
* interface is built around caller's objects which each have embedded
|
||||
* nodes.
|
||||
*/
|
||||
|
||||
/*
|
||||
* node_off is the positive offset of the cwskip node within the
|
||||
* container structs stored in the list. The node_off is subtracted
|
||||
* from node pointers to give the caller a pointer to their stored
|
||||
* container struct.
|
||||
*/
|
||||
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
|
||||
unsigned long node_off)
|
||||
{
|
||||
memset(root, 0, sizeof(&root));
|
||||
root->cmp_fn = cmp_fn;
|
||||
root->node_off = node_off;
|
||||
}
|
||||
|
||||
/* This is completely racey and should be used accordingly. */
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < SCOUTFS_CWSKIP_MAX_HEIGHT; i++) {
|
||||
if (root->node.links[i] != NULL)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a random height between 1 and max height, inclusive. Using
|
||||
* ffs means that each greater height relies on all lower height bits
|
||||
* being clear and we get the height distribution we want: 1 = 1/2,
|
||||
* 2 = 1/4, 3 = 1/8, etc.
|
||||
*/
|
||||
int scoutfs_cwskip_rand_height(void)
|
||||
{
|
||||
return ffs(prandom_u32() | (1 << (SCOUTFS_CWSKIP_MAX_HEIGHT - 1)));
|
||||
}
|
||||
|
||||
static void *node_container(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
return node ? (void *)((unsigned long)node - root->node_off) : NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the caller's containers for the given nodes. There isn't a
|
||||
* previous container when the previous node is the root's static
|
||||
* full-height node.
|
||||
*/
|
||||
static void set_containers(struct scoutfs_cwskip_root *root, struct scoutfs_cwskip_node *prev,
|
||||
struct scoutfs_cwskip_node *node, void **prev_cont, void **node_cont)
|
||||
{
|
||||
if (prev_cont)
|
||||
*prev_cont = (prev != &root->node) ? node_container(root, prev) : NULL;
|
||||
if (node_cont)
|
||||
*node_cont = node_container(root, node);
|
||||
}
|
||||
|
||||
static struct scoutfs_cwskip_node *node_read_begin(struct scoutfs_cwskip_node *node,
|
||||
unsigned int *seq)
|
||||
{
|
||||
if (node) {
|
||||
*seq = READ_ONCE(node->write_seq) & ~1U;
|
||||
smp_rmb();
|
||||
} else {
|
||||
*seq = 1; /* caller shouldn't use if we return null, being careful */
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
static bool node_read_retry(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (node) {
|
||||
smp_rmb();
|
||||
return READ_ONCE(node->write_seq) != seq;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* write_seq is only an int to reduce the size of nodes and full-height
|
||||
* seq arrays, it could be a long if archs have trouble with int
|
||||
* cmpxchg.
|
||||
*/
|
||||
static bool __node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
if (seq & 1)
|
||||
return false;
|
||||
|
||||
return cmpxchg(&node->write_seq, seq, seq + 1) == seq;
|
||||
}
|
||||
|
||||
static bool node_trylock(struct scoutfs_cwskip_node *node, unsigned int seq)
|
||||
{
|
||||
bool locked = __node_trylock(node, seq);
|
||||
if (locked)
|
||||
smp_wmb();
|
||||
return locked;
|
||||
}
|
||||
|
||||
static void __node_unlock(struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
node->write_seq++;
|
||||
}
|
||||
|
||||
/* Unlock a locked node and then issue a write barrier. */
static void node_unlock(struct scoutfs_cwskip_node *node)
{
	__node_unlock(node);
	smp_wmb();
}
|
||||
|
||||
/*
 * A comparison function that ignores its arguments and randomly
 * returns -1 or 1 to go left or right, never 0.
 */
static int random_cmp(void *K, void *C)
{
	/* a single random bit chooses the direction */
	return (prandom_u32() & 2) ? 1 : -1;
}
|
||||
|
||||
static void cwskip_search(struct scoutfs_cwskip_root *root, void *key, int *node_cmp,
|
||||
struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_writer *wr,
|
||||
unsigned int *prev_seqs)
|
||||
{
|
||||
struct scoutfs_cwskip_node *prev;
|
||||
struct scoutfs_cwskip_node *node;
|
||||
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
unsigned int prev_seq;
|
||||
unsigned int node_seq;
|
||||
int level;
|
||||
int cmp;
|
||||
|
||||
if (key == NULL)
|
||||
cmp_fn = random_cmp;
|
||||
|
||||
restart:
|
||||
prev = node_read_begin(&root->node, &prev_seq);
|
||||
node = NULL;
|
||||
node_seq = 1;
|
||||
cmp = -1;
|
||||
|
||||
level = SCOUTFS_CWSKIP_MAX_HEIGHT - 1;
|
||||
while (prev && level >= 0) {
|
||||
node = node_read_begin(prev->links[level], &node_seq);
|
||||
if (!node) {
|
||||
cmp = -1;
|
||||
level--;
|
||||
continue;
|
||||
}
|
||||
|
||||
cmp = cmp_fn(key, node_container(root, node));
|
||||
if (cmp > 0) {
|
||||
if (node_read_retry(prev, prev_seq))
|
||||
goto restart;
|
||||
prev = node;
|
||||
prev_seq = node_seq;
|
||||
node = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wr) {
|
||||
wr->prevs[level] = prev;
|
||||
prev_seqs[level] = prev_seq;
|
||||
}
|
||||
|
||||
level--;
|
||||
}
|
||||
|
||||
rd->prev = prev;
|
||||
rd->prev_seq = prev_seq;
|
||||
rd->node = node;
|
||||
rd->node_seq = node_seq;
|
||||
*node_cmp = cmp;
|
||||
}
|
||||
|
||||
static void init_reader(struct scoutfs_cwskip_reader *rd, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(rd, 0, sizeof(struct scoutfs_cwskip_reader));
|
||||
rd->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and returns nodes that surround the search key.
|
||||
*
|
||||
* Either prev or null can be null if there are no nodes before or after
|
||||
* the search key. *node_cmp is set to the final comparison of the key
|
||||
* and the returned node's container key, it will be 0 if an exact match
|
||||
* is found.
|
||||
*
|
||||
* This starts an RCU read critical section and is fully concurrent with
|
||||
* both other readers and writers. The nodes won't be freed until
|
||||
* after the section so its always safe to reference them but their
|
||||
* contents might be nonsense if they're modified during the read.
|
||||
* Nothing learned from the list during the read section should have an
|
||||
* effect until after _read_valid has said it was OK.
|
||||
*
|
||||
* _read_valid can be called after referencing the nodes to see if they
|
||||
* were stable during the read. _read_next can be used to iterate
|
||||
* forward through the list without repeating the search. The caller
|
||||
* must always call a matching _read_end once they're done.
|
||||
*/
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
init_reader(rd, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, rd, NULL, NULL);
|
||||
set_containers(root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true of the nodes referenced by the reader haven't been
|
||||
* modified and any references of them were consistent. Thsi does not
|
||||
* end the reader critical section and can be called multiple times.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd)
|
||||
{
|
||||
return !(node_read_retry(rd->prev, rd->prev_seq) &&
|
||||
node_read_retry(rd->node, rd->node_seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance from the current prev/node to the next pair of nodes in the
|
||||
* list. prev_cont is set to what node_cont was before the call.
|
||||
* node_cont is set to the next node after the current node_cont.
|
||||
*
|
||||
* This returns true if it found a next node and that its load of the
|
||||
* next pointer from node was valid and stable. Returning false means
|
||||
* that the caller should retry. There could be more items in the list.
|
||||
*/
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
unsigned int next_seq;
|
||||
bool valid_next;
|
||||
|
||||
next = rd->node ? node_read_begin(rd->node->links[0], &next_seq) : NULL;
|
||||
valid_next = scoutfs_cwskip_read_valid(rd) && next;
|
||||
if (valid_next) {
|
||||
rd->prev = rd->node;
|
||||
rd->prev_seq = rd->node_seq;
|
||||
rd->node = next;
|
||||
rd->node_seq = next_seq;
|
||||
|
||||
set_containers(rd->root, rd->prev, rd->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
return valid_next;
|
||||
}
|
||||
|
||||
/*
|
||||
* End the critical section started with _read_begin.
|
||||
*/
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Higher locks are more likely to cause contention so we unlock them
|
||||
* first.
|
||||
*/
|
||||
static void writer_unlock(struct scoutfs_cwskip_writer *wr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = wr->locked_height - 1; i >= 0; i--) {
|
||||
if (i == 0 || (wr->prevs[i - 1] != wr->prevs[i]))
|
||||
__node_unlock(wr->prevs[i]);
|
||||
}
|
||||
|
||||
if (wr->node_locked)
|
||||
__node_unlock(wr->node);
|
||||
|
||||
smp_wmb();
|
||||
|
||||
wr->locked_height = 0;
|
||||
wr->node_locked = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* A search traversal has saved all the previous nodes at each level.
|
||||
*
|
||||
* We try to acquire the write_seq locks for all the prevs up to height
|
||||
* from the seqs that we read during the search. The search was
|
||||
* protected by read sections so the prevs represent a consistent
|
||||
* version of the list at some point in the past. If nodes have been
|
||||
* locked since we read them we won't be able to acquire the locks.
|
||||
* Nodes aren't re-inserted after removal so we shouldn't see nodes in
|
||||
* multiple places (which would deadlock).
|
||||
*
|
||||
* The same node can be in multiple prev slots. We're careful to only
|
||||
* try locking the lowest duplicate slot.
|
||||
*
|
||||
* We lock from the highest level down. This only matters when there's
|
||||
* contention. The higher nodes are more likely to see contention so
|
||||
* we want trylock to fail early to avoid useless locking churn on lower
|
||||
* nodes.
|
||||
*/
|
||||
static bool writer_trylock(struct scoutfs_cwskip_writer *wr, unsigned int *prev_seqs, int height)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(wr->locked_height != 0) ||
|
||||
WARN_ON_ONCE(height < 1 || height > ARRAY_SIZE(wr->prevs)))
|
||||
return false;
|
||||
|
||||
for (i = height - 1; i >= 0; i--) {
|
||||
if ((i == 0 || wr->prevs[i - 1] != wr->prevs[i]) &&
|
||||
!__node_trylock(wr->prevs[i], prev_seqs[i]))
|
||||
break;
|
||||
wr->locked_height++;
|
||||
}
|
||||
|
||||
if (i < height) {
|
||||
writer_unlock(wr);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* paranoid debugging verification */
|
||||
for (i = 0; i < wr->locked_height; i++) {
|
||||
BUG_ON(wr->prevs[i]->height <= i);
|
||||
BUG_ON(wr->node && i < wr->node->height && wr->prevs[i]->links[i] != wr->node);
|
||||
}
|
||||
|
||||
smp_mb();
|
||||
return true;
|
||||
}
|
||||
|
||||
static void init_writer(struct scoutfs_cwskip_writer *wr, struct scoutfs_cwskip_root *root)
|
||||
{
|
||||
memset(wr, 0, sizeof(struct scoutfs_cwskip_writer));
|
||||
wr->root = root;
|
||||
}
|
||||
|
||||
/*
|
||||
* Search for and return references to the two nodes that surround the
|
||||
* search key, with the nodes locked.
|
||||
*
|
||||
* Either node can be null if there are no nodes before or after the
|
||||
* search key. We still hold a lock on the static root node if the
|
||||
* search key falls before the first node in the list.
|
||||
*
|
||||
* If lock_height is 0 then the caller is saying that they just want to
|
||||
* lock the surrounding nodes and not modify their position in the list.
|
||||
* We only lock those two nodes. Any greater lock_height represents a
|
||||
* height that we need to lock so the caller can insert an allocated
|
||||
* node with that height.
|
||||
*
|
||||
* The caller can use the writer context to iterate through locked nodes
|
||||
* via the lowest level list that contains all nodes. If they hit a
|
||||
* node that's higher than the locked height in the writer then they
|
||||
* have to unlock and restart because we don't have the previous node
|
||||
* for that height. We set a min level that we lock to reduce the
|
||||
* possibility of hitting higher nodes and retrying.
|
||||
*/
|
||||
#define MIN_LOCKED_HEIGHT 4
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr)
|
||||
__acquires(RCU) /* :/ */
|
||||
{
|
||||
unsigned int prev_seqs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
struct scoutfs_cwskip_reader rd;
|
||||
int node_height;
|
||||
int use_height;
|
||||
bool locked;
|
||||
|
||||
BUG_ON(WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT));
|
||||
|
||||
do {
|
||||
init_reader(&rd, root);
|
||||
init_writer(wr, root);
|
||||
|
||||
rcu_read_lock();
|
||||
cwskip_search(root, key, node_cmp, &rd, wr, NULL);
|
||||
|
||||
wr->node = rd.node;
|
||||
if (wr->node) {
|
||||
/* _trylock of prevs will issue barrier on success */
|
||||
if (!__node_trylock(wr->node, rd.node_seq)) {
|
||||
locked = false;
|
||||
continue;
|
||||
}
|
||||
wr->node_locked = true;
|
||||
node_height = wr->node->height;
|
||||
} else {
|
||||
node_height = 0;
|
||||
}
|
||||
|
||||
if (lock_height > 0)
|
||||
use_height = max3(MIN_LOCKED_HEIGHT, node_height, lock_height);
|
||||
else
|
||||
use_height = 1;
|
||||
|
||||
locked = writer_trylock(wr, prev_seqs, use_height);
|
||||
if (!locked)
|
||||
rcu_read_unlock();
|
||||
} while (!locked);
|
||||
|
||||
set_containers(root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new node between the writer's two locked nodes. The
|
||||
* inserting node is locked and replaces the existing node in the writer
|
||||
* which is unlocked.
|
||||
*
|
||||
* The next node may not exist. The previous nodes will always exist
|
||||
* though they may be the static root node.
|
||||
*
|
||||
* The inserting node is visible to readers the moment we store the
|
||||
* first link to it in previous nodes. We first lock it with a write
|
||||
* barrier so that any readers will retry if they visit it before all
|
||||
* its links are updated and its unlocked.
|
||||
*
|
||||
* We don't unlock prevs that are higher than the inserting node. This
|
||||
* lets the caller continue iterating through nodes that are higher than
|
||||
* insertion but still under the locked height.
|
||||
*/
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins)
|
||||
{
|
||||
struct scoutfs_cwskip_node *node = wr->node;
|
||||
int i;
|
||||
|
||||
BUG_ON(ins->height > wr->locked_height);
|
||||
node_trylock(ins, ins->write_seq);
|
||||
|
||||
for (i = 0; i < ins->height; i++) {
|
||||
ins->links[i] = wr->prevs[i]->links[i];
|
||||
wr->prevs[i]->links[i] = ins;
|
||||
}
|
||||
|
||||
if (node)
|
||||
node_unlock(node);
|
||||
wr->node = ins;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the node in the writer from the list. The writers node
|
||||
* pointer is not advanced because we don't want this to be able to fail
|
||||
* if trylock on the next node fails. The caller can call _write_next
|
||||
* on this writer and it will try and iterate from prevs[0].
|
||||
*
|
||||
* The caller's removal argument must be the node pointer in the writer.
|
||||
* This is redundant but meant to communicate to the caller that they're
|
||||
* responsible for the node after removing it (presumably queueing it
|
||||
* for freeing before _write_end leaves rcu).
|
||||
*
|
||||
* Readers can be traversing our node as we modify its pointers and can
|
||||
* read a temporarily inconsistent state. We have the node locked so
|
||||
* the reader will immediately retry once the check the seqs after
|
||||
* hitting our node that's being removed.
|
||||
*/
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node)
|
||||
{
|
||||
int i;
|
||||
|
||||
BUG_ON(node != wr->node);
|
||||
BUG_ON(node->height > wr->locked_height);
|
||||
|
||||
for (i = 0; i < node->height; i++) {
|
||||
wr->prevs[i]->links[i] = node->links[i];
|
||||
node->links[i] = NULL;
|
||||
}
|
||||
|
||||
node_unlock(node);
|
||||
wr->node = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance through the list by setting prevs to node and node to the
|
||||
* next node in the list after locking it. Returns true only if there
|
||||
* was a next node that we were able to lock. Returning false can mean
|
||||
* that we weren't able to lock the next node and the caller should
|
||||
* retry a full search.
|
||||
*
|
||||
* This may be called after _write_remove clears node so we try to
|
||||
* iterate from prev if there is no node.
|
||||
*
|
||||
* If lock_height is greater than zero then the caller needs at least
|
||||
* that lock_height to insert a node of that height. If locked_height
|
||||
* doesn't cover it then we return false so the caller can retry
|
||||
* _write_begin with the needed height.
|
||||
*
|
||||
* Like insertion, we don't unlock prevs higher than the height of the
|
||||
* next node. They're not strictly needed to modify the next node but
|
||||
* we want to keep them locked so the caller can continue to iterate
|
||||
* through nodes up to the locked height.
|
||||
*/
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont)
|
||||
{
|
||||
struct scoutfs_cwskip_node *next;
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(lock_height < 0 || lock_height > SCOUTFS_CWSKIP_MAX_HEIGHT))
|
||||
return false;
|
||||
|
||||
if (wr->node)
|
||||
next = rcu_dereference(wr->node->links[0]);
|
||||
else
|
||||
next = rcu_dereference(wr->prevs[0]->links[0]);
|
||||
|
||||
if (!next ||
|
||||
(lock_height > wr->locked_height) ||
|
||||
(lock_height > 0 && next->height > wr->locked_height) ||
|
||||
!__node_trylock(next, next->write_seq))
|
||||
return false;
|
||||
|
||||
if (!wr->node) {
|
||||
/* set next as missing node */
|
||||
wr->node = next;
|
||||
wr->node_locked = true;
|
||||
|
||||
} else {
|
||||
/* existing node becomes prevs for its height */
|
||||
__node_unlock(wr->prevs[0]);
|
||||
for (i = 0; i < wr->node->height; i++)
|
||||
wr->prevs[0] = wr->node;
|
||||
wr->node = next;
|
||||
}
|
||||
|
||||
smp_wmb(); /* next locked and prev unlocked */
|
||||
|
||||
set_containers(wr->root, wr->prevs[0], wr->node, prev_cont, node_cont);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr)
|
||||
__releases(RCU) /* :/ */
|
||||
{
|
||||
writer_unlock(wr);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
#ifndef _SCOUTFS_CWSKIP_H_
|
||||
#define _SCOUTFS_CWSKIP_H_
|
||||
|
||||
/* A billion seems like a lot. */
|
||||
#define SCOUTFS_CWSKIP_MAX_HEIGHT 30
|
||||
|
||||
struct scoutfs_cwskip_node {
|
||||
int height;
|
||||
unsigned int write_seq;
|
||||
struct scoutfs_cwskip_node *links[];
|
||||
};
|
||||
|
||||
#define SCOUTFS_CWSKIP_FULL_NODE_BYTES \
|
||||
offsetof(struct scoutfs_cwskip_node, links[SCOUTFS_CWSKIP_MAX_HEIGHT + 1])
|
||||
|
||||
typedef int (*scoutfs_cwskip_cmp_t)(void *K, void *C);
|
||||
|
||||
struct scoutfs_cwskip_root {
|
||||
scoutfs_cwskip_cmp_t cmp_fn;
|
||||
unsigned long node_off;
|
||||
union {
|
||||
struct scoutfs_cwskip_node node;
|
||||
__u8 __full_root_node[SCOUTFS_CWSKIP_FULL_NODE_BYTES];
|
||||
};
|
||||
};
|
||||
|
||||
struct scoutfs_cwskip_reader {
|
||||
struct scoutfs_cwskip_root *root;
|
||||
struct scoutfs_cwskip_node *prev;
|
||||
struct scoutfs_cwskip_node *node;
|
||||
unsigned int prev_seq;
|
||||
unsigned int node_seq;
|
||||
};
|
||||
|
||||
/*
|
||||
* The full height prevs array makes these pretty enormous :/.
|
||||
*/
|
||||
struct scoutfs_cwskip_writer {
|
||||
struct scoutfs_cwskip_root *root;
|
||||
bool node_locked;
|
||||
int locked_height;
|
||||
struct scoutfs_cwskip_node *node;
|
||||
struct scoutfs_cwskip_node *prevs[SCOUTFS_CWSKIP_MAX_HEIGHT];
|
||||
};
|
||||
|
||||
void scoutfs_cwskip_init_root(struct scoutfs_cwskip_root *root, scoutfs_cwskip_cmp_t cmp_fn,
|
||||
unsigned long node_off);
|
||||
bool scoutfs_cwskip_empty(struct scoutfs_cwskip_root *root);
|
||||
int scoutfs_cwskip_rand_height(void);
|
||||
|
||||
void scoutfs_cwskip_read_begin(struct scoutfs_cwskip_root *root, void *key, void **prev_cont,
|
||||
void **node_cont, int *node_cmp, struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_valid(struct scoutfs_cwskip_reader *rd);
|
||||
bool scoutfs_cwskip_read_next(struct scoutfs_cwskip_reader *rd, void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_read_end(struct scoutfs_cwskip_reader *rd);
|
||||
|
||||
void scoutfs_cwskip_write_begin(struct scoutfs_cwskip_root *root, void *key, int lock_height,
|
||||
void **prev_cont, void **node_cont, int *node_cmp,
|
||||
struct scoutfs_cwskip_writer *wr);
|
||||
void scoutfs_cwskip_write_insert(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *ins);
|
||||
void scoutfs_cwskip_write_remove(struct scoutfs_cwskip_writer *wr,
|
||||
struct scoutfs_cwskip_node *node);
|
||||
bool scoutfs_cwskip_write_next(struct scoutfs_cwskip_writer *wr, int lock_height,
|
||||
void **prev_cont, void **node_cont);
|
||||
void scoutfs_cwskip_write_end(struct scoutfs_cwskip_writer *wr);
|
||||
|
||||
#endif
|
||||
+26
-2
@@ -1615,8 +1615,9 @@ static int verify_ancestors(struct super_block *sb, u64 p1, u64 p2,
|
||||
* from using parent/child locking orders as two groups can have both
|
||||
* parent and child relationships to each other.
|
||||
*/
|
||||
static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
struct inode *new_dir, struct dentry *new_dentry)
|
||||
static int scoutfs_rename_common(struct inode *old_dir,
|
||||
struct dentry *old_dentry, struct inode *new_dir,
|
||||
struct dentry *new_dentry, unsigned int flags)
|
||||
{
|
||||
struct super_block *sb = old_dir->i_sb;
|
||||
struct inode *old_inode = old_dentry->d_inode;
|
||||
@@ -1688,6 +1689,11 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
if ((flags & RENAME_NOREPLACE) && (new_inode != NULL)) {
|
||||
ret = -EEXIST;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (should_orphan(new_inode)) {
|
||||
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
|
||||
&orph_lock);
|
||||
@@ -1870,6 +1876,23 @@ out_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Rename entry point that takes no flags argument: forwards to the
 * shared implementation with no rename flags set.
 */
static int scoutfs_rename(struct inode *old_dir,
			  struct dentry *old_dentry, struct inode *new_dir,
			  struct dentry *new_dentry)
{
	const unsigned int no_flags = 0;

	return scoutfs_rename_common(old_dir, old_dentry, new_dir, new_dentry,
				     no_flags);
}
|
||||
|
||||
static int scoutfs_rename2(struct inode *old_dir,
|
||||
struct dentry *old_dentry, struct inode *new_dir,
|
||||
struct dentry *new_dentry, unsigned int flags)
|
||||
{
|
||||
if (flags & ~RENAME_NOREPLACE)
|
||||
return -EINVAL;
|
||||
|
||||
return scoutfs_rename_common(old_dir, old_dentry, new_dir, new_dentry, flags);
|
||||
}
|
||||
|
||||
#ifdef KC_FMODE_KABI_ITERATE
|
||||
/* we only need this to set the iterate flag for kabi :/ */
|
||||
static int scoutfs_dir_open(struct inode *inode, struct file *file)
|
||||
@@ -1960,6 +1983,7 @@ const struct inode_operations_wrapper scoutfs_dir_iops = {
|
||||
.permission = scoutfs_permission,
|
||||
},
|
||||
.tmpfile = scoutfs_tmpfile,
|
||||
.rename2 = scoutfs_rename2,
|
||||
};
|
||||
|
||||
void scoutfs_dir_exit(void)
|
||||
|
||||
+3
-3
@@ -494,13 +494,13 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst)
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg)
|
||||
{
|
||||
DECLARE_FOREST_INFO(sb, finf);
|
||||
|
||||
return scoutfs_btree_insert_list(sb, finf->alloc, finf->wri,
|
||||
&finf->our_log.item_root, lst);
|
||||
&finf->our_log.item_root, cb, pos, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+2
-2
@@ -29,8 +29,8 @@ void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
|
||||
int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
struct scoutfs_super_block *super,
|
||||
u64 *seq);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb,
|
||||
struct scoutfs_btree_item_list *lst);
|
||||
int scoutfs_forest_insert_list(struct super_block *sb, scoutfs_btree_item_iter_cb cb,
|
||||
void *pos, void *arg);
|
||||
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
|
||||
|
||||
void scoutfs_forest_inc_inode_count(struct super_block *sb);
|
||||
|
||||
+1101
-1886
File diff suppressed because it is too large
Load Diff
+1
-1
@@ -26,7 +26,7 @@ int scoutfs_item_delete_force(struct super_block *sb,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
u64 scoutfs_item_dirty_pages(struct super_block *sb);
|
||||
u64 scoutfs_item_dirty_bytes(struct super_block *sb);
|
||||
int scoutfs_item_write_dirty(struct super_block *sb);
|
||||
int scoutfs_item_write_done(struct super_block *sb);
|
||||
bool scoutfs_item_range_cached(struct super_block *sb,
|
||||
|
||||
+27
-9
@@ -835,17 +835,9 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
|
||||
if (conn->listening_conn && conn->notify_down)
|
||||
conn->notify_down(sb, conn, conn->info, conn->rid);
|
||||
|
||||
/*
|
||||
* Usually networking is idle and we destroy pending sends, but when forcing unmount
|
||||
* we can have to wake up waiters by failing pending sends.
|
||||
*/
|
||||
list_splice_init(&conn->resend_queue, &conn->send_queue);
|
||||
list_for_each_entry_safe(msend, tmp, &conn->send_queue, head) {
|
||||
if (scoutfs_forcing_unmount(sb))
|
||||
call_resp_func(sb, conn, msend->resp_func, msend->resp_data,
|
||||
NULL, 0, -ECONNABORTED);
|
||||
list_for_each_entry_safe(msend, tmp, &conn->send_queue, head)
|
||||
free_msend(ninf, msend);
|
||||
}
|
||||
|
||||
/* accepted sockets are removed from their listener's list */
|
||||
if (conn->listening_conn) {
|
||||
@@ -1134,9 +1126,11 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
|
||||
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
|
||||
struct scoutfs_net_connection *listener;
|
||||
struct scoutfs_net_connection *acc_conn;
|
||||
scoutfs_net_response_t resp_func;
|
||||
struct message_send *msend;
|
||||
struct message_send *tmp;
|
||||
unsigned long delay;
|
||||
void *resp_data;
|
||||
|
||||
trace_scoutfs_net_shutdown_work_enter(sb, 0, 0);
|
||||
trace_scoutfs_conn_shutdown_start(conn);
|
||||
@@ -1182,6 +1176,30 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
|
||||
/* and wait for accepted conn shutdown work to finish */
|
||||
wait_event(conn->waitq, empty_accepted_list(conn));
|
||||
|
||||
/*
|
||||
* Forced unmount will cause net submit to fail once it's
|
||||
* started and it calls shutdown to interrupt any previous
|
||||
* senders waiting for a response. The response callbacks can
|
||||
* do quite a lot of work so we're careful to call them outside
|
||||
* the lock.
|
||||
*/
|
||||
if (scoutfs_forcing_unmount(sb)) {
|
||||
spin_lock(&conn->lock);
|
||||
list_splice_tail_init(&conn->send_queue, &conn->resend_queue);
|
||||
while ((msend = list_first_entry_or_null(&conn->resend_queue,
|
||||
struct message_send, head))) {
|
||||
resp_func = msend->resp_func;
|
||||
resp_data = msend->resp_data;
|
||||
free_msend(ninf, msend);
|
||||
spin_unlock(&conn->lock);
|
||||
|
||||
call_resp_func(sb, conn, resp_func, resp_data, NULL, 0, -ECONNABORTED);
|
||||
|
||||
spin_lock(&conn->lock);
|
||||
}
|
||||
spin_unlock(&conn->lock);
|
||||
}
|
||||
|
||||
spin_lock(&conn->lock);
|
||||
|
||||
/* greetings aren't resent across sockets */
|
||||
|
||||
@@ -403,24 +403,24 @@ TRACE_EVENT(scoutfs_sync_fs,
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_write_func,
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages),
|
||||
TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_bytes),
|
||||
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_pages),
|
||||
TP_ARGS(sb, dirty_block_bytes, dirty_item_bytes),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, dirty_block_bytes)
|
||||
__field(__u64, dirty_item_pages)
|
||||
__field(__u64, dirty_item_bytes)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->dirty_block_bytes = dirty_block_bytes;
|
||||
__entry->dirty_item_pages = dirty_item_pages;
|
||||
__entry->dirty_item_bytes = dirty_item_bytes;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages)
|
||||
TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_bytes %llu",
|
||||
SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_bytes)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
|
||||
|
||||
+31
-6
@@ -2068,6 +2068,19 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
break;
|
||||
}
|
||||
|
||||
/* Dirty the btree before freeing so that we can pin it
|
||||
* so that later touches will succeed.
|
||||
*/
|
||||
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE,
|
||||
le64_to_cpu(fr.seq), 0);
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc,
|
||||
&server->wri, &super->log_merge,
|
||||
&key);
|
||||
if (ret < 0) {
|
||||
err_str = "dirtying log btree";
|
||||
break;
|
||||
}
|
||||
|
||||
ret = scoutfs_btree_free_blocks(sb, &server->alloc,
|
||||
&server->wri, &fr.key,
|
||||
&fr.root, 10);
|
||||
@@ -2077,8 +2090,6 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
}
|
||||
|
||||
/* freed blocks are in allocator, we *have* to update key */
|
||||
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_FREEING_ZONE,
|
||||
le64_to_cpu(fr.seq), 0);
|
||||
if (scoutfs_key_is_ones(&fr.key))
|
||||
ret = scoutfs_btree_delete(sb, &server->alloc,
|
||||
&server->wri,
|
||||
@@ -2415,7 +2426,9 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_key key;
|
||||
char *err_str = NULL;
|
||||
int ret;
|
||||
bool deleted = false;
|
||||
int ret = 0;
|
||||
int err = 0;
|
||||
|
||||
scoutfs_key_set_zeros(&rng.end);
|
||||
|
||||
@@ -2463,6 +2476,7 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
err_str = "deleting orig request";
|
||||
goto out;
|
||||
}
|
||||
deleted = true;
|
||||
|
||||
if (le64_to_cpu(comp->flags) & SCOUTFS_LOG_MERGE_COMP_ERROR) {
|
||||
/* restore the range and reclaim the allocator if it failed */
|
||||
@@ -2522,8 +2536,11 @@ out:
|
||||
if (ret < 0)
|
||||
scoutfs_err(sb, "error %d committing log merge: %s", ret, err_str);
|
||||
|
||||
ret = scoutfs_server_apply_commit(sb, ret);
|
||||
BUG_ON(ret < 0); /* inconsistent */
|
||||
err = scoutfs_server_apply_commit(sb, ret);
|
||||
BUG_ON(ret < 0 && deleted); /* inconsistent */
|
||||
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
|
||||
}
|
||||
@@ -3812,6 +3829,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
struct scoutfs_net_connection *conn = NULL;
|
||||
DECLARE_WAIT_QUEUE_HEAD(waitq);
|
||||
struct sockaddr_in sin;
|
||||
bool alloc_init = false;
|
||||
u64 max_seq;
|
||||
int ret;
|
||||
|
||||
@@ -3820,6 +3838,8 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
|
||||
/* first make sure no other servers are still running */
|
||||
ret = scoutfs_quorum_fence_leaders(sb, server->term);
|
||||
if (ret < 0) {
|
||||
@@ -3859,7 +3879,6 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
|
||||
set_roots(server, &super->fs_root, &super->logs_root,
|
||||
&super->srch_root);
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
|
||||
/* prepare server alloc for this transaction, larger first */
|
||||
if (le64_to_cpu(super->server_meta_avail[0].total_nr) <
|
||||
@@ -3870,6 +3889,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
scoutfs_alloc_init(&server->alloc,
|
||||
&super->server_meta_avail[server->other_ind ^ 1],
|
||||
&super->server_meta_freed[server->other_ind ^ 1]);
|
||||
alloc_init = true;
|
||||
server->other_avail = &super->server_meta_avail[server->other_ind];
|
||||
server->other_freed = &super->server_meta_freed[server->other_ind];
|
||||
|
||||
@@ -3931,6 +3951,11 @@ shutdown:
|
||||
/* wait for extra queues by requests, won't find waiters */
|
||||
flush_work(&server->commit_work);
|
||||
|
||||
if (alloc_init)
|
||||
scoutfs_alloc_prepare_commit(sb, &server->alloc, &server->wri);
|
||||
|
||||
scoutfs_block_writer_forget_all(sb, &server->wri);
|
||||
|
||||
scoutfs_lock_server_destroy(sb);
|
||||
scoutfs_omap_server_shutdown(sb);
|
||||
|
||||
|
||||
@@ -271,6 +271,8 @@ static void scoutfs_umount_begin(struct super_block *sb)
|
||||
|
||||
scoutfs_warn(sb, "forcing unmount, can return errors and lose unsynced data");
|
||||
sbi->forced_unmount = true;
|
||||
|
||||
scoutfs_client_net_shutdown(sb);
|
||||
}
|
||||
|
||||
static const struct super_operations scoutfs_super_ops = {
|
||||
|
||||
+8
-6
@@ -207,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
|
||||
}
|
||||
|
||||
trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri),
|
||||
scoutfs_item_dirty_pages(sb));
|
||||
scoutfs_item_dirty_bytes(sb));
|
||||
|
||||
if (tri->deadline_expired)
|
||||
scoutfs_inc_counter(sb, trans_commit_timer);
|
||||
@@ -422,16 +422,18 @@ static void release_holders(struct super_block *sb)
|
||||
*/
|
||||
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
|
||||
{
|
||||
u64 dirty_blocks = (scoutfs_item_dirty_bytes(sb) >> SCOUTFS_BLOCK_LG_SHIFT) + 1;
|
||||
|
||||
/*
|
||||
* In theory each dirty item page could be straddling two full
|
||||
* blocks, requiring 4 allocations for each item cache page.
|
||||
* That's much too conservative, typically many dirty item cache
|
||||
* pages that are near each other all land in one block. This
|
||||
* In theory each dirty item could be added to a full block that
|
||||
* has to split, requiring 2 meta block allocs for each dirty
|
||||
* item. That's much too conservative, typically many dirty
|
||||
* items that are near each other all land in one block. This
|
||||
* rough estimate is still so far beyond what typically happens
|
||||
* that it accounts for having to dirty parent blocks and
|
||||
* whatever dirtying is done during the transaction hold.
|
||||
*/
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
|
||||
if (scoutfs_alloc_meta_low(sb, &tri->alloc, dirty_blocks * 4)) {
|
||||
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
src/*.d
|
||||
src/createmany
|
||||
src/dumb_renameat2
|
||||
src/dumb_setxattr
|
||||
src/handle_cat
|
||||
src/bulk_create_paths
|
||||
|
||||
@@ -3,6 +3,7 @@ SHELL := /usr/bin/bash
|
||||
|
||||
# each binary command is built from a single .c file
|
||||
BIN := src/createmany \
|
||||
src/dumb_renameat2 \
|
||||
src/dumb_setxattr \
|
||||
src/handle_cat \
|
||||
src/bulk_create_paths \
|
||||
|
||||
@@ -72,6 +72,12 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .* error reading quorum block"
|
||||
re="$re|scoutfs .* error .* writing quorum block"
|
||||
re="$re|scoutfs .* error .* while checking to delete inode"
|
||||
re="$re|scoutfs .* error .*writing btree blocks.*"
|
||||
re="$re|scoutfs .* error .*writing super block.*"
|
||||
re="$re|scoutfs .* error .* freeing merged btree blocks.*.looping commit del.*upd freeing item"
|
||||
re="$re|scoutfs .* error .* freeing merged btree blocks.*.final commit del.upd freeing item"
|
||||
re="$re|scoutfs .* error .*reading quorum block.*to update event.*"
|
||||
re="$re|scoutfs .* error.*server failed to bind to.*"
|
||||
|
||||
egrep -v "($re)"
|
||||
}
|
||||
|
||||
@@ -1,52 +1,2 @@
|
||||
== create shared test file
|
||||
== set and get xattrs between mount pairs while retrying
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="1"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="2"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="3"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="4"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="5"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="6"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="7"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="8"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="9"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
counter block_cache_remove_stale changed
|
||||
# file: /mnt/test/test/block-stale-reads/file
|
||||
user.xat="10"
|
||||
|
||||
counter block_cache_remove_stale changed
|
||||
== Issue scoutfs df to force block reads to trigger stale invalidation/retry
|
||||
counter block_cache_remove_stale changed
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
=== renameat2 noreplace flag test
|
||||
=== run two asynchronous calls to renameat2 NOREPLACE
|
||||
@@ -9,6 +9,8 @@ generic/011
|
||||
generic/013
|
||||
generic/014
|
||||
generic/020
|
||||
generic/023
|
||||
generic/024
|
||||
generic/028
|
||||
generic/032
|
||||
generic/034
|
||||
@@ -82,6 +84,7 @@ generic/016
|
||||
generic/018
|
||||
generic/021
|
||||
generic/022
|
||||
generic/025
|
||||
generic/026
|
||||
generic/031
|
||||
generic/033
|
||||
@@ -93,6 +96,7 @@ generic/060
|
||||
generic/061
|
||||
generic/063
|
||||
generic/064
|
||||
generic/078
|
||||
generic/079
|
||||
generic/081
|
||||
generic/082
|
||||
@@ -278,4 +282,4 @@ shared/004
|
||||
shared/032
|
||||
shared/051
|
||||
shared/289
|
||||
Passed all 73 tests
|
||||
Passed all 75 tests
|
||||
|
||||
@@ -37,4 +37,5 @@ createmany-parallel-mounts.sh
|
||||
archive-light-cycle.sh
|
||||
block-stale-reads.sh
|
||||
inode-deletion.sh
|
||||
renameat2-noreplace.sh
|
||||
xfstests.sh
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#ifndef RENAMEAT2_EXIST
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
#if !defined(SYS_renameat2) && defined(__x86_64__)
|
||||
#define SYS_renameat2 316 /* from arch/x86/entry/syscalls/syscall_64.tbl */
|
||||
#endif
|
||||
|
||||
/*
 * Local renameat2() wrapper for builds without a libc one.  Issues the
 * raw system call when its number is known at build time, otherwise
 * fails with ENOSYS.
 */
static int renameat2(int olddfd, const char *old_dir,
		     int newdfd, const char *new_dir,
		     unsigned int flags)
{
#ifndef SYS_renameat2
	errno = ENOSYS;
	return -1;
#else
	return syscall(SYS_renameat2, olddfd, old_dir, newdfd, new_dir, flags);
#endif
}
|
||||
#endif
|
||||
|
||||
#ifndef RENAME_NOREPLACE
|
||||
#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite newpath of rename */
|
||||
#endif
|
||||
#ifndef RENAME_EXCHANGE
|
||||
#define RENAME_EXCHANGE (1 << 1) /* Exchange oldpath and newpath */
|
||||
#endif
|
||||
#ifndef RENAME_WHITEOUT
|
||||
#define RENAME_WHITEOUT (1 << 2) /* Whiteout oldpath */
|
||||
#endif
|
||||
|
||||
/* Print the command usage to stderr and exit with status 1. */
static void exit_usage(char **argv)
{
	const char *prog = argv[0];

	fprintf(stderr,
		"usage: %s [-n|-x|-w] old_path new_path\n"
		"  -n  noreplace\n"
		"  -x  exchange\n"
		"  -w  whiteout\n", prog);

	exit(1);
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
const char *old_path = NULL;
|
||||
const char *new_path = NULL;
|
||||
unsigned int flags = 0;
|
||||
int ret;
|
||||
int c;
|
||||
|
||||
for (c = 1; c < argc; c++) {
|
||||
if (argv[c][0] == '-') {
|
||||
switch (argv[c][1]) {
|
||||
case 'n':
|
||||
flags |= RENAME_NOREPLACE;
|
||||
break;
|
||||
case 'x':
|
||||
flags |= RENAME_EXCHANGE;
|
||||
break;
|
||||
case 'w':
|
||||
flags |= RENAME_WHITEOUT;
|
||||
break;
|
||||
default:
|
||||
exit_usage(argv);
|
||||
}
|
||||
} else if (!old_path) {
|
||||
old_path = argv[c];
|
||||
} else if (!new_path) {
|
||||
new_path = argv[c];
|
||||
} else {
|
||||
exit_usage(argv);
|
||||
}
|
||||
}
|
||||
|
||||
if (!old_path || !new_path) {
|
||||
printf("specify the correct directory path\n");
|
||||
errno = ENOENT;
|
||||
return 1;
|
||||
}
|
||||
|
||||
ret = renameat2(AT_FDCWD, old_path, AT_FDCWD, new_path, flags);
|
||||
if (ret == -1) {
|
||||
perror("Error");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -5,57 +5,18 @@
|
||||
# persistent blocks to create stable block reading scenarios. Instead
|
||||
# we use triggers to exercise how readers encounter stale blocks.
|
||||
#
|
||||
# Trigger retries in the block cache by calling scoutfs df
|
||||
# which in turn will call scoutfs_ioctl_alloc_detail. This
|
||||
# is guaranteed to exist, which will force block cache reads.
|
||||
|
||||
t_require_commands touch setfattr getfattr
|
||||
echo "== Issue scoutfs df to force block reads to trigger stale invalidation/retry"
|
||||
nr=0
|
||||
|
||||
inc_wrap_fs_nr()
|
||||
{
|
||||
local nr="$(($1 + 1))"
|
||||
old=$(t_counter block_cache_remove_stale $nr)
|
||||
t_trigger_arm_silent block_remove_stale $nr
|
||||
|
||||
if [ "$nr" == "$T_NR_MOUNTS" ]; then
|
||||
nr=0
|
||||
fi
|
||||
scoutfs df -p "$T_M0" > /dev/null
|
||||
|
||||
echo $nr
|
||||
}
|
||||
|
||||
GETFATTR="getfattr --absolute-names"
|
||||
SETFATTR="setfattr"
|
||||
|
||||
echo "== create shared test file"
|
||||
touch "$T_D0/file"
|
||||
$SETFATTR -n user.xat -v 0 "$T_D0/file"
|
||||
|
||||
#
|
||||
# Trigger retries in the block cache as we bounce xattr values around
|
||||
# between sequential pairs of mounts. This is a little silly because if
|
||||
# either of the mounts are the server then they'll almost certaily have
|
||||
# their trigger fired prematurely by message handling btree calls while
|
||||
# working with the t_ helpers long before we work with the xattrs. But
|
||||
# the block cache stale retry path is still being exercised.
|
||||
#
|
||||
echo "== set and get xattrs between mount pairs while retrying"
|
||||
set_nr=0
|
||||
get_nr=$(inc_wrap_fs_nr $set_nr)
|
||||
|
||||
for i in $(seq 1 10); do
|
||||
eval set_file="\$T_D${set_nr}/file"
|
||||
eval get_file="\$T_D${get_nr}/file"
|
||||
|
||||
old_set=$(t_counter block_cache_remove_stale $set_nr)
|
||||
old_get=$(t_counter block_cache_remove_stale $get_nr)
|
||||
|
||||
t_trigger_arm_silent block_remove_stale $set_nr
|
||||
t_trigger_arm_silent block_remove_stale $get_nr
|
||||
|
||||
$SETFATTR -n user.xat -v $i "$set_file"
|
||||
$GETFATTR -n user.xat "$get_file" 2>&1 | t_filter_fs
|
||||
|
||||
t_counter_diff_changed block_cache_remove_stale $old_set $set_nr
|
||||
t_counter_diff_changed block_cache_remove_stale $old_get $get_nr
|
||||
|
||||
set_nr="$get_nr"
|
||||
get_nr=$(inc_wrap_fs_nr $set_nr)
|
||||
done
|
||||
t_counter_diff_changed block_cache_remove_stale $old $nr
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
#
|
||||
# simple renameat2 NOREPLACE unit test
|
||||
#
|
||||
|
||||
t_require_commands dumb_renameat2
t_require_mounts 2

echo "=== renameat2 noreplace flag test"

# each mount creates its source file in its own directory (lock group)
# so that preparing the inputs doesn't contend
mkdir $T_M0/dir0
mkdir $T_M1/dir1

echo "=== run two asynchronous calls to renameat2 NOREPLACE"
for i in $(seq 0 100); do
	# set up a fresh source file through each mount
	touch "$T_M0/dir0/old0"
	touch "$T_M1/dir1/old1"

	# both mounts race to rename onto the same dir0/sharednew target;
	# with NOREPLACE at most one of them may win
	dumb_renameat2 -n "$T_M0/dir0/old0" "$T_M0/dir0/sharednew" 2> /dev/null &
	pid0=$!
	dumb_renameat2 -n "$T_M1/dir1/old1" "$T_M1/dir0/sharednew" 2> /dev/null &
	pid1=$!

	# collect each background job's exit status
	wait $pid0
	rc0=$?
	wait $pid1
	rc1=$?

	if [ "$rc0" == 0 ] && [ "$rc1" == 0 ]; then
		t_fail "both renames succeeded"
	fi

	# remove whatever is left over from either outcome of the race
	rm -f "$T_M0/dir0/old0" "$T_M1/dir1/old1" "$T_M0/dir0/sharednew" "$T_M1/dir1/sharednew"
done

t_pass
|
||||
@@ -60,13 +60,9 @@ EOF
|
||||
|
||||
cat << EOF > local.exclude
|
||||
generic/003 # missing atime update in buffered read
|
||||
generic/023 # renameat2 not implemented
|
||||
generic/024 # renameat2 not implemented
|
||||
generic/025 # renameat2 not implemented
|
||||
generic/029 # mmap missing
|
||||
generic/030 # mmap missing
|
||||
generic/075 # file content mismatch failures (fds, etc)
|
||||
generic/078 # renameat2 not implemented
|
||||
generic/080 # mmap missing
|
||||
generic/103 # enospc causes trans commit failures
|
||||
generic/105 # needs trigage: something about acls
|
||||
|
||||
@@ -42,6 +42,40 @@ the super blocks on both devices.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "change-quorum-config {-Q|--quorum-slot} NR,ADDR,PORT [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.sp
|
||||
Change the quorum configuration for an existing file system. The new
|
||||
configuration completely replaces the old configuration. Any slots
|
||||
from the old configuration that should be retained must be described
|
||||
with arguments in the new configuration.
|
||||
.sp
|
||||
Currently the configuration may only be changed offline.
|
||||
.sp
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.B "-Q, --quorum-slot NR,ADDR,PORT"
|
||||
The quorum configuration is built by specifying configured slots with
|
||||
multiple arguments as described in the
|
||||
.B mkfs
|
||||
command.
|
||||
.TP
|
||||
.B "-F, --offline META-DEVICE"
|
||||
Perform the change offline by updating the superblock in the metadata
|
||||
device. The command will read the super block and refuse to make the
|
||||
change if it sees any evidence that the metadata device is currently in
|
||||
use. The file system must be successfully unmounted after possibly
|
||||
recovering any previously unresolved mounts for the change to be
|
||||
successful. After the change succeeds the newly configured slots can
|
||||
be used by mounts.
|
||||
.sp
|
||||
The offline change directly reads from and writes to the device and does
|
||||
not protect against concurrent use of the device. It must be carefully
|
||||
run when the file system will not be mounted.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "df [-h|--human-readable] [-p|--path PATH]"
|
||||
.sp
|
||||
|
||||
@@ -45,16 +45,11 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args)
|
||||
{
|
||||
struct scoutfs_super_block *meta_super = NULL;
|
||||
struct scoutfs_super_block *data_super = NULL;
|
||||
struct scoutfs_quorum_block *qblk = NULL;
|
||||
struct scoutfs_quorum_block_event *beg;
|
||||
struct scoutfs_quorum_block_event *end;
|
||||
bool wrote_meta = false;
|
||||
bool in_use = false;
|
||||
char uuid_str[37];
|
||||
int meta_fd = -1;
|
||||
int data_fd = -1;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
meta_fd = open(args->meta_device, O_DIRECT | O_SYNC | O_RDWR | O_EXCL);
|
||||
if (meta_fd < 0) {
|
||||
@@ -117,44 +112,13 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (meta_super->mounted_clients.ref.blkno != 0) {
|
||||
fprintf(stderr, "meta superblock mounted clients btree is not empty.\n");
|
||||
ret = -EBUSY;
|
||||
in_use = true;
|
||||
ret = meta_super_in_use(meta_fd, meta_super);
|
||||
if (ret < 0) {
|
||||
if (ret == -EBUSY)
|
||||
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to change the format version\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* check for active quorum slots */
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
|
||||
if (!quorum_slot_present(meta_super, i))
|
||||
continue;
|
||||
ret = read_block(meta_fd, SCOUTFS_QUORUM_BLKNO + i, SCOUTFS_BLOCK_SM_SHIFT,
|
||||
(void **)&qblk);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error reading quorum block for slot %u\n", i);
|
||||
goto out;
|
||||
}
|
||||
|
||||
beg = &qblk->events[SCOUTFS_QUORUM_EVENT_BEGIN];
|
||||
end = &qblk->events[SCOUTFS_QUORUM_EVENT_END];
|
||||
|
||||
if (le64_to_cpu(beg->write_nr) > le64_to_cpu(end->write_nr)) {
|
||||
fprintf(stderr, "mount in quorum slot %u could still be running.\n"
|
||||
" begin event: write_nr %llu timestamp %llu.%08u\n"
|
||||
" end event: write_nr %llu timestamp %llu.%08u\n",
|
||||
i, le64_to_cpu(beg->write_nr), le64_to_cpu(beg->ts.sec),
|
||||
le32_to_cpu(beg->ts.nsec),
|
||||
le64_to_cpu(end->write_nr), le64_to_cpu(end->ts.sec),
|
||||
le32_to_cpu(end->ts.nsec));
|
||||
ret = -EBUSY;
|
||||
in_use = true;
|
||||
goto out;
|
||||
}
|
||||
|
||||
free(qblk);
|
||||
qblk = NULL;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(meta_super->fmt_vers) != args->fmt_vers) {
|
||||
meta_super->fmt_vers = cpu_to_le64(args->fmt_vers);
|
||||
|
||||
@@ -195,11 +159,7 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args)
|
||||
le64_to_cpu(meta_super->fmt_vers));
|
||||
|
||||
out:
|
||||
if (in_use)
|
||||
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to change the format version\n");
|
||||
|
||||
if (qblk)
|
||||
free(qblk);
|
||||
if (meta_super)
|
||||
free(meta_super);
|
||||
if (data_super)
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <uuid/uuid.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <assert.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <inttypes.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "cmd.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "parse.h"
|
||||
#include "dev.h"
|
||||
#include "quorum.h"
|
||||
|
||||
struct change_quorum_args {
|
||||
char *meta_device;
|
||||
bool offline;
|
||||
int nr_slots;
|
||||
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
};
|
||||
|
||||
static int do_change_quorum(struct change_quorum_args *args)
|
||||
{
|
||||
struct scoutfs_super_block *meta_super = NULL;
|
||||
char uuid_str[37];
|
||||
int meta_fd = -1;
|
||||
int ret;
|
||||
|
||||
meta_fd = open(args->meta_device, O_DIRECT | O_SYNC | O_RDWR | O_EXCL);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = read_block_verify(meta_fd, SCOUTFS_BLOCK_MAGIC_SUPER, 0, SCOUTFS_SUPER_BLKNO,
|
||||
SCOUTFS_BLOCK_SM_SHIFT, (void **)&meta_super);
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to read meta super block: %s (%d)\n",
|
||||
strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = meta_super_in_use(meta_fd, meta_super);
|
||||
if (ret < 0) {
|
||||
if (ret == -EBUSY)
|
||||
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to change the quorum config\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
assert(sizeof(meta_super->qconf.slots) == sizeof(args->slots));
|
||||
memcpy(meta_super->qconf.slots, args->slots, sizeof(meta_super->qconf.slots));
|
||||
le64_add_cpu(&meta_super->qconf.version, 1);
|
||||
|
||||
ret = write_block(meta_fd, SCOUTFS_BLOCK_MAGIC_SUPER, meta_super->hdr.fsid, 1,
|
||||
SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT, &meta_super->hdr);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
uuid_unparse(meta_super->uuid, uuid_str);
|
||||
|
||||
printf("Successfully changed quorum config for scoutfs filesystem:\n"
|
||||
" meta device path: %s\n"
|
||||
" fsid: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" quorum config version: %llu\n"
|
||||
" quorum slots: ",
|
||||
args->meta_device,
|
||||
le64_to_cpu(meta_super->hdr.fsid),
|
||||
uuid_str,
|
||||
le64_to_cpu(meta_super->qconf.version));
|
||||
|
||||
print_quorum_slots(meta_super->qconf.slots, array_size(meta_super->qconf.slots),
|
||||
" ");
|
||||
|
||||
out:
|
||||
|
||||
if (meta_super)
|
||||
free(meta_super);
|
||||
if (meta_fd != -1)
|
||||
close(meta_fd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct change_quorum_args *args = state->input;
|
||||
struct scoutfs_quorum_slot slot;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'F':
|
||||
args->offline = true;
|
||||
break;
|
||||
case 'Q':
|
||||
ret = parse_quorum_slot(&slot, arg);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (args->slots[ret].addr.v4.family != cpu_to_le16(SCOUTFS_AF_NONE))
|
||||
argp_error(state, "Quorum slot %u already specified before slot '%s'\n",
|
||||
ret, arg);
|
||||
args->slots[ret] = slot;
|
||||
args->nr_slots++;
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than one metadata device argument given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->offline)
|
||||
argp_error(state, "must specify --offline");
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
if (!args->nr_slots)
|
||||
argp_error(state, "must specify at least one quorum slot with --quorum-slot|-Q");
|
||||
if (!valid_quorum_slots(args->slots))
|
||||
argp_error(state, "invalid quorum slot configuration");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Options accepted by change-quorum-config.  The array is terminated
 * by an entry whose fields are all zero.
 */
static struct argp_option options[] = {
	{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
	/* the help text describes this command, not change-format-version */
	{ "offline", 'F', NULL, 0, "Change quorum config in offline metadata device super block [Currently Required]"},
	{ NULL }
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"",
|
||||
"Change quorum slots and addresses of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
static int change_quorum_cmd(int argc, char *argv[])
|
||||
{
|
||||
struct change_quorum_args change_quorum_args = {
|
||||
.offline = false,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &change_quorum_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_change_quorum(&change_quorum_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) change_quorum_ctor(void)
|
||||
{
|
||||
cmd_register_argp("change-quorum-config", &argp, GROUP_CORE, change_quorum_cmd);
|
||||
}
|
||||
+16
-64
@@ -31,6 +31,7 @@
|
||||
#include "btree.h"
|
||||
#include "leaf_item_hash.h"
|
||||
#include "blkid.h"
|
||||
#include "quorum.h"
|
||||
|
||||
|
||||
/*
|
||||
@@ -139,7 +140,6 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
int data_fd = -1;
|
||||
char uuid_str[37];
|
||||
void *zeros = NULL;
|
||||
char *indent;
|
||||
u64 blkno;
|
||||
u64 meta_size;
|
||||
u64 data_size;
|
||||
@@ -224,6 +224,7 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
assert(sizeof(args->slots) ==
|
||||
member_sizeof(struct scoutfs_super_block, qconf.slots));
|
||||
memcpy(super->qconf.slots, args->slots, sizeof(args->slots));
|
||||
super->qconf.version = cpu_to_le64(1);
|
||||
|
||||
if (invalid_data_alloc_zone_blocks(le64_to_cpu(super->total_data_blocks),
|
||||
args->data_alloc_zone_blocks)) {
|
||||
@@ -350,14 +351,15 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
uuid_unparse(super->uuid, uuid_str);
|
||||
|
||||
printf("Created scoutfs filesystem:\n"
|
||||
" meta device path: %s\n"
|
||||
" data device path: %s\n"
|
||||
" fsid: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" format version: %llu\n"
|
||||
" 64KB metadata blocks: "SIZE_FMT"\n"
|
||||
" 4KB data blocks: "SIZE_FMT"\n"
|
||||
" quorum slots: ",
|
||||
" meta device path: %s\n"
|
||||
" data device path: %s\n"
|
||||
" fsid: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" format version: %llu\n"
|
||||
" 64KB metadata blocks: "SIZE_FMT"\n"
|
||||
" 4KB data blocks: "SIZE_FMT"\n"
|
||||
" quorum config version: %llu\n"
|
||||
" quorum slots: ",
|
||||
args->meta_device,
|
||||
args->data_device,
|
||||
le64_to_cpu(super->hdr.fsid),
|
||||
@@ -366,22 +368,11 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
SIZE_ARGS(le64_to_cpu(super->total_meta_blocks),
|
||||
SCOUTFS_BLOCK_LG_SIZE),
|
||||
SIZE_ARGS(le64_to_cpu(super->total_data_blocks),
|
||||
SCOUTFS_BLOCK_SM_SIZE));
|
||||
SCOUTFS_BLOCK_SM_SIZE),
|
||||
le64_to_cpu(super->qconf.version));
|
||||
|
||||
indent = "";
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
struct scoutfs_quorum_slot *sl = &super->qconf.slots[i];
|
||||
struct in_addr in;
|
||||
|
||||
if (sl->addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
|
||||
continue;
|
||||
|
||||
in.s_addr = htonl(le32_to_cpu(sl->addr.v4.addr));
|
||||
printf("%s%u: %s:%u", indent,
|
||||
i, inet_ntoa(in), le16_to_cpu(sl->addr.v4.port));
|
||||
indent = "\n ";
|
||||
}
|
||||
printf("\n");
|
||||
print_quorum_slots(super->qconf.slots, array_size(super->qconf.slots),
|
||||
" ");
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
@@ -398,45 +389,6 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool valid_quorum_slots(struct scoutfs_quorum_slot *slots)
|
||||
{
|
||||
struct in_addr in;
|
||||
bool valid = true;
|
||||
char *addr;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_NONE))
|
||||
continue;
|
||||
|
||||
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4)) {
|
||||
fprintf(stderr, "quorum slot nr %u has invalid family %u\n",
|
||||
i, le16_to_cpu(slots[i].addr.v4.family));
|
||||
valid = false;
|
||||
}
|
||||
|
||||
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
|
||||
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
|
||||
continue;
|
||||
|
||||
if (slots[i].addr.v4.addr == slots[j].addr.v4.addr &&
|
||||
slots[i].addr.v4.port == slots[j].addr.v4.port) {
|
||||
|
||||
in.s_addr =
|
||||
htonl(le32_to_cpu(slots[i].addr.v4.addr));
|
||||
addr = inet_ntoa(in);
|
||||
fprintf(stderr, "quorum slot nr %u and %u have the same address %s:%u\n",
|
||||
i, j, addr,
|
||||
le16_to_cpu(slots[i].addr.v4.port));
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct mkfs_args *args = state->input;
|
||||
@@ -517,7 +469,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->nr_slots)
|
||||
argp_error(state, "must specify at least one quorum slot with --quorum-count|-Q");
|
||||
argp_error(state, "must specify at least one quorum slot with --quorum-slot|-Q");
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
if (!args->data_device)
|
||||
|
||||
+8
-4
@@ -922,10 +922,6 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
|
||||
uuid_unparse(super->uuid, uuid_str);
|
||||
|
||||
if (!(le64_to_cpu(super->flags) && SCOUTFS_FLAG_IS_META_BDEV))
|
||||
fprintf(stderr,
|
||||
"**** Printing metadata from a data device! Did you mean to do this? ****\n");
|
||||
|
||||
printf("super blkno %llu\n", blkno);
|
||||
print_block_header(&super->hdr, SCOUTFS_BLOCK_SM_SIZE);
|
||||
printf(" fmt_vers %llu uuid %s\n",
|
||||
@@ -1006,6 +1002,13 @@ static int print_volume(int fd)
|
||||
|
||||
print_super_block(super, SCOUTFS_SUPER_BLKNO);
|
||||
|
||||
if (!(le64_to_cpu(super->flags) & SCOUTFS_FLAG_IS_META_BDEV)) {
|
||||
fprintf(stderr,
|
||||
"**** Printing from data device is not allowed ****\n");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = print_quorum_blocks(fd, super);
|
||||
|
||||
err = print_btree(fd, super, "mounted_clients", &super->mounted_clients,
|
||||
@@ -1072,6 +1075,7 @@ static int print_volume(int fd)
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
out:
|
||||
free(super);
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
#include <stdio.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
@@ -8,3 +12,68 @@ bool quorum_slot_present(struct scoutfs_super_block *super, int i)
|
||||
{
|
||||
return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
|
||||
}
|
||||
|
||||
bool valid_quorum_slots(struct scoutfs_quorum_slot *slots)
|
||||
{
|
||||
struct in_addr in;
|
||||
bool valid = true;
|
||||
char *addr;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_NONE))
|
||||
continue;
|
||||
|
||||
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4)) {
|
||||
fprintf(stderr, "quorum slot nr %u has invalid family %u\n",
|
||||
i, le16_to_cpu(slots[i].addr.v4.family));
|
||||
valid = false;
|
||||
}
|
||||
|
||||
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
|
||||
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
|
||||
continue;
|
||||
|
||||
if (slots[i].addr.v4.addr == slots[j].addr.v4.addr &&
|
||||
slots[i].addr.v4.port == slots[j].addr.v4.port) {
|
||||
|
||||
in.s_addr =
|
||||
htonl(le32_to_cpu(slots[i].addr.v4.addr));
|
||||
addr = inet_ntoa(in);
|
||||
fprintf(stderr, "quorum slot nr %u and %u have the same address %s:%u\n",
|
||||
i, j, addr,
|
||||
le16_to_cpu(slots[i].addr.v4.port));
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Print quorum slots to stdout, a line at a time. The first line is
|
||||
* not indented and the rest of the lines use the indent string from the
|
||||
* caller.
|
||||
*/
|
||||
void print_quorum_slots(struct scoutfs_quorum_slot *slots, int nr, char *indent)
|
||||
{
|
||||
struct scoutfs_quorum_slot *sl;
|
||||
struct in_addr in;
|
||||
bool first = true;
|
||||
int i;
|
||||
|
||||
for (i = 0, sl = slots; i < SCOUTFS_QUORUM_MAX_SLOTS; i++, sl++) {
|
||||
|
||||
if (sl->addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
|
||||
continue;
|
||||
|
||||
in.s_addr = htonl(le32_to_cpu(sl->addr.v4.addr));
|
||||
printf("%s%u: %s:%u\n", first ? "" : indent,
|
||||
i, inet_ntoa(in), le16_to_cpu(sl->addr.v4.port));
|
||||
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,5 +4,7 @@
|
||||
#include <stdbool.h>
|
||||
|
||||
bool quorum_slot_present(struct scoutfs_super_block *super, int i);
|
||||
bool valid_quorum_slots(struct scoutfs_quorum_slot *slots);
|
||||
void print_quorum_slots(struct scoutfs_quorum_slot *slots, int nr, char *indent);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "crc.h"
|
||||
#include "quorum.h"
|
||||
|
||||
#define ENV_PATH "SCOUTFS_MOUNT_PATH"
|
||||
|
||||
@@ -201,3 +202,56 @@ int write_block_sync(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to see if the metadata super block indicates that there might
|
||||
* be active mounts using the system. Returns -errno, 0, or -EBUSY if
|
||||
* we found evidence that the device might be in use.
|
||||
*/
|
||||
int meta_super_in_use(int meta_fd, struct scoutfs_super_block *meta_super)
|
||||
{
|
||||
struct scoutfs_quorum_block *qblk = NULL;
|
||||
struct scoutfs_quorum_block_event *beg;
|
||||
struct scoutfs_quorum_block_event *end;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
if (meta_super->mounted_clients.ref.blkno != 0) {
|
||||
fprintf(stderr, "meta superblock mounted clients btree is not empty.\n");
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* check for active quorum slots */
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
|
||||
if (!quorum_slot_present(meta_super, i))
|
||||
continue;
|
||||
ret = read_block(meta_fd, SCOUTFS_QUORUM_BLKNO + i, SCOUTFS_BLOCK_SM_SHIFT,
|
||||
(void **)&qblk);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error reading quorum block for slot %u\n", i);
|
||||
goto out;
|
||||
}
|
||||
|
||||
beg = &qblk->events[SCOUTFS_QUORUM_EVENT_BEGIN];
|
||||
end = &qblk->events[SCOUTFS_QUORUM_EVENT_END];
|
||||
|
||||
if (le64_to_cpu(beg->write_nr) > le64_to_cpu(end->write_nr)) {
|
||||
fprintf(stderr, "mount in quorum slot %u could still be running.\n"
|
||||
" begin event: write_nr %llu timestamp %llu.%08u\n"
|
||||
" end event: write_nr %llu timestamp %llu.%08u\n",
|
||||
i, le64_to_cpu(beg->write_nr), le64_to_cpu(beg->ts.sec),
|
||||
le32_to_cpu(beg->ts.nsec),
|
||||
le64_to_cpu(end->write_nr), le64_to_cpu(end->ts.sec),
|
||||
le32_to_cpu(end->ts.nsec));
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
free(qblk);
|
||||
qblk = NULL;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -117,10 +117,12 @@ int read_block_crc(int fd, u64 blkno, int shift, void **ret_val);
|
||||
int read_block_verify(int fd, u32 magic, u64 fsid, u64 blkno, int shift, void **ret_val);
|
||||
|
||||
struct scoutfs_block_header;
|
||||
struct scoutfs_super_block;
|
||||
int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
|
||||
int shift, struct scoutfs_block_header *hdr);
|
||||
int write_block_sync(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
|
||||
int shift, struct scoutfs_block_header *hdr);
|
||||
int meta_super_in_use(int meta_fd, struct scoutfs_super_block *meta_super);
|
||||
|
||||
#define __stringify_1(x) #x
|
||||
#define __stringify(x) __stringify_1(x)
|
||||
|
||||
Reference in New Issue
Block a user