Mirror of https://github.com/versity/scoutfs.git
Synced 2026-01-06 12:06:26 +00:00

Compare commits: zab/inode_... → v1.4
29 Commits
Commits (SHA1):

fb7e43dd23
45d90a5ae4
48f1305a8a
cd4d6502b8
dff366e1a4
ca526e2bc0
e423d42106
82d2be2e4a
4102b760d0
65654ee7c0
b2d6ceeb9c
d8231016f8
3c2b329675
96ad8dd510
44f38a31ec
fb2ff753ad
bb3db7e272
c94b072925
26ae9c6e04
c8d7221ec5
7fd03dc311
4e8a088cc5
9c751c1197
875583b7ef
38e5aa77c4
57a1d75e52
51d19d797f
029a684c25
f2679d9598
@@ -2,9 +2,67 @@ Versity ScoutFS Release Notes
 =============================
 
 ---
-v1.2-rc
+v1.4
 \
-*TBD*
+*May 6, 2022*
+
+* **Fix possible client crash during server failover**
+  \
+  Fixed a narrow window during server failover and lock recovery that
+  could cause a client mount to believe that it had an inconsistent item
+  cache and panic.  This required very specific lock state and messaging
+  patterns between multiple mounts and multiple servers which made it
+  unlikely to occur in the field.
+
+---
+v1.3
+\
+*Apr 7, 2022*
+
+* **Fix rare server instability under heavy load**
+  \
+  Fixed a case of server instability under heavy load due to concurrent
+  work fully exhausting metadata block allocation pools reserved for a
+  single server transaction.  This would cause brief interruption as the
+  server shut down and the next server started up and made progress as
+  pending work was retried.
+
+* **Fix slow fencing preventing server startup**
+  \
+  If a server had to process many fence requests with a slow fencing
+  mechanism it could be interrupted before it finished.  The server
+  now makes sure heartbeat messages are sent while it is making progress
+  on fencing requests so that other quorum members don't interrupt the
+  process.
+
+* **Performance improvement in getxattr and setxattr**
+  \
+  Kernel allocation patterns in the getxattr and setxattr
+  implementations were causing significant contention between CPUs.
+  Their allocation strategy was changed so that concurrent tasks can
+  call these xattr methods without degrading performance.
+
+---
+v1.2
+\
+*Mar 14, 2022*
+
+* **Fix deadlock between fallocate() and read() system calls**
+  \
+  Fixed a lock inversion that could cause two tasks to deadlock if they
+  performed fallocate() and read() on a file at the same time.  The
+  deadlock was uninterruptible so the machine needed to be rebooted.
+  This was relatively rare as fallocate() is usually used to prepare
+  files before they're used.
+
+* **Fix instability from heavy file deletion workloads**
+  \
+  Fixed rare circumstances under which background file deletion cleanup
+  tasks could try to delete a file while it is being deleted by another
+  task.  Heavy load across multiple nodes, either many files being
+  deleted or large files being deleted, increased the chances of this
+  happening.  Heavy staging could cause this problem because staging can
+  create many internal temporary files that need to be deleted.
 
 ---
 v1.1
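The fallocate()/read() fix in v1.2 is a classic lock-ordering inversion. A minimal, illustrative sketch of the failure and the usual fix (userspace pthreads with invented lock names, not the actual scoutfs locks):

/*
 * Illustrative only -- not scoutfs code.  If one task takes A then B
 * while another takes B then A, each can end up holding one lock and
 * waiting forever on the other.  The fix is a single agreed-upon order.
 */
#include <pthread.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* e.g. inode lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* e.g. extent lock */

void *task_fallocate(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	/* ... modify extents ... */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

void *task_read(void *arg)
{
	(void)arg;
	/* the buggy version locked lock_b first, inverting the order */
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	/* ... read data ... */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}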
@@ -1318,6 +1318,17 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
 	return lo;
 }
 
+void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&alloc->seqlock);
+		*avail_total = le32_to_cpu(alloc->avail.first_nr);
+		*freed_space = list_block_space(alloc->freed.first_nr);
+	} while (read_seqretry(&alloc->seqlock, seq));
+}
+
 bool scoutfs_alloc_test_flag(struct super_block *sb,
 			     struct scoutfs_alloc *alloc, u32 flag)
 {
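The new helper uses the kernel's standard seqlock reader loop: retry until the sequence count is stable across both loads, so the pair of values is never torn and readers never block writers. A standalone sketch of the pairing it assumes (the writer side is not part of this diff):

/* Illustrative seqlock pairing, assuming <linux/seqlock.h>. */
static DEFINE_SEQLOCK(demo_seqlock);
static u32 demo_avail, demo_freed;

static void demo_writer(u32 a, u32 f)
{
	write_seqlock(&demo_seqlock);	/* sequence becomes odd */
	demo_avail = a;
	demo_freed = f;
	write_sequnlock(&demo_seqlock);	/* sequence becomes even again */
}

static void demo_reader(u32 *a, u32 *f)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&demo_seqlock);
		*a = demo_avail;
		*f = demo_freed;
	} while (read_seqretry(&demo_seqlock, seq));
}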
@@ -158,6 +158,7 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
 
 bool scoutfs_alloc_meta_low(struct super_block *sb,
 			    struct scoutfs_alloc *alloc, u32 nr);
+void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
 bool scoutfs_alloc_test_flag(struct super_block *sb,
 			    struct scoutfs_alloc *alloc, u32 flag);
 
@@ -2449,7 +2449,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
 			      struct scoutfs_block_writer *wri,
 			      struct scoutfs_key *key,
-			      struct scoutfs_btree_root *root, int alloc_low)
+			      struct scoutfs_btree_root *root, int free_budget)
 {
 	u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
 	struct scoutfs_block *bl = NULL;
@@ -2459,11 +2459,15 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 	struct scoutfs_avl_node *node;
 	struct scoutfs_avl_node *next;
 	struct scoutfs_key par_next;
+	int nr_freed = 0;
 	int nr_par;
 	int level;
 	int ret;
 	int i;
 
+	if (WARN_ON_ONCE(free_budget <= 0))
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
 		return -EIO; /* XXX corruption */
 
@@ -2538,8 +2542,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 	while (node) {
 
 		/* make sure we can always free parents after leaves */
-		if (scoutfs_alloc_meta_low(sb, alloc,
-					   alloc_low + nr_par + 1)) {
+		if ((nr_freed + 1 + nr_par) > free_budget) {
 			ret = 0;
 			goto out;
 		}
@@ -2553,6 +2556,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 				    le64_to_cpu(ref.blkno));
 		if (ret < 0)
 			goto out;
+		nr_freed++;
 
 		node = scoutfs_avl_next(&bt->item_root, node);
 		if (node) {
@@ -2568,6 +2572,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 			   blknos[i]);
 		ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
 		BUG_ON(ret); /* checked meta low, freed should fit */
+		nr_freed++;
 	}
 
 	/* restart walk past the subtree we just freed */
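The signature change replaces a low-water check against the shared allocator with an explicit per-call budget: the walk stops cleanly once it has freed its quota, and the caller commits and calls again. A hedged sketch of the resulting caller loop, modeled on the server_log_merge_free_work hunk later in this diff (commit_dirty_blocks() is an invented stand-in for the commit path, and the emptiness check on root->ref.blkno is an assumption):

/* Sketch: drain a btree in budgeted, committed passes. */
static int free_btree_in_passes(struct super_block *sb,
				struct scoutfs_alloc *alloc,
				struct scoutfs_block_writer *wri,
				struct scoutfs_key *key,
				struct scoutfs_btree_root *root)
{
	int ret;

	do {
		/* frees at most this many blocks, then returns 0 */
		ret = scoutfs_btree_free_blocks(sb, alloc, wri, key, root,
						COMMIT_HOLD_ALLOC_BUDGET / 2);
		if (ret < 0)
			return ret;
		/* persist this bounded batch before the next pass */
		ret = commit_dirty_blocks(sb);
		if (ret < 0)
			return ret;
	} while (root->ref.blkno != 0);	/* assumed "tree now empty" test */

	return 0;
}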
@@ -125,7 +125,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
 			      struct scoutfs_block_writer *wri,
 			      struct scoutfs_key *key,
-			      struct scoutfs_btree_root *root, int alloc_low);
+			      struct scoutfs_btree_root *root, int free_budget);
 
 void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);
 
@@ -157,6 +157,7 @@
 	EXPAND_COUNTER(orphan_scan_error)			\
 	EXPAND_COUNTER(orphan_scan_item)			\
 	EXPAND_COUNTER(orphan_scan_omap_set)			\
+	EXPAND_COUNTER(quorum_candidate_server_stopping)	\
 	EXPAND_COUNTER(quorum_elected)				\
 	EXPAND_COUNTER(quorum_fence_error)			\
 	EXPAND_COUNTER(quorum_fence_leader)			\
@@ -289,6 +289,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	lock->sb = sb;
 	init_waitqueue_head(&lock->waitq);
 	lock->mode = SCOUTFS_LOCK_NULL;
+	lock->invalidating_mode = SCOUTFS_LOCK_NULL;
 
 	atomic64_set(&lock->forest_bloom_nr, 0);
 
@@ -666,7 +667,9 @@ struct inv_req {
  *
  * Before we start invalidating the lock we set the lock to the new
  * mode, preventing further incompatible users of the old mode from
- * using the lock while we're invalidating.
+ * using the lock while we're invalidating.  We record the previously
+ * granted mode so that we can send lock recover responses with the old
+ * granted mode during invalidation.
  */
 static void lock_invalidate_worker(struct work_struct *work)
 {
@@ -691,7 +694,8 @@ static void lock_invalidate_worker(struct work_struct *work)
 		if (!lock_counts_match(nl->new_mode, lock->users))
 			continue;
 
-		/* set the new mode, no incompatible users during inval */
+		/* set the new mode, no incompatible users during inval, recov needs old */
+		lock->invalidating_mode = lock->mode;
 		lock->mode = nl->new_mode;
 
 		/* move everyone that's ready to our private list */
@@ -734,6 +738,8 @@ static void lock_invalidate_worker(struct work_struct *work)
 		list_del(&ireq->head);
 		kfree(ireq);
 
+		lock->invalidating_mode = SCOUTFS_LOCK_NULL;
+
 		if (list_empty(&lock->inv_list)) {
 			/* finish if another request didn't arrive */
 			list_del_init(&lock->inv_head);
@@ -824,6 +830,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 {
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_net_lock_recover *nlr;
+	enum scoutfs_lock_mode mode;
 	struct scoutfs_lock *lock;
 	struct scoutfs_lock *next;
 	struct rb_node *node;
@@ -844,10 +851,15 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 
 	for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {
 
+		if (lock->invalidating_mode != SCOUTFS_LOCK_NULL)
+			mode = lock->invalidating_mode;
+		else
+			mode = lock->mode;
+
 		nlr->locks[i].key = lock->start;
 		nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq);
-		nlr->locks[i].old_mode = lock->mode;
-		nlr->locks[i].new_mode = lock->mode;
+		nlr->locks[i].old_mode = mode;
+		nlr->locks[i].new_mode = mode;
 
 		node = rb_next(&lock->node);
 		if (node)
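This is the client-side half of the v1.4 failover fix: while a lock is mid-invalidation, a recovering server must be told the mode the client actually still holds, not the reduced mode it is moving to, or the server can grant a conflicting lock and the client's item cache looks inconsistent. The selection rule, restated as a helper (an illustration of the if/else above, not code from the diff):

static inline enum scoutfs_lock_mode recover_mode(struct scoutfs_lock *lock)
{
	/* report the previously granted mode while invalidation is in flight */
	return lock->invalidating_mode != SCOUTFS_LOCK_NULL ?
	       lock->invalidating_mode : lock->mode;
}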
@@ -39,6 +39,7 @@ struct scoutfs_lock {
 	struct list_head cov_list;
 
 	enum scoutfs_lock_mode mode;
+	enum scoutfs_lock_mode invalidating_mode;
 	unsigned int waiters[SCOUTFS_LOCK_NR_MODES];
 	unsigned int users[SCOUTFS_LOCK_NR_MODES];
 
@@ -749,7 +749,7 @@ out:
 	if (ret < 0) {
 		scoutfs_err(sb, "lock server err %d during client rid %016llx farewell, shutting down",
 			    ret, rid);
-		scoutfs_server_abort(sb);
+		scoutfs_server_stop(sb);
 	}
 
 	return ret;
@@ -1292,7 +1292,7 @@ restart:
 		if (ret) {
 			scoutfs_err(sb, "client fence returned err %d, shutting down server",
 				    ret);
-			scoutfs_server_abort(sb);
+			scoutfs_server_stop(sb);
 		}
 	}
 	destroy_conn(acc);
@@ -105,6 +105,8 @@ enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
 struct quorum_status {
 	enum quorum_role role;
 	u64 term;
+	u64 server_start_term;
+	int server_event;
 	int vote_for;
 	unsigned long vote_bits;
 	ktime_t timeout;
@@ -117,7 +119,6 @@ struct quorum_info {
 	bool shutdown;
 
 	int our_quorum_slot_nr;
-	unsigned long flags;
 	int votes_needed;
 
 	spinlock_t show_lock;
@@ -128,8 +129,6 @@ struct quorum_info {
 	struct scoutfs_sysfs_attrs ssa;
 };
 
-#define QINF_FLAG_SERVER 0
-
 #define DECLARE_QUORUM_INFO(sb, name) \
 	struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info
 #define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
@@ -494,16 +493,6 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
 	return ret;
 }
 
-/*
- * The calling server has fenced previous leaders and reclaimed their
- * resources.  We can now update our fence event with a greater term to
- * stop future leaders from doing the same.
- */
-int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
-{
-	return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
-}
-
 /*
  * The calling server has been elected and has started running but can't
  * yet assume that it has exclusive access to the metadata device.  We
@@ -593,15 +582,9 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
 	}
 
 out:
-	if (fence_started) {
-		err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
-		if (ret == 0)
-			ret = err;
-	} else {
-		err = scoutfs_quorum_fence_complete(sb, term);
-		if (ret == 0)
-			ret = err;
-	}
+	err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
+	if (ret == 0)
+		ret = err;
 
 	if (ret < 0)
 		scoutfs_inc_counter(sb, quorum_fence_error);
@@ -609,12 +592,26 @@ out:
 	return ret;
 }
 
+/*
+ * The main quorum task maintains its private status.  It seemed cleaner
+ * to occasionally copy the status for showing in sysfs/debugfs files
+ * than to have the two lock access to shared status.  The show copy is
+ * updated after being modified before the quorum task sleeps for a
+ * significant amount of time, either waiting on timeouts or interacting
+ * with the server.
+ */
+static void update_show_status(struct quorum_info *qinf, struct quorum_status *qst)
+{
+	spin_lock(&qinf->show_lock);
+	qinf->show_status = *qst;
+	spin_unlock(&qinf->show_lock);
+}
+
 /*
  * The quorum work always runs in the background of quorum member
  * mounts.  It's responsible for starting and stopping the server if
- * it's elected leader, and the server can call back into it to let it
- * know that it has shut itself down (perhaps due to error) so that the
- * work should stop sending heartbeats.
+ * it's elected leader.  While it's leader it sends heartbeats to
+ * suppress other quorum work from standing for election.
  */
 static void scoutfs_quorum_worker(struct work_struct *work)
 {
@@ -622,7 +619,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	struct super_block *sb = qinf->sb;
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
-	struct quorum_status qst;
+	struct quorum_status qst = {0,};
 	int ret;
 	int err;
 
@@ -631,9 +628,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 
 	/* start out as a follower */
 	qst.role = FOLLOWER;
-	qst.term = 0;
 	qst.vote_for = -1;
-	qst.vote_bits = 0;
 
 	/* read our starting term from greatest in all events in all slots */
 	read_greatest_term(sb, &qst.term);
@@ -651,6 +646,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 
 	while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {
 
+		update_show_status(qinf, &qst);
+
 		ret = recv_msg(sb, &msg, qst.timeout);
 		if (ret < 0) {
 			if (ret != -ETIMEDOUT && ret != -EAGAIN) {
@@ -667,24 +664,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		    msg.term < qst.term)
 			msg.type = SCOUTFS_QUORUM_MSG_INVALID;
 
-		/* if the server has shutdown we become follower */
-		if (!test_bit(QINF_FLAG_SERVER, &qinf->flags) &&
-		    qst.role == LEADER) {
-			qst.role = FOLLOWER;
-			qst.vote_for = -1;
-			qst.vote_bits = 0;
-			qst.timeout = election_timeout();
-			scoutfs_inc_counter(sb, quorum_server_shutdown);
-
-			send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
-					qst.term);
-			scoutfs_inc_counter(sb, quorum_send_resignation);
-		}
-
-		spin_lock(&qinf->show_lock);
-		qinf->show_status = qst;
-		spin_unlock(&qinf->show_lock);
-
 		trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
 					  qst.vote_bits,
 					  ktime_to_timespec64(qst.timeout));
@@ -695,7 +674,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			if (qst.role == LEADER) {
 				scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
 					     msg.type, msg.from, msg.term, qst.term);
-				scoutfs_server_stop(sb);
 			}
 			qst.role = FOLLOWER;
 			qst.term = msg.term;
@@ -717,6 +695,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
+			/* .. but only if their server has stopped */
+			if (!scoutfs_server_is_down(sb)) {
+				qst.timeout = election_timeout();
+				scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
+				continue;
+			}
+
 			qst.role = CANDIDATE;
 			qst.term++;
 			qst.vote_for = -1;
@@ -758,29 +743,69 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 					qst.term);
 			qst.timeout = heartbeat_interval();
 
+			update_show_status(qinf, &qst);
+
 			/* record that we've been elected before starting up server */
 			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
 			if (ret < 0)
 				goto out;
 
-			/* make very sure server is fully shut down */
-			scoutfs_server_stop(sb);
-			/* set server bit before server shutdown could clear */
-			set_bit(QINF_FLAG_SERVER, &qinf->flags);
-
-			ret = scoutfs_server_start(sb, qst.term);
-			if (ret < 0) {
-				clear_bit(QINF_FLAG_SERVER, &qinf->flags);
-				/* store our increased term */
-				err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
-							  true);
-				if (err < 0) {
-					ret = err;
-					goto out;
-				}
-				ret = 0;
-				continue;
-			}
-		}
+			qst.server_start_term = qst.term;
+			qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
+			scoutfs_server_start(sb, qst.term);
+		}
+
+		/*
+		 * This leader's server is up, having finished fencing
+		 * previous leaders.  We update the fence event with the
+		 * current term to let future leaders know that previous
+		 * servers have been fenced.
+		 */
+		if (qst.role == LEADER && qst.server_event != SCOUTFS_QUORUM_EVENT_FENCE &&
+		    scoutfs_server_is_up(sb)) {
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, qst.term, true);
+			if (ret < 0)
+				goto out;
+			qst.server_event = SCOUTFS_QUORUM_EVENT_FENCE;
+		}
+
+		/*
+		 * Stop a running server if we're no longer leader in
+		 * its term.
+		 */
+		if (!(qst.role == LEADER && qst.term == qst.server_start_term) &&
+		    scoutfs_server_is_running(sb)) {
+			scoutfs_server_stop(sb);
+		}
+
+		/*
+		 * A previously running server has stopped.  The quorum
+		 * protocol might have shut it down by changing roles or
+		 * it might have stopped on its own, perhaps on errors.
+		 * If we're still a leader then we become a follower and
+		 * send resignations to encourage the next election.
+		 * Always update the _STOP event to stop connections and
+		 * fencing.
+		 */
+		if (qst.server_start_term > 0 && scoutfs_server_is_down(sb)) {
+			if (qst.role == LEADER) {
+				qst.role = FOLLOWER;
+				qst.vote_for = -1;
+				qst.vote_bits = 0;
+				qst.timeout = election_timeout();
+				scoutfs_inc_counter(sb, quorum_server_shutdown);
+
+				send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
+						qst.server_start_term);
+				scoutfs_inc_counter(sb, quorum_send_resignation);
+			}
+
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
						  qst.server_start_term, true);
+			if (ret < 0)
+				goto out;
+
+			qst.server_start_term = 0;
+		}
 
 		/* leaders regularly send heartbeats to delay elections */
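Distilled, the reworked worker no longer reacts to a callback from the server; it reconciles its quorum role against observed server state on every pass. A paraphrase of the control flow above, in pseudocode rather than verbatim source:

/* Pseudocode paraphrase of the quorum worker's per-iteration logic. */
for (;;) {
	update_show_status();			/* publish copy for sysfs */
	handle_message_or_timeout();		/* vote, heartbeat, resign */
	if (just_elected_leader())
		record_elect_event_then_start_server();
	if (leader && server_is_up() && !fence_event_recorded)
		record_fence_event();		/* previous leaders fenced */
	if (!leader_for_servers_term && server_is_running())
		stop_server();			/* async; observed below */
	if (started_a_server && server_is_down())
		resign_if_leader_and_record_stop_event();
}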
@@ -817,12 +842,19 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		}
 	}
 
+	update_show_status(qinf, &qst);
+
 	/* always try to stop a running server as we stop */
-	if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
-		scoutfs_server_stop(sb);
-		scoutfs_fence_stop(sb);
-		send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
-				qst.term);
+	if (scoutfs_server_is_running(sb)) {
+		scoutfs_server_stop_wait(sb);
+		send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term);
+	}
+
+	if (qst.server_start_term > 0) {
+		err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
+					  qst.server_start_term, true);
+		if (err < 0 && ret == 0)
+			ret = err;
 	}
 
 	/* record that this slot no longer has an active quorum */
@@ -834,21 +866,6 @@ out:
 	}
 }
 
-/*
- * The calling server has shutdown and is no longer using shared
- * resources.  Clear the bit so that we stop sending heartbeats and
- * allow the next server to be elected.  Update the stop event so that
- * it won't be considered available by clients or fenced by the next
- * leader.
- */
-void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
-{
-	DECLARE_QUORUM_INFO(sb, qinf);
-
-	clear_bit(QINF_FLAG_SERVER, &qinf->flags);
-	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
-}
-
 /*
  * Clients read quorum blocks looking for the leader with a server whose
  * address it can try and connect to.
@@ -970,6 +987,8 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 			   qinf->our_quorum_slot_nr);
 	snprintf_ret(buf, size, &ret, "term %llu\n",
 		     qst.term);
+	snprintf_ret(buf, size, &ret, "server_start_term %llu\n", qst.server_start_term);
+	snprintf_ret(buf, size, &ret, "server_event %d\n", qst.server_event);
 	snprintf_ret(buf, size, &ret, "role %d (%s)\n",
 		     qst.role, role_str(qst.role));
 	snprintf_ret(buf, size, &ret, "vote_for %d\n",
@@ -2,14 +2,12 @@
 #define _SCOUTFS_QUORUM_H_
 
 int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
-void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);
 
 u8 scoutfs_quorum_votes_needed(struct super_block *sb);
 void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
 			     struct sockaddr_in *sin);
 
 int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
-int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);
 
 int scoutfs_quorum_setup(struct super_block *sb);
 void scoutfs_quorum_shutdown(struct super_block *sb);
 
@@ -1843,6 +1843,53 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
 	TP_ARGS(sb, rid, nr_clients)
 );
 
+DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
+	TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int exceeded),
+	TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(int, holding)
+		__field(int, applying)
+		__field(int, nr_holders)
+		__field(__u32, avail_before)
+		__field(__u32, freed_before)
+		__field(int, exceeded)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->holding = !!holding;
+		__entry->applying = !!applying;
+		__entry->nr_holders = nr_holders;
+		__entry->avail_before = avail_before;
+		__entry->freed_before = freed_before;
+		__entry->exceeded = !!exceeded;
+	),
+	TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u exceeded %u",
+		  SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
+		  __entry->avail_before, __entry->freed_before, __entry->exceeded)
+);
+DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
+	TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int exceeded),
+	TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
+);
+DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
+	TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int exceeded),
+	TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
+);
+DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
+	TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int exceeded),
+	TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
+);
+DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
+	TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
+		 u32 avail_before, u32 freed_before, int exceeded),
+	TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
+);
+
 #define slt_symbolic(mode)						\
 	__print_symbolic(mode,						\
 		{ SLT_CLIENT,		"client" },			\
@@ -52,6 +52,41 @@
  * mount will become the leader and have less trouble.
  */
 
+/*
+ * Tracks all the holders and commit work that are operating on server
+ * commits.  It synchronizes holders modifying the blocks in the commit
+ * and the commit work writing dirty blocks that make up a consistent
+ * commit.  It limits the number of active holders so that they don't
+ * fully consume the allocation resources prepared for a commit.
+ */
+struct commit_users {
+	wait_queue_head_t waitq;
+	spinlock_t lock;
+	struct list_head holding;
+	struct list_head applying;
+	unsigned int nr_holders;
+	u32 avail_before;
+	u32 freed_before;
+	bool exceeded;
+};
+
+static void init_commit_users(struct commit_users *cusers)
+{
+	memset(cusers, 0, sizeof(struct commit_users));
+	init_waitqueue_head(&cusers->waitq);
+	spin_lock_init(&cusers->lock);
+	INIT_LIST_HEAD(&cusers->holding);
+	INIT_LIST_HEAD(&cusers->applying);
+}
+
+#define TRACE_COMMIT_USERS(sb, cusers, which)					\
+do {										\
+	__typeof__(cusers) _cusers = (cusers);					\
+	trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
+		!list_empty(&_cusers->applying), _cusers->nr_holders,		\
+		_cusers->avail_before, _cusers->freed_before, _cusers->exceeded); \
+} while (0)
+
 struct server_info {
 	struct super_block *sb;
 	spinlock_t lock;
@@ -59,9 +94,7 @@ struct server_info {
 
 	struct workqueue_struct *wq;
 	struct work_struct work;
-	int err;
-	bool shutting_down;
-	struct completion start_comp;
+	int status;
 	u64 term;
 	struct scoutfs_net_connection *conn;
 
@@ -69,8 +102,7 @@ struct server_info {
 	atomic64_t seq_atomic;
 
 	/* request processing coordinates shared commits */
-	struct rw_semaphore commit_rwsem;
-	struct llist_head commit_waiters;
+	struct commit_users cusers;
 	struct work_struct commit_work;
 
 	struct list_head clients;
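TRACE_COMMIT_USERS copies its argument into a `__typeof__` local so the macro argument is evaluated exactly once, the usual kernel macro-hygiene trick. The idiom in isolation (invented macro and helper names, purely illustrative):

#define USE_ONCE(p)						\
do {								\
	__typeof__(p) _p = (p);	/* argument evaluated once */	\
	use(_p->holding);	/* later uses read the local, */\
	use(_p->applying);	/* so a side-effecting argument	*/\
} while (0)			/* is still evaluated one time */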
@@ -155,87 +187,286 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
 	return is_set;
 }
 
-struct commit_waiter {
-	struct completion comp;
-	struct llist_node node;
-	int ret;
+enum {
+	SERVER_NOP = 0,
+	SERVER_STARTING,
+	SERVER_UP,
+	SERVER_STOPPING,
+	SERVER_DOWN,
 };
 
-static bool test_shutting_down(struct server_info *server)
+bool scoutfs_server_is_running(struct super_block *sb)
 {
-	smp_rmb();
-	return server->shutting_down;
+	DECLARE_SERVER_INFO(sb, server);
+	long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP);
+
+	return was == SERVER_STARTING || was == SERVER_UP;
 }
 
-static void set_shutting_down(struct server_info *server, bool val)
+bool scoutfs_server_is_up(struct super_block *sb)
 {
-	server->shutting_down = val;
-	smp_wmb();
+	DECLARE_SERVER_INFO(sb, server);
+
+	return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_UP;
+}
+
+bool scoutfs_server_is_down(struct super_block *sb)
+{
+	DECLARE_SERVER_INFO(sb, server);
+
+	return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_DOWN;
+}
+
+static bool server_is_stopping(struct server_info *server)
+{
+	return cmpxchg(&server->status, SERVER_NOP, SERVER_NOP) == SERVER_STOPPING;
 }
 
 static void stop_server(struct server_info *server)
 {
-	set_shutting_down(server, true);
-	wake_up(&server->waitq);
+	long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP);
+
+	if ((was == SERVER_STARTING || was == SERVER_UP) &&
+	    cmpxchg(&server->status, was, SERVER_STOPPING) == was)
+		wake_up(&server->waitq);
+}
+
+static void server_up(struct server_info *server)
+{
+	cmpxchg(&server->status, SERVER_STARTING, SERVER_UP);
+}
+
+static void server_down(struct server_info *server)
+{
+	long was = cmpxchg(&server->status, SERVER_NOP, SERVER_NOP);
+
+	if (was != SERVER_DOWN)
+		cmpxchg(&server->status, was, SERVER_DOWN);
 }
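A note on the idiom above: `cmpxchg(&status, SERVER_NOP, SERVER_NOP)` never changes memory (it only "replaces" SERVER_NOP with itself), but it always returns the current value with the full ordering of an atomic read-modify-write, so callers observe the latest status without a lock or explicit barriers. In isolation:

/* Illustrative only: cmpxchg used as a fully-ordered atomic read. */
static long atomic_read_status(long *status)
{
	/* if *status == 0 it is "replaced" with 0; either way the
	 * current value is returned */
	return cmpxchg(status, 0, 0);
}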
 /*
- * Hold the shared rwsem that lets multiple holders modify blocks in the
- * current commit and prevents the commit worker from acquiring the
- * exclusive write lock to write the commit.
+ * The per-holder allocation block use budget balances batching
+ * efficiency and concurrency.  The larger this gets, the fewer
+ * concurrent server operations can be performed in one commit.  Commits
+ * are immediately written after being dirtied so this really only
+ * limits immediate concurrency under load, not batching over time as
+ * one might expect if commits were long lived.
  *
- * This is exported for server components isolated in their own files
- * (lock_server) and which are not called directly by the server core
- * (async timeout work).
+ * The upper bound is determined by the server commit hold path that can
+ * dirty the most blocks.
  */
-void scoutfs_server_hold_commit(struct super_block *sb)
+#define COMMIT_HOLD_ALLOC_BUDGET 500
+
+struct commit_hold {
+	struct list_head entry;
+	ktime_t start;
+	u32 avail;
+	u32 freed;
+	int ret;
+	bool exceeded;
+};
+
+#define COMMIT_HOLD(name) \
+	struct commit_hold name = { .entry = LIST_HEAD_INIT(name.entry) }
+
+/*
+ * See if the currently active holders have, all together, consumed more
+ * allocation resources than they were allowed.  We don't have
+ * per-holder allocation consumption tracking.  The best we can do is
+ * flag all the current holders so that as they release we can see
+ * everyone involved in crossing the limit.
+ */
+static void check_holder_budget(struct super_block *sb, struct server_info *server,
+				struct commit_users *cusers)
+{
+	static bool exceeded_once = false;
+	struct commit_hold *hold;
+	struct timespec ts;
+	u32 avail_used;
+	u32 freed_used;
+	u32 avail_now;
+	u32 freed_now;
+	u32 budget;
+
+	assert_spin_locked(&cusers->lock);
+
+	if (cusers->exceeded || cusers->nr_holders == 0 || exceeded_once)
+		return;
+
+	scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
+	avail_used = cusers->avail_before - avail_now;
+	freed_used = cusers->freed_before - freed_now;
+	budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
+	if (avail_used <= budget && freed_used <= budget)
+		return;
+
+	exceeded_once = true;
+	cusers->exceeded = cusers->nr_holders;
+
+	scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
+		    cusers->nr_holders, cusers->avail_before, avail_now,
+		    cusers->freed_before, freed_now);
+
+	list_for_each_entry(hold, &cusers->holding, entry) {
+		ts = ktime_to_timespec(hold->start);
+		scoutfs_err(sb, "exceeding hold start %llu.%09llu av %u fr %u",
+			    (u64)ts.tv_sec, (u64)ts.tv_nsec, hold->avail, hold->freed);
+		hold->exceeded = true;
+	}
+}
+
+/*
+ * We don't have per-holder consumption.  We allow commit holders as
+ * long as the total budget of all the holders doesn't exceed the alloc
+ * resources that were available
+ */
+static bool commit_alloc_has_room(struct server_info *server, struct commit_users *cusers,
+				  unsigned int more_holders)
+{
+	u32 avail_before;
+	u32 freed_before;
+	u32 budget;
+
+	if (cusers->nr_holders > 0) {
+		avail_before = cusers->avail_before;
+		freed_before = cusers->freed_before;
+	} else {
+		scoutfs_alloc_meta_remaining(&server->alloc, &avail_before, &freed_before);
+	}
+
+	budget = (cusers->nr_holders + more_holders) * COMMIT_HOLD_ALLOC_BUDGET;
+
+	return avail_before >= budget && freed_before >= budget;
+}
+
+static bool hold_commit(struct super_block *sb, struct server_info *server,
+			struct commit_users *cusers, struct commit_hold *hold)
+{
+	bool held = false;
+
+	spin_lock(&cusers->lock);
+
+	TRACE_COMMIT_USERS(sb, cusers, hold);
+
+	check_holder_budget(sb, server, cusers);
+
+	/* +2 for our additional hold and then for the final commit work the server does */
+	if (list_empty(&cusers->applying) && commit_alloc_has_room(server, cusers, 2)) {
+		scoutfs_alloc_meta_remaining(&server->alloc, &hold->avail, &hold->freed);
+		if (cusers->nr_holders == 0) {
+			cusers->avail_before = hold->avail;
+			cusers->freed_before = hold->freed;
+			cusers->exceeded = false;
+		}
+		hold->exceeded = false;
+		hold->start = ktime_get();
+		list_add_tail(&hold->entry, &cusers->holding);
+		cusers->nr_holders++;
+		held = true;
+	}
+
+	spin_unlock(&cusers->lock);
+
+	return held;
+}
+
+/*
+ * Hold the server commit so that we can make a consistent change to the
+ * dirty blocks in the commit.  The commit won't be written while we
+ * hold it.
+ */
+static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
 {
 	DECLARE_SERVER_INFO(sb, server);
+	struct commit_users *cusers = &server->cusers;
+
+	BUG_ON(!list_empty(&hold->entry));
 
 	scoutfs_inc_counter(sb, server_commit_hold);
 
-	down_read(&server->commit_rwsem);
+	wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
 }
 
 /*
  * This is called while holding the commit and returns once the commit
  * is successfully written.  Many holders can all wait for all holders
  * to drain before their shared commit is applied and they're all woken.
  *
- * It's important to realize that our commit_waiter list node might be
- * serviced by a currently executing commit work that is blocked waiting
- * for the holders to release the commit_rwsem.  This caller can return
- * from wait_for_commit() while another future commit_work is still
- * queued.
- *
  * This could queue delayed work but we're first trying to have batching
  * work by having concurrent modification line up behind a commit in
  * flight.  Once the commit finishes it'll unlock and hopefully everyone
  * will race to make their changes and they'll all be applied by the
  * next commit after that.
  */
-int scoutfs_server_apply_commit(struct super_block *sb, int err)
+static int server_apply_commit(struct super_block *sb, struct commit_hold *hold, int err)
 {
 	DECLARE_SERVER_INFO(sb, server);
-	struct commit_waiter cw;
+	struct commit_users *cusers = &server->cusers;
+	struct timespec ts;
+	bool start_commit;
+
+	spin_lock(&cusers->lock);
+
+	TRACE_COMMIT_USERS(sb, cusers, apply);
+
+	check_holder_budget(sb, server, cusers);
+
+	if (hold->exceeded) {
+		ts = ktime_to_timespec(hold->start);
+		scoutfs_err(sb, "exceeding hold start %llu.%09llu stack:",
+			    (u64)ts.tv_sec, (u64)ts.tv_nsec);
+		dump_stack();
+	}
 
 	if (err == 0) {
-		cw.ret = 0;
-		init_completion(&cw.comp);
-		llist_add(&cw.node, &server->commit_waiters);
-		scoutfs_inc_counter(sb, server_commit_queue);
+		list_move_tail(&hold->entry, &cusers->applying);
+	} else {
+		list_del_init(&hold->entry);
+		hold->ret = err;
 	}
+	cusers->nr_holders--;
+	start_commit = cusers->nr_holders == 0 && !list_empty(&cusers->applying);
+	spin_unlock(&cusers->lock);
 
-	up_read(&server->commit_rwsem);
+	if (start_commit)
+		queue_work(server->wq, &server->commit_work);
 
-	if (err == 0) {
-		wait_for_completion(&cw.comp);
-		err = cw.ret;
-	}
+	wait_event(cusers->waitq, list_empty_careful(&hold->entry));
+	smp_rmb(); /* entry load before ret */
+	return hold->ret;
+}
 
-	return err;
+/*
+ * Start a commit from the commit work.  We should only have been queued
+ * while a holder is waiting to apply after all active holders have
+ * finished.
+ */
+static int commit_start(struct super_block *sb, struct commit_users *cusers)
+{
+	int ret = 0;
+
+	/* make sure holders held off once commit started */
+	spin_lock(&cusers->lock);
+	TRACE_COMMIT_USERS(sb, cusers, start);
+	if (WARN_ON_ONCE(list_empty(&cusers->applying) || cusers->nr_holders != 0))
+		ret = -EINVAL;
+	spin_unlock(&cusers->lock);
+
+	return ret;
+}
+
+/*
+ * Finish a commit from the commit work.  Give the result to all the
+ * holders who are waiting for the commit to be applied.
+ */
+static void commit_end(struct super_block *sb, struct commit_users *cusers, int ret)
+{
+	struct commit_hold *hold;
+	struct commit_hold *tmp;
+
+	spin_lock(&cusers->lock);
+	TRACE_COMMIT_USERS(sb, cusers, end);
+	list_for_each_entry(hold, &cusers->applying, entry)
+		hold->ret = ret;
+	smp_wmb(); /* ret stores before list updates */
+	list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
+		list_del_init(&hold->entry);
+	spin_unlock(&cusers->lock);
+
+	wake_up(&cusers->waitq);
 }
 
 static void get_roots(struct super_block *sb,
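Every request handler in the rest of this diff is converted to the same on-stack hold pattern. The shape, extracted from the server_alloc_inodes conversion below (the handler body is elided):

/* The calling pattern handlers are converted to in this diff. */
static int example_handler(struct super_block *sb)
{
	COMMIT_HOLD(hold);
	int ret;

	server_hold_commit(sb, &hold);	/* may wait for budget/room */

	ret = 0;	/* ... dirty blocks in the shared commit ... */

	/* releases the hold and waits for the commit work's result */
	return server_apply_commit(sb, &hold, ret);
}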
@@ -297,19 +528,17 @@ static void set_roots(struct server_info *server,
  * Concurrent request processing dirties blocks in a commit and makes
  * the modifications persistent before replying.  We'd like to batch
  * these commits as much as is reasonable so that we don't degrade to a
- * few IO round trips per request.
+ * few synchronous IOs per request.
  *
  * Getting that batching right is bound up in the concurrency of request
  * processing so a clear way to implement the batched commits is to
- * implement commits with a single pending work func like the
- * processing.
+ * implement commits with a single pending work func.
  *
- * Processing paths acquire the rwsem for reading while they're making
- * multiple dependent changes.  When they're done and want it persistent
- * they add themselves to the list of waiters and queue the commit work.
- * This work runs, acquires the lock to exclude other writers, and
- * performs the commit.  Readers can run concurrently with these
- * commits.
+ * Processing paths hold the commit while they're making multiple
+ * dependent changes.  When they're done and want it persistent they
+ * queue the commit work.  This work runs, performs the commit, and
+ * wakes all the applying waiters with the result.  Readers can run
+ * concurrently with these commits.
  */
 static void scoutfs_server_commit_func(struct work_struct *work)
 {
@@ -317,15 +546,15 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 						  commit_work);
 	struct super_block *sb = server->sb;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct commit_waiter *cw;
-	struct commit_waiter *pos;
-	struct llist_node *node;
+	struct commit_users *cusers = &server->cusers;
 	int ret;
 
 	trace_scoutfs_server_commit_work_enter(sb, 0, 0);
 	scoutfs_inc_counter(sb, server_commit_worker);
 
-	down_write(&server->commit_rwsem);
+	ret = commit_start(sb, cusers);
+	if (ret < 0)
+		goto out;
 
 	if (scoutfs_forcing_unmount(sb)) {
 		ret = -EIO;
@@ -402,15 +631,8 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 
 	ret = 0;
 out:
-	node = llist_del_all(&server->commit_waiters);
+	commit_end(sb, cusers, ret);
 
-	/* waiters always wait on completion, cw could be free after complete */
-	llist_for_each_entry_safe(cw, pos, node, node) {
-		cw->ret = ret;
-		complete(&cw->comp);
-	}
-
-	up_write(&server->commit_rwsem);
 	trace_scoutfs_server_commit_work_exit(sb, 0, ret);
 }
 
@@ -421,6 +643,7 @@ static int server_alloc_inodes(struct super_block *sb,
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_net_inode_alloc ial = { 0, };
+	COMMIT_HOLD(hold);
 	__le64 lecount;
 	u64 ino;
 	u64 nr;
@@ -433,7 +656,7 @@ static int server_alloc_inodes(struct super_block *sb,
 
 	memcpy(&lecount, arg, arg_len);
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	spin_lock(&sbi->next_ino_lock);
 	ino = le64_to_cpu(super->next_ino);
@@ -441,7 +664,7 @@ static int server_alloc_inodes(struct super_block *sb,
 	le64_add_cpu(&super->next_ino, nr);
 	spin_unlock(&sbi->next_ino_lock);
 
-	ret = scoutfs_server_apply_commit(sb, 0);
+	ret = server_apply_commit(sb, &hold, 0);
 	if (ret == 0) {
 		ial.ino = cpu_to_le64(ino);
 		ial.nr = cpu_to_le64(nr);
@@ -819,7 +1042,7 @@ static int next_log_merge_item(struct super_block *sb,
 #define FINALIZE_POLL_MS (11)
 #define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
 static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
-					u64 rid)
+					u64 rid, struct commit_hold *hold)
 {
 	struct server_info *server = SCOUTFS_SB(sb)->server_info;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
@@ -945,13 +1168,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		/* wait a bit for mounts to arrive */
 		if (others_active) {
 			mutex_unlock(&server->logs_mutex);
-			ret = scoutfs_server_apply_commit(sb, 0);
+			ret = server_apply_commit(sb, hold, 0);
 			if (ret < 0)
 				err_str = "applying commit before waiting for finalized";
 
 			msleep(FINALIZE_POLL_MS);
 
-			scoutfs_server_hold_commit(sb);
+			server_hold_commit(sb, hold);
 			mutex_lock(&server->logs_mutex);
 
 			/* done if we timed out */
@@ -1044,6 +1267,7 @@ static int server_get_log_trees(struct super_block *sb,
 	struct scoutfs_log_trees lt;
 	struct scoutfs_key key;
 	bool unlock_alloc = false;
+	COMMIT_HOLD(hold);
 	u64 data_zone_blocks;
 	char *err_str = NULL;
 	u64 nr;
@@ -1054,7 +1278,7 @@ static int server_get_log_trees(struct super_block *sb,
 		goto out;
 	}
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	mutex_lock(&server->logs_mutex);
 
@@ -1092,7 +1316,7 @@ static int server_get_log_trees(struct super_block *sb,
 	}
 
 	/* drops and re-acquires the mutex and commit if it has to wait */
-	ret = finalize_and_start_log_merge(sb, &lt, rid);
+	ret = finalize_and_start_log_merge(sb, &lt, rid, &hold);
 	if (ret < 0)
 		goto unlock;
 
@@ -1187,7 +1411,7 @@ unlock:
 	mutex_unlock(&server->alloc_mutex);
 	mutex_unlock(&server->logs_mutex);
 
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 out:
 	if (ret < 0)
 		scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
@@ -1213,6 +1437,7 @@ static int server_commit_log_trees(struct super_block *sb,
 	struct scoutfs_log_trees *exist;
 	struct scoutfs_log_trees lt;
 	struct scoutfs_key key;
+	COMMIT_HOLD(hold);
 	char *err_str = NULL;
 	bool committed = false;
 	int ret;
@@ -1231,7 +1456,7 @@ static int server_commit_log_trees(struct super_block *sb,
 		goto out;
 	}
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	mutex_lock(&server->logs_mutex);
 
@@ -1280,7 +1505,7 @@ static int server_commit_log_trees(struct super_block *sb,
 unlock:
 	mutex_unlock(&server->logs_mutex);
 
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 	if (ret < 0)
 		scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s",
 			    ret, rid, err_str);
@@ -1589,6 +1814,7 @@ static int server_srch_get_compact(struct super_block *sb,
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_srch_compact *sc = NULL;
+	COMMIT_HOLD(hold);
 	int ret;
 
 	if (arg_len != 0) {
@@ -1602,7 +1828,7 @@ static int server_srch_get_compact(struct super_block *sb,
 		goto out;
 	}
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	mutex_lock(&server->srch_mutex);
 	ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
@@ -1630,7 +1856,7 @@ static int server_srch_get_compact(struct super_block *sb,
 	mutex_unlock(&server->srch_mutex);
 
 apply:
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 	WARN_ON_ONCE(ret < 0 && ret != -ENOENT); /* XXX leaked busy item */
 out:
 	ret = scoutfs_net_response(sb, conn, cmd, id, ret,
@@ -1656,6 +1882,7 @@ static int server_srch_commit_compact(struct super_block *sb,
 	struct scoutfs_srch_compact *sc;
 	struct scoutfs_alloc_list_head av;
 	struct scoutfs_alloc_list_head fr;
+	COMMIT_HOLD(hold);
 	int ret;
 
 	if (arg_len != sizeof(struct scoutfs_srch_compact)) {
@@ -1664,7 +1891,7 @@ static int server_srch_commit_compact(struct super_block *sb,
 	}
 	sc = arg;
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	mutex_lock(&server->srch_mutex);
 	ret = scoutfs_srch_commit_compact(sb, &server->alloc, &server->wri,
@@ -1682,7 +1909,7 @@ static int server_srch_commit_compact(struct super_block *sb,
 			      server->other_freed, &fr);
 	mutex_unlock(&server->alloc_mutex);
 apply:
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 out:
 	WARN_ON(ret < 0); /* XXX leaks allocators */
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -2047,13 +2274,14 @@ static void server_log_merge_free_work(struct work_struct *work)
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct scoutfs_log_merge_freeing fr;
 	struct scoutfs_key key;
+	COMMIT_HOLD(hold);
 	char *err_str = NULL;
 	bool commit = false;
 	int ret = 0;
 
-	/* shutdown waits for us, we'll eventually load set shutting_down */
-	while (!server->shutting_down) {
-		scoutfs_server_hold_commit(sb);
+	while (!server_is_stopping(server)) {
+
+		server_hold_commit(sb, &hold);
 		mutex_lock(&server->logs_mutex);
 		commit = true;
 
@@ -2083,7 +2311,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 
 		ret = scoutfs_btree_free_blocks(sb, &server->alloc,
 						&server->wri, &fr.key,
-						&fr.root, 10);
+						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2);
 		if (ret < 0) {
 			err_str = "freeing log btree";
 			break;
@@ -2103,7 +2331,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 		BUG_ON(ret < 0);
 
 		mutex_unlock(&server->logs_mutex);
-		ret = scoutfs_server_apply_commit(sb, ret);
+		ret = server_apply_commit(sb, &hold, ret);
 		commit = false;
 		if (ret < 0) {
 			err_str = "looping commit del/upd freeing item";
@@ -2113,7 +2341,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 
 	if (commit) {
 		mutex_unlock(&server->logs_mutex);
-		ret = scoutfs_server_apply_commit(sb, ret);
+		ret = server_apply_commit(sb, &hold, ret);
 		if (ret < 0)
 			err_str = "final commit del/upd freeing item";
 	}
@@ -2145,6 +2373,7 @@ static int server_get_log_merge(struct super_block *sb,
 	struct scoutfs_key par_end;
 	struct scoutfs_key next_key;
 	struct scoutfs_key key;
+	COMMIT_HOLD(hold);
 	char *err_str = NULL;
 	bool ins_rng;
 	bool del_remain;
@@ -2158,7 +2387,7 @@ static int server_get_log_merge(struct super_block *sb,
 	if (arg_len != 0)
 		return -EINVAL;
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 	mutex_lock(&server->logs_mutex);
 
 restart:
@@ -2401,7 +2630,7 @@ out:
 	}
 
 	mutex_unlock(&server->logs_mutex);
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 
 	return scoutfs_net_response(sb, conn, cmd, id, ret, &req, sizeof(req));
 }
@@ -2425,6 +2654,7 @@ static int server_commit_log_merge(struct super_block *sb,
 	struct scoutfs_log_merge_status stat;
 	struct scoutfs_log_merge_range rng;
 	struct scoutfs_key key;
+	COMMIT_HOLD(hold);
 	char *err_str = NULL;
 	bool deleted = false;
 	int ret = 0;
@@ -2442,7 +2672,7 @@ static int server_commit_log_merge(struct super_block *sb,
 				   le64_to_cpu(comp->seq),
 				   le64_to_cpu(comp->flags));
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 	mutex_lock(&server->logs_mutex);
 
 	/* find the status of the current log merge */
@@ -2535,7 +2765,7 @@ out:
 	if (ret < 0 && err_str)
 		scoutfs_err(sb, "error %d committing log merge: %s", ret, err_str);
 
-	err = scoutfs_server_apply_commit(sb, ret);
+	err = server_apply_commit(sb, &hold, ret);
 	BUG_ON(ret < 0 && deleted); /* inconsistent */
 
 	if (ret == 0)
@@ -2655,6 +2885,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct scoutfs_volume_options *volopt;
+	COMMIT_HOLD(hold);
 	u64 opt;
 	u64 nr;
 	int ret = 0;
@@ -2672,7 +2903,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
 
 	mutex_lock(&server->volopt_mutex);
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	if (le64_to_cpu(volopt->set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
 		opt = le64_to_cpu(volopt->data_alloc_zone_blocks);
@@ -2703,7 +2934,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
 	}
 
 apply:
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 
 	write_seqcount_begin(&server->volopt_seqcount);
 	if (ret == 0)
@@ -2723,6 +2954,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct scoutfs_volume_options *volopt;
+	COMMIT_HOLD(hold);
 	__le64 *opt;
 	u64 bit;
 	int ret = 0;
@@ -2741,7 +2973,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
 
 	mutex_lock(&server->volopt_mutex);
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	for (i = 0, bit = 1, opt = first_valopt(&super->volopt); i < 64; i++, bit <<= 1, opt++) {
 		if (le64_to_cpu(volopt->set_bits) & bit) {
@@ -2750,7 +2982,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
 		}
 	}
 
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 
 	write_seqcount_begin(&server->volopt_seqcount);
 	if (ret == 0)
@@ -2776,6 +3008,7 @@ static int server_resize_devices(struct super_block *sb, struct scoutfs_net_conn
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct scoutfs_net_resize_devices *nrd;
+	COMMIT_HOLD(hold);
 	u64 meta_tot;
 	u64 meta_start;
 	u64 meta_len;
@@ -2794,7 +3027,7 @@ static int server_resize_devices(struct super_block *sb, struct scoutfs_net_conn
 	meta_tot = le64_to_cpu(nrd->new_total_meta_blocks);
 	data_tot = le64_to_cpu(nrd->new_total_data_blocks);
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 	mutex_lock(&server->alloc_mutex);
 
 	if (meta_tot == le64_to_cpu(super->total_meta_blocks))
@@ -2856,7 +3089,7 @@ static int server_resize_devices(struct super_block *sb, struct scoutfs_net_conn
 	ret = 0;
 unlock:
 	mutex_unlock(&server->alloc_mutex);
-	ret = scoutfs_server_apply_commit(sb, ret);
+	ret = server_apply_commit(sb, &hold, ret);
 out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
 };
@@ -3180,7 +3413,7 @@ out:
  */
 static void queue_farewell_work(struct server_info *server)
 {
-	if (!test_shutting_down(server))
+	if (!server_is_stopping(server))
 		queue_work(server->wq, &server->farewell_work);
 }
 
@@ -3210,6 +3443,7 @@ static int server_greeting(struct super_block *sb,
 	struct scoutfs_net_greeting *gr = arg;
 	struct scoutfs_net_greeting greet;
 	DECLARE_SERVER_INFO(sb, server);
+	COMMIT_HOLD(hold);
 	bool reconnecting;
 	bool first_contact;
 	bool farewell;
@@ -3237,12 +3471,12 @@ static int server_greeting(struct super_block *sb,
 	}
 
 	if (gr->server_term == 0) {
-		scoutfs_server_hold_commit(sb);
+		server_hold_commit(sb, &hold);
 
 		ret = insert_mounted_client(sb, le64_to_cpu(gr->rid), le64_to_cpu(gr->flags),
 					    &conn->peername);
 
-		ret = scoutfs_server_apply_commit(sb, ret);
+		ret = server_apply_commit(sb, &hold, ret);
 		queue_work(server->wq, &server->farewell_work);
 		if (ret < 0)
 			goto send_err;
@@ -3308,9 +3542,10 @@ struct farewell_request {
  */
 static int reclaim_rid(struct super_block *sb, u64 rid)
 {
+	COMMIT_HOLD(hold);
 	int ret;
 
-	scoutfs_server_hold_commit(sb);
+	server_hold_commit(sb, &hold);
 
 	/* delete mounted client last, recovery looks for it */
 	ret = scoutfs_lock_server_farewell(sb, rid) ?:
@@ -3320,7 +3555,7 @@ static int reclaim_rid(struct super_block *sb, u64 rid)
 	      scoutfs_omap_remove_rid(sb, rid) ?:
 	      delete_mounted_client(sb, rid);
 
-	return scoutfs_server_apply_commit(sb, ret);
+	return server_apply_commit(sb, &hold, ret);
 }
 
 /*
@@ -3693,14 +3928,14 @@ static void fence_pending_recov_worker(struct work_struct *work)
 	}
 
 	if (ret < 0)
-		scoutfs_server_abort(sb);
+		stop_server(server);
 }
 
 static void recovery_timeout(struct super_block *sb)
 {
 	DECLARE_SERVER_INFO(sb, server);
 
-	if (!test_shutting_down(server))
+	if (!server_is_stopping(server))
 		queue_work(server->wq, &server->fence_pending_recov_work);
 }
 
@@ -3765,7 +4000,7 @@ out:
|
||||
|
||||
static void queue_reclaim_work(struct server_info *server, unsigned long delay)
|
||||
{
|
||||
if (!test_shutting_down(server))
|
||||
if (!server_is_stopping(server))
|
||||
queue_delayed_work(server->wq, &server->reclaim_dwork, delay);
|
||||
}
|
||||
|
||||
@@ -3800,7 +4035,7 @@ static void reclaim_worker(struct work_struct *work)
|
||||
if (error == true) {
|
||||
scoutfs_err(sb, "saw error indicator on fence request for rid %016llx, shutting down server",
|
||||
rid);
|
||||
scoutfs_server_abort(sb);
|
||||
stop_server(server);
|
||||
ret = -ESHUTDOWN;
|
||||
goto out;
|
||||
}
|
||||
@@ -3809,7 +4044,7 @@ static void reclaim_worker(struct work_struct *work)
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "failure to reclaim fenced rid %016llx: err %d, shutting down server",
|
||||
rid, ret);
|
||||
scoutfs_server_abort(sb);
|
||||
stop_server(server);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -3817,16 +4052,7 @@ static void reclaim_worker(struct work_struct *work)
|
||||
scoutfs_fence_free(sb, rid);
|
||||
scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_ALL);
|
||||
|
||||
/* tell quorum we've finished fencing all previous leaders */
|
||||
if (reason == SCOUTFS_FENCE_QUORUM_BLOCK_LEADER &&
|
||||
!scoutfs_fence_reason_pending(sb, reason)) {
|
||||
ret = scoutfs_quorum_fence_complete(sb, server->term);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
/* queue next reclaim immediately if we're making progress */
|
||||
if (ret == 0)
|
||||
@@ -3942,12 +4168,12 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
scoutfs_net_listen(sb, conn);
|
||||
|
||||
scoutfs_info(sb, "server ready at "SIN_FMT, SIN_ARG(&sin));
|
||||
complete(&server->start_comp);
|
||||
server_up(server);
|
||||
|
||||
queue_reclaim_work(server, 0);
|
||||
|
||||
/* interruptible mostly to avoid stuck messages */
|
||||
wait_event_interruptible(server->waitq, test_shutting_down(server));
|
||||
wait_event_interruptible(server->waitq, server_is_stopping(server));
|
||||
|
||||
shutdown:
|
||||
scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin));
|
||||
@@ -3981,60 +4207,44 @@ out:
|
||||
scoutfs_fence_stop(sb);
|
||||
scoutfs_net_free_conn(sb, conn);
|
||||
|
||||
/* let quorum know that we've shutdown */
|
||||
scoutfs_quorum_server_shutdown(sb, server->term);
|
||||
server_down(server);
|
||||
|
||||
scoutfs_info(sb, "server stopped at "SIN_FMT, SIN_ARG(&sin));
|
||||
trace_scoutfs_server_work_exit(sb, 0, ret);
|
||||
|
||||
server->err = ret;
|
||||
complete(&server->start_comp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for the server to successfully start. If this returns error then
|
||||
* the super block's fence_term has been set to the new server's term so
|
||||
* that it won't be fenced.
|
||||
* Start the server but don't wait for it to complete.
|
||||
*/
|
||||
int scoutfs_server_start(struct super_block *sb, u64 term)
|
||||
void scoutfs_server_start(struct super_block *sb, u64 term)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
server->err = 0;
|
||||
set_shutting_down(server, false);
|
||||
server->term = term;
|
||||
init_completion(&server->start_comp);
|
||||
|
||||
queue_work(server->wq, &server->work);
|
||||
|
||||
wait_for_completion(&server->start_comp);
|
||||
return server->err;
|
||||
if (cmpxchg(&server->status, SERVER_DOWN, SERVER_STARTING) == SERVER_DOWN) {
|
||||
server->term = term;
|
||||
queue_work(server->wq, &server->work);
|
||||
}
|
||||
}
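
The cmpxchg() is what makes starting idempotent: only the caller that atomically moves status from SERVER_DOWN to SERVER_STARTING queues the worker, so racing or repeated start requests become no-ops. A minimal sketch of the pattern, with a hypothetical status enum (only SERVER_DOWN and SERVER_STARTING are visible in this hunk; the server_up() and server_down() calls in the worker above suggest the remaining states but their names are assumptions here):

    /* hypothetical sketch; state names beyond DOWN/STARTING are assumed */
    enum server_status { SERVER_DOWN, SERVER_STARTING, SERVER_UP, SERVER_STOPPING };

    static void start_once(struct server_info *server, u64 term)
    {
            /* atomic test-and-transition: exactly one racing caller wins */
            if (cmpxchg(&server->status, SERVER_DOWN, SERVER_STARTING) == SERVER_DOWN) {
                    server->term = term;
                    queue_work(server->wq, &server->work);
            }
    }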

/*
 * Start shutdown on the server but don't wait for it to finish.
 */
void scoutfs_server_abort(struct super_block *sb)
{
        DECLARE_SERVER_INFO(sb, server);

        stop_server(server);
}

/*
 * Once the server is stopped we give the caller our election info
 * which might have been modified while we were running.
 */
void scoutfs_server_stop(struct super_block *sb)
{
        DECLARE_SERVER_INFO(sb, server);

        stop_server(server);
}

        cancel_work_sync(&server->work);
        cancel_work_sync(&server->farewell_work);
        cancel_work_sync(&server->commit_work);
        cancel_work_sync(&server->log_merge_free_work);
/*
 * Start shutdown on the server and wait for it to finish.
 */
void scoutfs_server_stop_wait(struct super_block *sb)
{
        DECLARE_SERVER_INFO(sb, server);

        stop_server(server);
        flush_work_sync(&server->work);
}
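
The stop()/stop_wait() split lets callers choose between a fire-and-forget stop and a synchronous one. A plausible teardown ordering under that split, assuming the unmount path uses these entry points in this order (the actual caller isn't shown in this diff):

    /* hypothetical caller; both functions are declared in the header hunk below */
    scoutfs_server_stop_wait(sb);   /* flag stopping, then flush_work_sync() the worker */
    scoutfs_server_destroy(sb);     /* presumably safe only once the worker has exited */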

int scoutfs_server_setup(struct super_block *sb)
@@ -4050,8 +4260,8 @@ int scoutfs_server_setup(struct super_block *sb)
        spin_lock_init(&server->lock);
        init_waitqueue_head(&server->waitq);
        INIT_WORK(&server->work, scoutfs_server_worker);
        init_rwsem(&server->commit_rwsem);
        init_llist_head(&server->commit_waiters);
        server->status = SERVER_DOWN;
        init_commit_users(&server->cusers);
        INIT_WORK(&server->commit_work, scoutfs_server_commit_func);
        INIT_LIST_HEAD(&server->clients);
        spin_lock_init(&server->farewell_lock);

@@ -64,8 +64,6 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
                                 struct scoutfs_net_lock *nl);
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
                                        struct scoutfs_key *key);
void scoutfs_server_hold_commit(struct super_block *sb);
int scoutfs_server_apply_commit(struct super_block *sb, int err);
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);

int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
@@ -77,9 +75,12 @@ u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);

int scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_stop(struct super_block *sb);
void scoutfs_server_stop_wait(struct super_block *sb);
bool scoutfs_server_is_running(struct super_block *sb);
bool scoutfs_server_is_up(struct super_block *sb);
bool scoutfs_server_is_down(struct super_block *sb);

int scoutfs_server_setup(struct super_block *sb);
void scoutfs_server_destroy(struct super_block *sb);

@@ -37,6 +37,15 @@ struct attr_funcs {
#define ATTR_FUNCS_RO(_name) \
        static struct attr_funcs _name##_attr_funcs = __ATTR_RO(_name)

static ssize_t data_device_maj_min_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
        struct super_block *sb = KOBJ_TO_SB(kobj, sb_id_kobj);

        return snprintf(buf, PAGE_SIZE, "%u:%u\n",
                        MAJOR(sb->s_bdev->bd_dev), MINOR(sb->s_bdev->bd_dev));
}
ATTR_FUNCS_RO(data_device_maj_min);
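
The new data_device_maj_min attribute exposes the data device's major:minor pair, so a read returns something like 253:3 (the exact numbers depend on the host's device enumeration). The rewritten fenced-local-force-unmount.sh later in this diff reads it and passes it to findmnt -S to locate every local mount backed by that device.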

static ssize_t format_version_show(struct kobject *kobj, struct attribute *attr,
                                   char *buf)
{
@@ -101,6 +110,7 @@ static ssize_t attr_funcs_show(struct kobject *kobj, struct attribute *attr,


static struct attribute *sb_id_attrs[] = {
        &data_device_maj_min_attr_funcs.attr,
        &format_version_attr_funcs.attr,
        &fsid_attr_funcs.attr,
        &rid_attr_funcs.attr,

289 kmod/src/xattr.c
@@ -57,12 +57,6 @@ static u32 xattr_names_equal(const char *a_name, unsigned int a_len,
        return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
}

static unsigned int xattr_full_bytes(struct scoutfs_xattr *xat)
{
        return offsetof(struct scoutfs_xattr,
                        name[xat->name_len + le16_to_cpu(xat->val_len)]);
}

static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
{
        return SCOUTFS_XATTR_NR_PARTS(xat->name_len,
@@ -137,12 +131,29 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
}

/*
 * Find the next xattr and copy the key, xattr header, and as much of
 * the name and value into the caller's buffer as we can. Returns the
 * number of bytes copied which include the header, name, and value and
 * can be limited by the xattr length or the caller's buffer. The caller
 * is responsible for comparing their lengths, the header, and the
 * returned length before safely using the xattr.
 * xattrs are stored in multiple items. The first item is a
 * concatenation of an initial header, the name, and then as much of the
 * value as fits in the remainder of the first item. This returns the
 * size of the first item that'd store an xattr with the given name
 * length and value payload size.
 */
static int first_item_bytes(int name_len, size_t size)
{
        if (WARN_ON_ONCE(name_len <= 0) ||
            WARN_ON_ONCE(name_len > SCOUTFS_XATTR_MAX_NAME_LEN))
                return 0;

        return min_t(int, sizeof(struct scoutfs_xattr) + name_len + size,
                     SCOUTFS_XATTR_MAX_PART_SIZE);
}
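
A worked example of the sizing, with illustrative numbers (the real sizeof(struct scoutfs_xattr) and SCOUTFS_XATTR_MAX_PART_SIZE values aren't shown in this diff):

    /* assume a 16-byte header and a 1024-byte part size cap, for illustration */
    first_item_bytes(10, 100);   /* 16 + 10 + 100 = 126, under the cap: returns 126 */
    first_item_bytes(10, 4000);  /* 16 + 10 + 4000 = 4026, over the cap: returns
                                    1024; the remaining value bytes live in the
                                    items for parts 1 and up */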

/*
 * Find the next xattr, set the caller's key, and copy as much of the
 * first item into the caller's buffer as we can. Returns the number of
 * bytes copied which can include the header, name, and start of the
 * value from the first item. The caller is responsible for comparing
 * their lengths, the header, and the returned length before safely
 * using the buffer.
 *
 * If a name is provided then we'll iterate over items with a matching
 * name_hash until we find a matching name. If we don't find a matching
@@ -154,20 +165,17 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
 * Returns -ENOENT if it didn't find a next item.
 */
static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
                          struct scoutfs_xattr *xat, unsigned int bytes,
                          struct scoutfs_xattr *xat, unsigned int xat_bytes,
                          const char *name, unsigned int name_len,
                          u64 name_hash, u64 id, struct scoutfs_lock *lock)
{
        struct super_block *sb = inode->i_sb;
        struct scoutfs_key last;
        u8 last_part;
        int total;
        u8 part;
        int ret;

        /* need to be able to see the name we're looking for */
        if (WARN_ON_ONCE(name_len > 0 && bytes < offsetof(struct scoutfs_xattr,
                                                          name[name_len])))
        if (WARN_ON_ONCE(name_len > 0 &&
                         xat_bytes < offsetof(struct scoutfs_xattr, name[name_len])))
                return -EINVAL;

        if (name_len)
@@ -176,26 +184,15 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
        init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
        init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);

        last_part = 0;
        part = 0;
        total = 0;

        for (;;) {
                key->skx_part = part;
                ret = scoutfs_item_next(sb, key, &last,
                                        (void *)xat + total, bytes - total,
                                        lock);
                if (ret < 0) {
                        /* XXX corruption, ran out of parts */
                        if (ret == -ENOENT && part > 0)
                                ret = -EIO;
                ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
                if (ret < 0)
                        break;
                }

                trace_scoutfs_xattr_get_next_key(sb, key);

                /* XXX corruption */
                if (key->skx_part != part) {
                if (key->skx_part != 0) {
                        ret = -EIO;
                        break;
                }
@@ -205,8 +202,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
                 * the first part and if the next xattr name fits in our
                 * buffer then the item must have included it.
                 */
                if (part == 0 &&
                    (ret < sizeof(struct scoutfs_xattr) ||
                if ((ret < sizeof(struct scoutfs_xattr) ||
                     (xat->name_len <= name_len &&
                      ret < offsetof(struct scoutfs_xattr,
                                     name[xat->name_len])) ||
@@ -216,7 +212,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
                        break;
                }

                if (part == 0 && name_len) {
                if (name_len > 0) {
                        /* ran out of names that could match */
                        if (le64_to_cpu(key->skx_name_hash) != name_hash) {
                                ret = -ENOENT;
@@ -224,64 +220,126 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
                        }

                        /* keep looking for our name */
                        if (!xattr_names_equal(name, name_len,
                                               xat->name, xat->name_len)) {
                                part = 0;
                        if (!xattr_names_equal(name, name_len, xat->name, xat->name_len)) {
                                le64_add_cpu(&key->skx_id, 1);
                                continue;
                        }

                        /* use the matching name we found */
                        last_part = xattr_nr_parts(xat) - 1;
                }

                total += ret;
                if (total == bytes || part == last_part) {
                        /* copied as much as we could */
                        ret = total;
                        break;
                }
                part++;
                /* found next name */
                break;
        }

        return ret;
}
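
With the per-part loop gone, get_next_xattr() reads at most one first item per name it considers; callers that need the rest of the value follow up with copy_xattr_value() below. Condensed from the scoutfs_getxattr() hunk later in this diff, the pair is used roughly like this:

    ret = get_next_xattr(inode, &key, xat, xat_bytes, name, name_len, 0, 0, lck);
    if (ret >= 0)
            ret = copy_xattr_value(sb, &key, xat, xat_bytes, buffer, size, lck);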

/*
 * The caller has already read and verified the xattr's first item.
 * Copy the value from the tail of the first item and from any future
 * items into the destination buffer.
 */
static int copy_xattr_value(struct super_block *sb, struct scoutfs_key *xat_key,
                            struct scoutfs_xattr *xat, int xat_bytes,
                            char *buffer, size_t size,
                            struct scoutfs_lock *lock)
{
        struct scoutfs_key key;
        size_t copied = 0;
        int val_tail;
        int bytes;
        int ret;
        int i;

        /* must have first item up to value */
        if (WARN_ON_ONCE(xat_bytes < sizeof(struct scoutfs_xattr)) ||
            WARN_ON_ONCE(xat_bytes < offsetof(struct scoutfs_xattr, name[xat->name_len])))
                return -EINVAL;

        /* only ever copy up to the full value */
        size = min_t(size_t, size, le16_to_cpu(xat->val_len));

        /* must have full first item if caller needs value from second item */
        val_tail = SCOUTFS_XATTR_MAX_PART_SIZE -
                   offsetof(struct scoutfs_xattr, name[xat->name_len]);
        if (WARN_ON_ONCE(size > val_tail && xat_bytes != SCOUTFS_XATTR_MAX_PART_SIZE))
                return -EINVAL;

        /* copy from tail of first item */
        bytes = min_t(unsigned int, size, val_tail);
        if (bytes > 0) {
                memcpy(buffer, &xat->name[xat->name_len], bytes);
                copied += bytes;
        }

        key = *xat_key;
        for (i = 1; copied < size; i++) {
                key.skx_part = i;
                bytes = min_t(unsigned int, size - copied, SCOUTFS_XATTR_MAX_PART_SIZE);

                ret = scoutfs_item_lookup(sb, &key, buffer + copied, bytes, lock);
                if (ret >= 0 && ret != bytes)
                        ret = -EIO;
                if (ret < 0)
                        return ret;

                copied += ret;
        }

        return copied;
}
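
The val_tail math splits the copy: the first item holds whatever value bytes fit after the header and name, and each later part is a full SCOUTFS_XATTR_MAX_PART_SIZE item. Reusing the illustrative 16-byte header and 1024-byte cap from above, for name_len = 10 and val_len = 3000:

    /* val_tail = 1024 - (16 + 10) = 998 value bytes come from the first item */
    /* part 1 then supplies value bytes 998..2021, part 2 bytes 2022..2999    */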

/*
 * The caller is working with items that are either in the allocated
 * first compound item or further items that are offsets into a value
 * buffer. Give them a pointer and length of the start of the item.
 */
static void xattr_item_part_buffer(void **buf, int *len, int part,
                                   struct scoutfs_xattr *xat, unsigned int xat_bytes,
                                   const char *value, size_t size)
{
        int off;

        if (part == 0) {
                *buf = xat;
                *len = xat_bytes;
        } else {
                off = (part * SCOUTFS_XATTR_MAX_PART_SIZE) -
                      offsetof(struct scoutfs_xattr, name[xat->name_len]);
                BUG_ON(off >= size); /* calls limited by number of parts */
                *buf = (void *)value + off;
                *len = min_t(size_t, size - off, SCOUTFS_XATTR_MAX_PART_SIZE);
        }
}
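
The off calculation is the inverse of that split: part i begins i * SCOUTFS_XATTR_MAX_PART_SIZE bytes into the logical xattr, and subtracting the header-plus-name prefix turns that into an offset into the caller's value buffer. With the same illustrative numbers, part 1 maps to off = 1024 - (16 + 10) = 998, i.e. value[998], exactly where the first item's tail of the value stopped.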

/*
 * Create all the items associated with the given xattr. If this
 * returns an error it will have already cleaned up any items it created
 * before seeing the error.
 */
static int create_xattr_items(struct inode *inode, u64 id,
                              struct scoutfs_xattr *xat, unsigned int bytes,
static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr *xat,
                              int xat_bytes, const char *value, size_t size, u8 new_parts,
                              struct scoutfs_lock *lock)
{
        struct super_block *sb = inode->i_sb;
        struct scoutfs_key key;
        unsigned int part_bytes;
        unsigned int total;
        int ret;
        int ret = 0;
        void *buf;
        int len;
        int i;

        init_xattr_key(&key, scoutfs_ino(inode),
                       xattr_name_hash(xat->name, xat->name_len), id);

        total = 0;
        ret = 0;
        while (total < bytes) {
                part_bytes = min_t(unsigned int, bytes - total,
                                   SCOUTFS_XATTR_MAX_PART_SIZE);
        for (i = 0; i < new_parts; i++) {
                key.skx_part = i;
                xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);

                ret = scoutfs_item_create(sb, &key,
                                          (void *)xat + total, part_bytes,
                                          lock);
                if (ret) {
                ret = scoutfs_item_create(sb, &key, buf, len, lock);
                if (ret < 0) {
                        while (key.skx_part-- > 0)
                                scoutfs_item_delete(sb, &key, lock);
                        break;
                }

                total += part_bytes;
                key.skx_part++;
        }

        return ret;
@@ -329,20 +387,20 @@ out:
 * deleted items.
 */
static int change_xattr_items(struct inode *inode, u64 id,
                              struct scoutfs_xattr *new_xat,
                              unsigned int new_bytes, u8 new_parts,
                              u8 old_parts, struct scoutfs_lock *lock)
                              struct scoutfs_xattr *xat, int xat_bytes,
                              const char *value, size_t size,
                              u8 new_parts, u8 old_parts, struct scoutfs_lock *lock)
{
        struct super_block *sb = inode->i_sb;
        struct scoutfs_key key;
        int last_created = -1;
        int bytes;
        int off;
        void *buf;
        int len;
        int i;
        int ret;

        init_xattr_key(&key, scoutfs_ino(inode),
                       xattr_name_hash(new_xat->name, new_xat->name_len), id);
                       xattr_name_hash(xat->name, xat->name_len), id);

        /* dirty existing old items */
        for (i = 0; i < old_parts; i++) {
@@ -354,13 +412,10 @@ static int change_xattr_items(struct inode *inode, u64 id,

        /* create any new items past the old */
        for (i = old_parts; i < new_parts; i++) {
                off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
                bytes = min_t(unsigned int, new_bytes - off,
                              SCOUTFS_XATTR_MAX_PART_SIZE);

                key.skx_part = i;
                ret = scoutfs_item_create(sb, &key, (void *)new_xat + off,
                                          bytes, lock);
                xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);

                ret = scoutfs_item_create(sb, &key, buf, len, lock);
                if (ret)
                        goto out;

@@ -369,13 +424,10 @@ static int change_xattr_items(struct inode *inode, u64 id,

        /* update dirtied overlapping existing items, last partial first */
        for (i = min(old_parts, new_parts) - 1; i >= 0; i--) {
                off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
                bytes = min_t(unsigned int, new_bytes - off,
                              SCOUTFS_XATTR_MAX_PART_SIZE);

                key.skx_part = i;
                ret = scoutfs_item_update(sb, &key, (void *)new_xat + off,
                                          bytes, lock);
                xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);

                ret = scoutfs_item_update(sb, &key, buf, len, lock);
                /* only last partial can fail, then we unwind created */
                if (ret < 0)
                        goto out;
@@ -412,7 +464,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
        struct scoutfs_xattr *xat = NULL;
        struct scoutfs_lock *lck = NULL;
        struct scoutfs_key key;
        unsigned int bytes;
        unsigned int xat_bytes;
        size_t name_len;
        int ret;

@@ -423,9 +475,8 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
        if (name_len > SCOUTFS_XATTR_MAX_NAME_LEN)
                return -ENODATA;

        /* only need enough for caller's name and value sizes */
        bytes = sizeof(struct scoutfs_xattr) + name_len + size;
        xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
        xat_bytes = first_item_bytes(name_len, size);
        xat = kmalloc(xat_bytes, GFP_NOFS);
        if (!xat)
                return -ENOMEM;

@@ -435,40 +486,32 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,

        down_read(&si->xattr_rwsem);

        ret = get_next_xattr(inode, &key, xat, bytes,
                             name, name_len, 0, 0, lck);

        up_read(&si->xattr_rwsem);
        scoutfs_unlock(sb, lck, SCOUTFS_LOCK_READ);
        ret = get_next_xattr(inode, &key, xat, xat_bytes, name, name_len, 0, 0, lck);

        if (ret < 0) {
                if (ret == -ENOENT)
                        ret = -ENODATA;
                goto out;
                goto unlock;
        }

        /* the caller just wants to know the size */
        if (size == 0) {
                ret = le16_to_cpu(xat->val_len);
                goto out;
                goto unlock;
        }

        /* the caller's buffer wasn't big enough */
        if (size < le16_to_cpu(xat->val_len)) {
                ret = -ERANGE;
                goto out;
                goto unlock;
        }

        /* XXX corruption, the items didn't match the header */
        if (ret < xattr_full_bytes(xat)) {
                ret = -EIO;
                goto out;
        }

        ret = le16_to_cpu(xat->val_len);
        memcpy(buffer, &xat->name[xat->name_len], ret);
        ret = copy_xattr_value(sb, &key, xat, xat_bytes, buffer, size, lck);
unlock:
        up_read(&si->xattr_rwsem);
        scoutfs_unlock(sb, lck, SCOUTFS_LOCK_READ);
out:
        vfree(xat);
        kfree(xat);
        return ret;
}

@@ -596,7 +639,8 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
        bool undo_totl = false;
        LIST_HEAD(ind_locks);
        u8 found_parts;
        unsigned int bytes;
        unsigned int xat_bytes_totl;
        unsigned int xat_bytes;
        unsigned int val_len;
        u64 ind_seq;
        u64 total;
@@ -629,9 +673,12 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
        if (tgs.totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0))
                return ret;

        bytes = sizeof(struct scoutfs_xattr) + name_len + size;
        /* alloc enough to read old totl value */
        xat = __vmalloc(bytes + SCOUTFS_XATTR_MAX_TOTL_U64, GFP_NOFS, PAGE_KERNEL);
        /* allocate enough to always read an existing xattr's totl */
        xat_bytes_totl = first_item_bytes(name_len,
                                          max_t(size_t, size, SCOUTFS_XATTR_MAX_TOTL_U64));
        /* but store partial first item that only includes the new xattr's value */
        xat_bytes = first_item_bytes(name_len, size);
        xat = kmalloc(xat_bytes_totl, GFP_NOFS);
        if (!xat) {
                ret = -ENOMEM;
                goto out;
@@ -645,9 +692,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
        down_write(&si->xattr_rwsem);

        /* find an existing xattr to delete, including possible totl value */
        ret = get_next_xattr(inode, &key, xat,
                             sizeof(struct scoutfs_xattr) + name_len + SCOUTFS_XATTR_MAX_TOTL_U64,
                             name, name_len, 0, 0, lck);
        ret = get_next_xattr(inode, &key, xat, xat_bytes_totl, name, name_len, 0, 0, lck);
        if (ret < 0 && ret != -ENOENT)
                goto unlock;

@@ -683,7 +728,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
                le64_add_cpu(&tval.total, -total);
        }

        /* prepare our xattr */
        /* prepare the xattr header, name, and start of value in first item */
        if (value) {
                if (found_parts)
                        id = le64_to_cpu(key.skx_id);
@@ -693,7 +738,9 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
                xat->val_len = cpu_to_le16(size);
                memset(xat->__pad, 0, sizeof(xat->__pad));
                memcpy(xat->name, name, name_len);
                memcpy(&xat->name[xat->name_len], value, size);
                memcpy(&xat->name[name_len], value,
                       min(size, SCOUTFS_XATTR_MAX_PART_SIZE -
                                 offsetof(struct scoutfs_xattr, name[name_len])));

                if (tgs.totl) {
                        ret = parse_totl_u64(value, size, &total);
@@ -741,14 +788,15 @@ retry:
        }

        if (found_parts && value)
                ret = change_xattr_items(inode, id, xat, bytes,
                ret = change_xattr_items(inode, id, xat, xat_bytes, value, size,
                                         xattr_nr_parts(xat), found_parts, lck);
        else if (found_parts)
                ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
                                         le64_to_cpu(key.skx_id), found_parts,
                                         lck);
        else
                ret = create_xattr_items(inode, id, xat, bytes, lck);
                ret = create_xattr_items(inode, id, xat, xat_bytes, value, size,
                                         xattr_nr_parts(xat), lck);
        if (ret < 0)
                goto release;

@@ -778,7 +826,7 @@ unlock:
        scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
        scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY);
out:
        vfree(xat);
        kfree(xat);

        return ret;
}
@@ -807,7 +855,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
        struct scoutfs_xattr *xat = NULL;
        struct scoutfs_lock *lck = NULL;
        struct scoutfs_key key;
        unsigned int bytes;
        unsigned int xat_bytes;
        ssize_t total = 0;
        u32 name_hash = 0;
        bool is_hidden;
@@ -820,8 +868,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
        id = *id_pos;

        /* need a buffer large enough for all possible names */
        bytes = sizeof(struct scoutfs_xattr) + SCOUTFS_XATTR_MAX_NAME_LEN;
        xat = kmalloc(bytes, GFP_NOFS);
        xat_bytes = first_item_bytes(SCOUTFS_XATTR_MAX_NAME_LEN, 0);
        xat = kmalloc(xat_bytes, GFP_NOFS);
        if (!xat) {
                ret = -ENOMEM;
                goto out;
@@ -834,8 +882,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
        down_read(&si->xattr_rwsem);

        for (;;) {
                ret = get_next_xattr(inode, &key, xat, bytes,
                                     NULL, 0, name_hash, id, lck);
                ret = get_next_xattr(inode, &key, xat, xat_bytes, NULL, 0, name_hash, id, lck);
                if (ret < 0) {
                        if (ret == -ENOENT)
                                ret = total;

@@ -1,5 +1,18 @@
#!/usr/bin/bash

#
# This fencing script is used for testing clusters of multiple mounts on
# a single host. It finds mounts to fence by looking for their rids and
# only knows how to "fence" by using forced unmount.
#

echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"

log() {
        echo "$@" > /dev/stderr
        exit 1
}

echo_fail() {
        echo "$@" > /dev/stderr
        exit 1
@@ -7,29 +20,24 @@ echo_fail() {

rid="$SCOUTFS_FENCED_REQ_RID"

#
# Look for a local mount with the rid to fence. Typically we'll at
# least find the mount with the server that requested the fence that
# we're processing. But it's possible that mounts are unmounted
# before, or while, we're running.
#
mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
        echo_fail "findmnt -t scoutfs failed" > /dev/stderr
for fs in /sys/fs/scoutfs/*; do
        [ ! -d "$fs" ] && continue

for mnt in $mnts; do
        mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
                echo_fail "scoutfs statfs $mnt failed"

        if [ "$mnt_rid" == "$rid" ]; then
                umount -f "$mnt" || \
                        echo_fail "umount -f $mnt"

                exit 0
        fs_rid="$(cat $fs/rid)" || \
                echo_fail "failed to get rid in $fs"
        if [ "$fs_rid" != "$rid" ]; then
                continue
        fi

        nr="$(cat $fs/data_device_maj_min)" || \
                echo_fail "failed to get data device major:minor in $fs"

        mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
                echo_fail "findmnt -t scoutfs -S $nr failed"
        for mnt in $mnts; do
                umount -f "$mnt" || \
                        echo_fail "umount -f $mnt failed"
        done
done

#
# If the mount doesn't exist on this host then it can't access the
# devices by definition and can be considered fenced.
#
exit 0

@@ -75,6 +75,20 @@ t_fs_nrs()
        seq 0 $((T_NR_MOUNTS - 1))
}

#
# outputs "1" if the fs number has "1" in its quorum/is_leader file.
# All other cases output 0, including the fs nr being a client which
# won't have a quorum/ dir.
#
t_fs_is_leader()
{
        if [ "$(cat $(t_sysfs_path $1)/quorum/is_leader 2>/dev/null)" == "1" ]; then
                echo "1"
        else
                echo "0"
        fi
}

#
# Output the mount nr of the current server. This takes no steps to
# ensure that the server doesn't shut down and have some other mount
@@ -83,7 +97,7 @@ t_fs_nrs()
t_server_nr()
{
        for i in $(t_fs_nrs); do
                if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
                if [ "$(t_fs_is_leader $i)" == "1" ]; then
                        echo $i
                        return
                fi
@@ -101,7 +115,7 @@ t_server_nr()
t_first_client_nr()
{
        for i in $(t_fs_nrs); do
                if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
                if [ "$(t_fs_is_leader $i)" == "0" ]; then
                        echo $i
                        return
                fi

3 tests/golden/lock-recover-invalidate (Normal file)
@@ -0,0 +1,3 @@
== starting background invalidating read/write load
== 60s of lock recovery during invalidating load
== stopping background load
0 tests/golden/lock-rever-invalidate (Normal file)
@@ -380,13 +380,14 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
# Build a fenced config that runs scripts out of the repository rather
# than the default system directory
#
conf="$T_RESULTS/scoutfs-fencd.conf"
conf="$T_RESULTS/scoutfs-fenced.conf"
cat > $conf << EOF
SCOUTFS_FENCED_DELAY=1
SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh
SCOUTFS_FENCED_RUN_ARGS=""
SCOUTFS_FENCED_RUN_ARGS="ignored run args"
EOF
export SCOUTFS_FENCED_CONFIG_FILE="$conf"
T_FENCED_LOG="$T_RESULTS/fenced.log"

#
# Run the agent in the background, log its output, and kill it if we
@@ -394,7 +395,7 @@ export SCOUTFS_FENCED_CONFIG_FILE="$conf"
#
fenced_log()
{
        echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
        echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"
}
fenced_pid=""
kill_fenced()
@@ -405,7 +406,7 @@ kill_fenced()
        fi
}
trap kill_fenced EXIT
$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &
$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"

@@ -17,6 +17,7 @@ lock-refleak.sh
lock-shrink-consistency.sh
lock-pr-cw-conflict.sh
lock-revoke-getcwd.sh
lock-recover-invalidate.sh
export-lookup-evict-race.sh
createmany-parallel.sh
createmany-large-names.sh

@@ -45,6 +45,18 @@ check_read_write()
        fi
}

# verify that fenced ran our testing fence script
verify_fenced_run()
{
        local rids="$@"
        local rid

        for rid in $rids; do
                grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \
                        t_fail "fenced didn't execute RUN script for rid $rid"
        done
}

echo "== make sure all mounts can see each other"
check_read_write

@@ -62,12 +74,14 @@ done
while t_rid_is_fencing $rid; do
        sleep .5
done
verify_fenced_run $rid
t_mount $cl
check_read_write

echo "== force unmount all non-server, connection timeout, fence nop, mount"
sv=$(t_server_nr)
pattern="nonsense"
rids=""
sync
for cl in $(t_fs_nrs); do
        if [ $cl == $sv ]; then
@@ -75,6 +89,7 @@ for cl in $(t_fs_nrs); do
        fi

        rid=$(t_mount_rid $cl)
        rids="$rids $rid"
        pattern="$pattern|$rid"
        echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"

@@ -89,6 +104,7 @@ done
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
        sleep .5
done
verify_fenced_run $rids
# remount all the clients
for cl in $(t_fs_nrs); do
        if [ $cl == $sv ]; then
@@ -109,11 +125,17 @@ t_wait_for_leader
while t_rid_is_fencing $rid; do
        sleep .5
done
verify_fenced_run $rid
t_mount $sv
check_read_write

echo "== force unmount everything, new server fences all previous"
sync
rids=""
# get rids before forced unmount breaks scoutfs statfs
for nr in $(t_fs_nrs); do
        rids="$rids $(t_mount_rid $nr)"
done
for nr in $(t_fs_nrs); do
        t_force_umount $nr
done
@@ -122,6 +144,7 @@ t_mount_all
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
        sleep .5
done
verify_fenced_run $rids
check_read_write

t_pass

43 tests/tests/lock-recover-invalidate.sh (Normal file)
@@ -0,0 +1,43 @@
#
# trigger server failover and lock recovery during heavy invalidating
# load on multiple mounts
#

majority_nr=$(t_majority_count)
quorum_nr=$T_QUORUM

test "$quorum_nr" == "$majority_nr" && \
        t_skip "need remaining majority when leader unmounted"

test "$T_NR_MOUNTS" -lt "$((quorum_nr + 2))" && \
        t_skip "need at least 2 non-quorum load mounts"

echo "== starting background invalidating read/write load"
touch "$T_D0/file"
load_pids=""
for i in $(t_fs_nrs); do
        if [ "$i" -ge "$quorum_nr" ]; then
                eval path="\$T_D${i}/file"

                (while true; do touch $path > /dev/null 2>&1; done) &
                load_pids="$load_pids $!"
                (while true; do stat $path > /dev/null 2>&1; done) &
                load_pids="$load_pids $!"
        fi
done

# had it reproduce in ~40s on wimpy debug kernel guests
LENGTH=60
echo "== ${LENGTH}s of lock recovery during invalidating load"
END=$((SECONDS + LENGTH))
while [ "$SECONDS" -lt "$END" ]; do
        sv=$(t_server_nr)
        t_umount $sv
        t_mount $sv
        # new server had to process greeting for mount to finish
done

echo "== stopping background load"
kill $load_pids

t_pass
@@ -55,9 +55,21 @@ test -x "$SCOUTFS_FENCED_RUN" || \
        error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"

#
# main loop watching for fence request across all filesystems
# Main loop watching for fence request across all filesystems. The
# server can shut down without waiting for pending fence requests to
# finish. All of the interaction with the fence directory and files can
# fail at any moment. We will generate log messages when the dir or
# files disappear.
#

# generate failure messages to stderr while still echoing 0 for the caller
careful_cat()
{
        local path="$@"

        cat "$@" || echo 0
}

while sleep $SCOUTFS_FENCED_DELAY; do
        for fence in /sys/fs/scoutfs/*/fence/*; do
                # catches unmatched regex when no dirs
@@ -66,7 +78,8 @@ while sleep $SCOUTFS_FENCED_DELAY; do
                fi

                # skip requests that have been handled
                if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then
                if [ "$(careful_cat $fence/fenced)" == 1 -o \
                     "$(careful_cat $fence/error)" == 1 ]; then
                        continue
                fi

@@ -81,10 +94,10 @@ while sleep $SCOUTFS_FENCED_DELAY; do
                export SCOUTFS_FENCED_REQ_RID="$rid"
                export SCOUTFS_FENCED_REQ_IP="$ip"

                $run $SCOUTFS_FENCED_RUN_ARGS
                $SCOUTFS_FENCED_RUN $SCOUTFS_FENCED_RUN_ARGS
                rc=$?
                if [ "$rc" != 0 ]; then
                        log_message "server $srv fencing rid $rid saw error status $rc from $run"
                        log_message "server $srv fencing rid $rid saw error status $rc"
                        echo 1 > "$fence/error"
                        continue
                fi
