Merge pull request #64 from bgly/bduffyly/quorum_race

Fix client/server race between lock recov and farewell processing
This commit is contained in:
Zach Brown
2022-01-14 09:03:00 -08:00
committed by GitHub
7 changed files with 183 additions and 58 deletions

View File

@@ -153,30 +153,30 @@ enum {
*/
static void add_client_entry(struct server_lock_node *snode,
struct list_head *list,
struct client_lock_entry *clent)
struct client_lock_entry *c_ent)
{
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
if (list_empty(&clent->head))
list_add_tail(&clent->head, list);
if (list_empty(&c_ent->head))
list_add_tail(&c_ent->head, list);
else
list_move_tail(&clent->head, list);
list_move_tail(&c_ent->head, list);
clent->on_list = list == &snode->granted ? OL_GRANTED :
c_ent->on_list = list == &snode->granted ? OL_GRANTED :
list == &snode->requested ? OL_REQUESTED :
OL_INVALIDATED;
}
static void free_client_entry(struct lock_server_info *inf,
struct server_lock_node *snode,
struct client_lock_entry *clent)
struct client_lock_entry *c_ent)
{
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
if (!list_empty(&clent->head))
list_del_init(&clent->head);
scoutfs_tseq_del(&inf->tseq_tree, &clent->tseq_entry);
kfree(clent);
if (!list_empty(&c_ent->head))
list_del_init(&c_ent->head);
scoutfs_tseq_del(&inf->tseq_tree, &c_ent->tseq_entry);
kfree(c_ent);
}
static bool invalid_mode(u8 mode)
@@ -339,13 +339,13 @@ static struct client_lock_entry *find_entry(struct server_lock_node *snode,
struct list_head *list,
u64 rid)
{
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));
list_for_each_entry(clent, list, head) {
if (clent->rid == rid)
return clent;
list_for_each_entry(c_ent, list, head) {
if (c_ent->rid == rid)
return c_ent;
}
return NULL;
@@ -364,7 +364,7 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
u64 net_id, struct scoutfs_net_lock *nl)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
struct server_lock_node *snode;
int ret;
@@ -376,29 +376,29 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
goto out;
}
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
if (!clent) {
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
if (!c_ent) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&clent->head);
clent->rid = rid;
clent->net_id = net_id;
clent->mode = nl->new_mode;
INIT_LIST_HEAD(&c_ent->head);
c_ent->rid = rid;
c_ent->net_id = net_id;
c_ent->mode = nl->new_mode;
snode = alloc_server_lock(inf, &nl->key);
if (snode == NULL) {
kfree(clent);
kfree(c_ent);
ret = -ENOMEM;
goto out;
}
snode->stats[SLT_REQUEST]++;
clent->snode = snode;
add_client_entry(snode, &snode->requested, clent);
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
c_ent->snode = snode;
add_client_entry(snode, &snode->requested, c_ent);
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
ret = process_waiting_requests(sb, snode);
out:
@@ -417,7 +417,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
struct server_lock_node *snode;
int ret;
@@ -438,18 +438,18 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
snode->stats[SLT_RESPONSE]++;
clent = find_entry(snode, &snode->invalidated, rid);
if (!clent) {
c_ent = find_entry(snode, &snode->invalidated, rid);
if (!c_ent) {
put_server_lock(inf, snode);
ret = -EINVAL;
goto out;
}
if (nl->new_mode == SCOUTFS_LOCK_NULL) {
free_client_entry(inf, snode, clent);
free_client_entry(inf, snode, c_ent);
} else {
clent->mode = nl->new_mode;
add_client_entry(snode, &snode->granted, clent);
c_ent->mode = nl->new_mode;
add_client_entry(snode, &snode->granted, c_ent);
}
ret = process_waiting_requests(sb, snode);
@@ -632,7 +632,7 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *existing;
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
struct server_lock_node *snode;
struct scoutfs_key key;
int ret = 0;
@@ -652,35 +652,35 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
}
for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
if (!clent) {
c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
if (!c_ent) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&clent->head);
clent->rid = rid;
clent->net_id = 0;
clent->mode = nlr->locks[i].new_mode;
INIT_LIST_HEAD(&c_ent->head);
c_ent->rid = rid;
c_ent->net_id = 0;
c_ent->mode = nlr->locks[i].new_mode;
snode = alloc_server_lock(inf, &nlr->locks[i].key);
if (snode == NULL) {
kfree(clent);
kfree(c_ent);
ret = -ENOMEM;
goto out;
}
existing = find_entry(snode, &snode->granted, rid);
if (existing) {
kfree(clent);
kfree(c_ent);
put_server_lock(inf, snode);
ret = -EEXIST;
goto out;
}
clent->snode = snode;
add_client_entry(snode, &snode->granted, clent);
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
c_ent->snode = snode;
add_client_entry(snode, &snode->granted, c_ent);
scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);
put_server_lock(inf, snode);
@@ -707,7 +707,7 @@ out:
int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
{
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
struct client_lock_entry *tmp;
struct server_lock_node *snode;
struct scoutfs_key key;
@@ -724,9 +724,9 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
(list == &snode->requested) ? &snode->invalidated :
NULL) {
list_for_each_entry_safe(clent, tmp, list, head) {
if (clent->rid == rid) {
free_client_entry(inf, snode, clent);
list_for_each_entry_safe(c_ent, tmp, list, head) {
if (c_ent->rid == rid) {
free_client_entry(inf, snode, c_ent);
freed = true;
}
}
@@ -787,15 +787,15 @@ static char *lock_on_list_string(u8 on_list)
static void lock_server_tseq_show(struct seq_file *m,
struct scoutfs_tseq_entry *ent)
{
struct client_lock_entry *clent = container_of(ent,
struct client_lock_entry *c_ent = container_of(ent,
struct client_lock_entry,
tseq_entry);
struct server_lock_node *snode = clent->snode;
struct server_lock_node *snode = c_ent->snode;
seq_printf(m, SK_FMT" %s %s rid %016llx net_id %llu\n",
SK_ARG(&snode->key), lock_mode_string(clent->mode),
lock_on_list_string(clent->on_list), clent->rid,
clent->net_id);
SK_ARG(&snode->key), lock_mode_string(c_ent->mode),
lock_on_list_string(c_ent->on_list), c_ent->rid,
c_ent->net_id);
}
static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
@@ -857,7 +857,7 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
DECLARE_LOCK_SERVER_INFO(sb, inf);
struct server_lock_node *snode;
struct server_lock_node *stmp;
struct client_lock_entry *clent;
struct client_lock_entry *c_ent;
struct client_lock_entry *ctmp;
LIST_HEAD(list);
@@ -873,8 +873,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
list_splice_init(&snode->invalidated, &list);
mutex_lock(&snode->mutex);
list_for_each_entry_safe(clent, ctmp, &list, head) {
free_client_entry(inf, snode, clent);
list_for_each_entry_safe(c_ent, ctmp, &list, head) {
free_client_entry(inf, snode, c_ent);
}
mutex_unlock(&snode->mutex);

View File

@@ -675,18 +675,28 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* We want to drain the proc_workq in order to ensure that
* that the inflight lock recovery work is fully flushed out
* so that we can prevent the client/server racing trying to
* do lock recovery and processing farewell at the same time.
*/
if (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn)
drain_workqueue(conn->proc_workq);
/*
* Initial received greetings are processed
* Initial received greetings and farewell are processed
* synchronously before any other incoming messages.
*
* Incoming requests or responses to the lock client are
* called synchronously to avoid reordering.
*/
if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
(nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) ||
(nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
scoutfs_net_proc_worker(&mrecv->proc_work);
else
queue_work(conn->workq, &mrecv->proc_work);
queue_work(conn->proc_workq, &mrecv->proc_work);
}
if (ret)
@@ -851,6 +861,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
}
destroy_workqueue(conn->workq);
destroy_workqueue(conn->proc_workq);
scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
kfree(conn->info);
trace_scoutfs_conn_destroy_free(conn);
@@ -1150,6 +1161,8 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
/* wait for socket and proc work to finish, includes chained work */
drain_workqueue(conn->workq);
drain_workqueue(conn->proc_workq);
/* tear down the sock now that all work is done */
if (conn->sock) {
sock_release(conn->sock);
@@ -1347,7 +1360,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return NULL;
}
conn->workq = alloc_workqueue("scoutfs_net_%s",
conn->workq = alloc_workqueue("scoutfs_net_workq_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->workq) {
@@ -1356,6 +1369,16 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return NULL;
}
conn->proc_workq = alloc_workqueue("scoutfs_net_proc_workq_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->proc_workq) {
destroy_workqueue(conn->workq);
kfree(conn->info);
kfree(conn);
return NULL;
}
conn->sb = sb;
conn->notify_up = notify_up;
conn->notify_down = notify_down;
@@ -1388,6 +1411,14 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return conn;
}
/* Give the caller the client info of the connection. This is used by
* server processing the server_farewell, and lock response/recovery.
*/
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn)
{
return conn->info;
}
/*
* Give the caller the client rid of the connection. This used by rare
* server processing callers who want to send async responses after

View File

@@ -63,6 +63,7 @@ struct scoutfs_net_connection {
atomic64_t recv_seq;
struct workqueue_struct *workq;
struct workqueue_struct *proc_workq;
struct work_struct listen_work;
struct work_struct connect_work;
struct work_struct send_work;
@@ -115,6 +116,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
scoutfs_net_notify_t notify_up,
scoutfs_net_notify_t notify_down, size_t info_size,
scoutfs_net_request_t *req_funcs, char *name_suffix);
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn);
u64 scoutfs_net_client_rid(struct scoutfs_net_connection *conn);
int scoutfs_net_connect(struct super_block *sb,
struct scoutfs_net_connection *conn,

View File

@@ -122,6 +122,7 @@ struct server_info {
struct server_client_info {
u64 rid;
struct list_head head;
bool received_farewell;
};
static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
@@ -1505,11 +1506,15 @@ static int server_lock(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (arg_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
if (sci->received_farewell)
return scoutfs_net_response(sb, conn, cmd, id, -EINVAL, NULL, 0);
return scoutfs_lock_server_request(sb, rid, id, arg);
}
@@ -1518,11 +1523,15 @@ static int lock_response(struct super_block *sb,
void *resp, unsigned int resp_len,
int error, void *data)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (resp_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
if (sci->received_farewell)
return 0;
return scoutfs_lock_server_response(sb, rid, resp);
}
@@ -1560,11 +1569,15 @@ static int lock_recover_response(struct super_block *sb,
void *resp, unsigned int resp_len,
int error, void *data)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (invalid_recover(resp, resp_len))
return -EINVAL;
if (sci->received_farewell)
return 0;
return scoutfs_lock_server_recover_response(sb, rid, resp);
}
@@ -3516,9 +3529,11 @@ static int server_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
struct server_info *server = SCOUTFS_SB(sb)->server_info;
u64 rid = scoutfs_net_client_rid(conn);
struct farewell_request *fw;
int ret;
if (arg_len != 0)
return -EINVAL;
@@ -3536,6 +3551,20 @@ static int server_farewell(struct super_block *sb,
list_add_tail(&fw->entry, &server->farewell_requests);
spin_unlock(&server->farewell_lock);
/*
* Tear down client lock server state and set that we recieved farewell
* to ensure that we do not race between client and server trying to process
* lock recovery at the same time (race). We also want to mark that the recovery
* finished so that if client's try to send stuff later; the server doesnt care.
*/
sci->received_farewell = true;
ret = scoutfs_lock_server_farewell(sb, rid);
if (ret < 0) {
kfree(fw);
return ret;
}
scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS);
queue_farewell_work(server);
/* response will be sent later */

View File

@@ -0,0 +1 @@
== 60s of unmounting non-quorum clients during recovery

View File

@@ -33,6 +33,7 @@ resize-devices.sh
fence-and-reclaim.sh
orphan-inodes.sh
mount-unmount-race.sh
client-unmount-recovery.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh
block-stale-reads.sh

View File

@@ -0,0 +1,61 @@
#
# Unmount Server and unmount a client as it's replaying to a remaining server
#
majority_nr=$(t_majority_count)
quorum_nr=$T_QUORUM
test "$quorum_nr" == "$majority_nr" && \
t_skip "all quorum members make up majority, need more mounts to unmount"
test "$T_NR_MOUNTS" -lt "$T_QUORUM" && \
t_skip "Need enough non-quorum clients to unmount"
for i in $(t_fs_nrs); do
mounted[$i]=1
done
LENGTH=60
echo "== ${LENGTH}s of unmounting non-quorum clients during recovery"
END=$((SECONDS + LENGTH))
while [ "$SECONDS" -lt "$END" ]; do
sv=$(t_server_nr)
rid=$(t_mount_rid $sv)
echo "sv $sv rid $rid" >> "$T_TMP.log"
sync
t_umount $sv &
for i in $(t_fs_nrs); do
if [ "$i" -ge "$quorum_nr" ]; then
t_umount $i &
echo "umount $i pid $pid quo $quorum_nr" \
>> $T_TMP.log
mounted[$i]=0
fi
done
wait
t_mount $sv &
for i in $(t_fs_nrs); do
if [ "${mounted[$i]}" == 0 ]; then
t_mount $i &
fi
done
wait
declare RID_LIST=$(cat /sys/fs/scoutfs/*/rid | sort -u)
read -a rid_arr <<< $RID_LIST
declare LOCK_LIST=$(cut -d' ' -f 5 /sys/kernel/debug/scoutfs/*/server_locks | sort -u)
read -a lock_arr <<< $LOCK_LIST
for i in "${lock_arr[@]}"; do
if [[ ! " ${rid_arr[*]} " =~ " $i " ]]; then
t_fail "RID($i): exists when not mounted"
fi
done
done
t_pass