Fix client/server race btwn lock recov and farewell

Tear down client lock server state and set a boolean so that
there is no race between client/server processing lock recovery
at the same time as farewell.

Currently there is a bug where if server and clients are unmounted
then work from the client is processed out of order, which leaves
behind a server_lock for a RID that no longer exists.
In order to fix this we need to serialize SCOUTFS_NET_CMD_FAREWELL
in recv_worker.

Signed-off-by: Bryant G. Duffy-Ly <bduffyly@versity.com>
This commit is contained in:
Bryant G. Duffy-Ly
2022-01-03 15:12:44 -06:00
parent 8a504cd5ae
commit 61ad844891
3 changed files with 65 additions and 3 deletions

View File

@@ -675,18 +675,28 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
/*
* We want to drain the proc_workq in order to ensure that
* that the inflight lock recovery work is fully flushed out
* so that we can prevent the client/server racing trying to
* do lock recovery and processing farewell at the same time.
*/
if (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn)
drain_workqueue(conn->proc_workq);
/*
* Initial received greetings are processed
* Initial received greetings and farewell are processed
* synchronously before any other incoming messages.
*
* Incoming requests or responses to the lock client are
* called synchronously to avoid reordering.
*/
if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
(nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) ||
(nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
scoutfs_net_proc_worker(&mrecv->proc_work);
else
queue_work(conn->workq, &mrecv->proc_work);
queue_work(conn->proc_workq, &mrecv->proc_work);
}
if (ret)
@@ -851,6 +861,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
}
destroy_workqueue(conn->workq);
destroy_workqueue(conn->proc_workq);
scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
kfree(conn->info);
trace_scoutfs_conn_destroy_free(conn);
@@ -1150,6 +1161,8 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
/* wait for socket and proc work to finish, includes chained work */
drain_workqueue(conn->workq);
drain_workqueue(conn->proc_workq);
/* tear down the sock now that all work is done */
if (conn->sock) {
sock_release(conn->sock);
@@ -1347,7 +1360,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return NULL;
}
conn->workq = alloc_workqueue("scoutfs_net_%s",
conn->workq = alloc_workqueue("scoutfs_net_workq_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->workq) {
@@ -1356,6 +1369,16 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return NULL;
}
conn->proc_workq = alloc_workqueue("scoutfs_net_proc_workq_%s",
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
name_suffix);
if (!conn->proc_workq) {
destroy_workqueue(conn->workq);
kfree(conn->info);
kfree(conn);
return NULL;
}
conn->sb = sb;
conn->notify_up = notify_up;
conn->notify_down = notify_down;
@@ -1388,6 +1411,14 @@ scoutfs_net_alloc_conn(struct super_block *sb,
return conn;
}
/* Give the caller the client info of the connection. This is used by
* server processing the server_farewell, and lock response/recovery.
*/
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn)
{
return conn->info;
}
/*
* Give the caller the client rid of the connection. This used by rare
* server processing callers who want to send async responses after

View File

@@ -63,6 +63,7 @@ struct scoutfs_net_connection {
atomic64_t recv_seq;
struct workqueue_struct *workq;
struct workqueue_struct *proc_workq;
struct work_struct listen_work;
struct work_struct connect_work;
struct work_struct send_work;
@@ -115,6 +116,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
scoutfs_net_notify_t notify_up,
scoutfs_net_notify_t notify_down, size_t info_size,
scoutfs_net_request_t *req_funcs, char *name_suffix);
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn);
u64 scoutfs_net_client_rid(struct scoutfs_net_connection *conn);
int scoutfs_net_connect(struct super_block *sb,
struct scoutfs_net_connection *conn,

View File

@@ -122,6 +122,7 @@ struct server_info {
struct server_client_info {
u64 rid;
struct list_head head;
bool received_farewell;
};
static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
@@ -1505,11 +1506,15 @@ static int server_lock(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (arg_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
if (sci->received_farewell)
return scoutfs_net_response(sb, conn, cmd, id, -EINVAL, NULL, 0);
return scoutfs_lock_server_request(sb, rid, id, arg);
}
@@ -1518,11 +1523,15 @@ static int lock_response(struct super_block *sb,
void *resp, unsigned int resp_len,
int error, void *data)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (resp_len != sizeof(struct scoutfs_net_lock))
return -EINVAL;
if (sci->received_farewell)
return 0;
return scoutfs_lock_server_response(sb, rid, resp);
}
@@ -1560,11 +1569,15 @@ static int lock_recover_response(struct super_block *sb,
void *resp, unsigned int resp_len,
int error, void *data)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
u64 rid = scoutfs_net_client_rid(conn);
if (invalid_recover(resp, resp_len))
return -EINVAL;
if (sci->received_farewell)
return 0;
return scoutfs_lock_server_recover_response(sb, rid, resp);
}
@@ -3516,9 +3529,11 @@ static int server_farewell(struct super_block *sb,
struct scoutfs_net_connection *conn,
u8 cmd, u64 id, void *arg, u16 arg_len)
{
struct server_client_info *sci = scoutfs_net_client_info(conn);
struct server_info *server = SCOUTFS_SB(sb)->server_info;
u64 rid = scoutfs_net_client_rid(conn);
struct farewell_request *fw;
int ret;
if (arg_len != 0)
return -EINVAL;
@@ -3536,6 +3551,20 @@ static int server_farewell(struct super_block *sb,
list_add_tail(&fw->entry, &server->farewell_requests);
spin_unlock(&server->farewell_lock);
/*
* Tear down client lock server state and set that we recieved farewell
* to ensure that we do not race between client and server trying to process
* lock recovery at the same time (race). We also want to mark that the recovery
* finished so that if client's try to send stuff later; the server doesnt care.
*/
sci->received_farewell = true;
ret = scoutfs_lock_server_farewell(sb, rid);
if (ret < 0) {
kfree(fw);
return ret;
}
scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS);
queue_farewell_work(server);
/* response will be sent later */