Merge pull request #64 from bgly/bduffyly/quorum_race

Fix client/server race between lock recov and farewell processing
2026-01-08 13:01:23 +00:00 · 2022-01-14 09:03:00 -08:00
parent 2c8f5d8fc1 8db5c118c3
commit e97ea5407d
7 changed files with 183 additions and 58 deletions
--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -153,30 +153,30 @@ enum {
 */
 static void add_client_entry(struct server_lock_node *snode,
 			     struct list_head *list,
-			     struct client_lock_entry *clent)
+			     struct client_lock_entry *c_ent)
 {
 	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

-	if (list_empty(&clent->head))
-		list_add_tail(&clent->head, list);
+	if (list_empty(&c_ent->head))
+		list_add_tail(&c_ent->head, list);
 	else
-		list_move_tail(&clent->head, list);
+		list_move_tail(&c_ent->head, list);

-	clent->on_list = list == &snode->granted ? OL_GRANTED :
+	c_ent->on_list = list == &snode->granted ? OL_GRANTED :
 			 list == &snode->requested ? OL_REQUESTED :
 			 OL_INVALIDATED;
 }

 static void free_client_entry(struct lock_server_info *inf,
 			      struct server_lock_node *snode,
-			      struct client_lock_entry *clent)
+			      struct client_lock_entry *c_ent)
 {
 	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

-	if (!list_empty(&clent->head))
-		list_del_init(&clent->head);
-	scoutfs_tseq_del(&inf->tseq_tree, &clent->tseq_entry);
-	kfree(clent);
+	if (!list_empty(&c_ent->head))
+		list_del_init(&c_ent->head);
+	scoutfs_tseq_del(&inf->tseq_tree, &c_ent->tseq_entry);
+	kfree(c_ent);
 }

 static bool invalid_mode(u8 mode)
@@ -339,13 +339,13 @@ static struct client_lock_entry *find_entry(struct server_lock_node *snode,
 					    struct list_head *list,
 					    u64 rid)
 {
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;

 	WARN_ON_ONCE(!mutex_is_locked(&snode->mutex));

-	list_for_each_entry(clent, list, head) {
-		if (clent->rid == rid)
-			return clent;
+	list_for_each_entry(c_ent, list, head) {
+		if (c_ent->rid == rid)
+			return c_ent;
 	}

 	return NULL;
@@ -364,7 +364,7 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
 				u64 net_id, struct scoutfs_net_lock *nl)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;
 	struct server_lock_node *snode;
 	int ret;

@@ -376,29 +376,29 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
 		goto out;
 	}

-	clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
-	if (!clent) {
+	c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
+	if (!c_ent) {
 		ret = -ENOMEM;
 		goto out;
 	}

-	INIT_LIST_HEAD(&clent->head);
-	clent->rid = rid;
-	clent->net_id = net_id;
-	clent->mode = nl->new_mode;
+	INIT_LIST_HEAD(&c_ent->head);
+	c_ent->rid = rid;
+	c_ent->net_id = net_id;
+	c_ent->mode = nl->new_mode;

 	snode = alloc_server_lock(inf, &nl->key);
 	if (snode == NULL) {
-		kfree(clent);
+		kfree(c_ent);
 		ret = -ENOMEM;
 		goto out;
 	}

 	snode->stats[SLT_REQUEST]++;

-	clent->snode = snode;
-	add_client_entry(snode, &snode->requested, clent);
-	scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
+	c_ent->snode = snode;
+	add_client_entry(snode, &snode->requested, c_ent);
+	scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);

 	ret = process_waiting_requests(sb, snode);
 out:
@@ -417,7 +417,7 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 				 struct scoutfs_net_lock *nl)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;
 	struct server_lock_node *snode;
 	int ret;

@@ -438,18 +438,18 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,

 	snode->stats[SLT_RESPONSE]++;

-	clent = find_entry(snode, &snode->invalidated, rid);
-	if (!clent) {
+	c_ent = find_entry(snode, &snode->invalidated, rid);
+	if (!c_ent) {
 		put_server_lock(inf, snode);
 		ret = -EINVAL;
 		goto out;
 	}

 	if (nl->new_mode == SCOUTFS_LOCK_NULL) {
-		free_client_entry(inf, snode, clent);
+		free_client_entry(inf, snode, c_ent);
 	} else {
-		clent->mode = nl->new_mode;
-		add_client_entry(snode, &snode->granted, clent);
+		c_ent->mode = nl->new_mode;
+		add_client_entry(snode, &snode->granted, c_ent);
 	}

 	ret = process_waiting_requests(sb, snode);
@@ -632,7 +632,7 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct client_lock_entry *existing;
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;
 	struct server_lock_node *snode;
 	struct scoutfs_key key;
 	int ret = 0;
@@ -652,35 +652,35 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 	}

 	for (i = 0; i < le16_to_cpu(nlr->nr); i++) {
-		clent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
-		if (!clent) {
+		c_ent = kzalloc(sizeof(struct client_lock_entry), GFP_NOFS);
+		if (!c_ent) {
 			ret = -ENOMEM;
 			goto out;
 		}

-		INIT_LIST_HEAD(&clent->head);
-		clent->rid = rid;
-		clent->net_id = 0;
-		clent->mode = nlr->locks[i].new_mode;
+		INIT_LIST_HEAD(&c_ent->head);
+		c_ent->rid = rid;
+		c_ent->net_id = 0;
+		c_ent->mode = nlr->locks[i].new_mode;

 		snode = alloc_server_lock(inf, &nlr->locks[i].key);
 		if (snode == NULL) {
-			kfree(clent);
+			kfree(c_ent);
 			ret = -ENOMEM;
 			goto out;
 		}

 		existing = find_entry(snode, &snode->granted, rid);
 		if (existing) {
-			kfree(clent);
+			kfree(c_ent);
 			put_server_lock(inf, snode);
 			ret = -EEXIST;
 			goto out;
 		}

-		clent->snode = snode;
-		add_client_entry(snode, &snode->granted, clent);
-		scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
+		c_ent->snode = snode;
+		add_client_entry(snode, &snode->granted, c_ent);
+		scoutfs_tseq_add(&inf->tseq_tree, &c_ent->tseq_entry);

 		put_server_lock(inf, snode);

@@ -707,7 +707,7 @@ out:
 int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;
 	struct client_lock_entry *tmp;
 	struct server_lock_node *snode;
 	struct scoutfs_key key;
@@ -724,9 +724,9 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 			    (list == &snode->requested) ? &snode->invalidated :
 			    NULL) {

-			list_for_each_entry_safe(clent, tmp, list, head) {
-				if (clent->rid == rid) {
-					free_client_entry(inf, snode, clent);
+			list_for_each_entry_safe(c_ent, tmp, list, head) {
+				if (c_ent->rid == rid) {
+					free_client_entry(inf, snode, c_ent);
 					freed = true;
 				}
 			}
@@ -787,15 +787,15 @@ static char *lock_on_list_string(u8 on_list)
 static void lock_server_tseq_show(struct seq_file *m,
 				  struct scoutfs_tseq_entry *ent)
 {
-	struct client_lock_entry *clent = container_of(ent,
+	struct client_lock_entry *c_ent = container_of(ent,
 						       struct client_lock_entry,
 						       tseq_entry);
-	struct server_lock_node *snode = clent->snode;
+	struct server_lock_node *snode = c_ent->snode;

 	seq_printf(m, SK_FMT" %s %s rid %016llx net_id %llu\n",
-		   SK_ARG(&snode->key), lock_mode_string(clent->mode),
-		   lock_on_list_string(clent->on_list), clent->rid,
-		   clent->net_id);
+		   SK_ARG(&snode->key), lock_mode_string(c_ent->mode),
+		   lock_on_list_string(c_ent->on_list), c_ent->rid,
+		   c_ent->net_id);
 }

 static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
@@ -857,7 +857,7 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct server_lock_node *snode;
 	struct server_lock_node *stmp;
-	struct client_lock_entry *clent;
+	struct client_lock_entry *c_ent;
 	struct client_lock_entry *ctmp;
 	LIST_HEAD(list);

@@ -873,8 +873,8 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
 			list_splice_init(&snode->invalidated, &list);

 			mutex_lock(&snode->mutex);
-			list_for_each_entry_safe(clent, ctmp, &list, head) {
-				free_client_entry(inf, snode, clent);
+			list_for_each_entry_safe(c_ent, ctmp, &list, head) {
+				free_client_entry(inf, snode, c_ent);
 			}
 			mutex_unlock(&snode->mutex);

--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -675,18 +675,28 @@ static void scoutfs_net_recv_worker(struct work_struct *work)

 		scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);

+		/* 
+		 * We want to drain the proc_workq in order to ensure that
+		 * that the inflight lock recovery work is fully flushed out
+		 * so that we can prevent the client/server racing trying to
+		 * do lock recovery and processing farewell at the same time.
+		 */
+		if (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn)
+			drain_workqueue(conn->proc_workq);
+
 		/*
-		 * Initial received greetings are processed
+		 * Initial received greetings and farewell are processed
 		 * synchronously before any other incoming messages.
 		 *
 		 * Incoming requests or responses to the lock client are
 		 * called synchronously to avoid reordering.
 		 */
 		if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
+		    (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) ||
 		    (nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
 			scoutfs_net_proc_worker(&mrecv->proc_work);
 		else
-			queue_work(conn->workq, &mrecv->proc_work);
+			queue_work(conn->proc_workq, &mrecv->proc_work);
 	}

 	if (ret)
@@ -851,6 +861,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
 	}

 	destroy_workqueue(conn->workq);
+	destroy_workqueue(conn->proc_workq);
 	scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
 	kfree(conn->info);
 	trace_scoutfs_conn_destroy_free(conn);
@@ -1150,6 +1161,8 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
 	/* wait for socket and proc work to finish, includes chained work */
 	drain_workqueue(conn->workq);

+	drain_workqueue(conn->proc_workq);
+
 	/* tear down the sock now that all work is done */
 	if (conn->sock) {
 		sock_release(conn->sock);
@@ -1347,7 +1360,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 		return NULL;
 	}

-	conn->workq = alloc_workqueue("scoutfs_net_%s",
+	conn->workq = alloc_workqueue("scoutfs_net_workq_%s",
 				      WQ_UNBOUND | WQ_NON_REENTRANT, 0,
 				      name_suffix);
 	if (!conn->workq) {
@@ -1356,6 +1369,16 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 		return NULL;
 	}

+	conn->proc_workq = alloc_workqueue("scoutfs_net_proc_workq_%s",
+					   WQ_UNBOUND | WQ_NON_REENTRANT, 0,
+					   name_suffix);
+	if (!conn->proc_workq) {
+		destroy_workqueue(conn->workq);
+		kfree(conn->info);
+		kfree(conn);
+		return NULL;
+	}
+
 	conn->sb = sb;
 	conn->notify_up = notify_up;
 	conn->notify_down = notify_down;
@@ -1388,6 +1411,14 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 	return conn;
 }

+/* Give the caller the client info of the connection. This is used by
+ * server processing the server_farewell, and lock response/recovery.
+ */
+void *scoutfs_net_client_info(struct scoutfs_net_connection *conn)
+{
+	return conn->info;
+}
+
 /*
 * Give the caller the client rid of the connection.  This used by rare
 * server processing callers who want to send async responses after
--- a/kmod/src/net.h
+++ b/kmod/src/net.h
@@ -63,6 +63,7 @@ struct scoutfs_net_connection {
 	atomic64_t recv_seq;

 	struct workqueue_struct *workq;
+	struct workqueue_struct *proc_workq;
 	struct work_struct listen_work;
 	struct work_struct connect_work;
 	struct work_struct send_work;
@@ -115,6 +116,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 		       scoutfs_net_notify_t notify_up,
 		       scoutfs_net_notify_t notify_down, size_t info_size,
 		       scoutfs_net_request_t *req_funcs, char *name_suffix);
+void *scoutfs_net_client_info(struct scoutfs_net_connection *conn);
 u64 scoutfs_net_client_rid(struct scoutfs_net_connection *conn);
 int scoutfs_net_connect(struct super_block *sb,
 			struct scoutfs_net_connection *conn,
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -122,6 +122,7 @@ struct server_info {
 struct server_client_info {
 	u64 rid;
 	struct list_head head;
+	bool received_farewell;
 };

 static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
@@ -1505,11 +1506,15 @@ static int server_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn,
 		       u8 cmd, u64 id, void *arg, u16 arg_len)
 {
+	struct server_client_info *sci = scoutfs_net_client_info(conn);
 	u64 rid = scoutfs_net_client_rid(conn);

 	if (arg_len != sizeof(struct scoutfs_net_lock))
 		return -EINVAL;

+	if (sci->received_farewell)
+		return scoutfs_net_response(sb, conn, cmd, id, -EINVAL, NULL, 0);
+
 	return scoutfs_lock_server_request(sb, rid, id, arg);
 }

@@ -1518,11 +1523,15 @@ static int lock_response(struct super_block *sb,
 			 void *resp, unsigned int resp_len,
 			 int error, void *data)
 {
+	struct server_client_info *sci = scoutfs_net_client_info(conn);
 	u64 rid = scoutfs_net_client_rid(conn);

 	if (resp_len != sizeof(struct scoutfs_net_lock))
 		return -EINVAL;

+	if (sci->received_farewell)
+		return 0;
+
 	return scoutfs_lock_server_response(sb, rid, resp);
 }

@@ -1560,11 +1569,15 @@ static int lock_recover_response(struct super_block *sb,
 				 void *resp, unsigned int resp_len,
 				 int error, void *data)
 {
+	struct server_client_info *sci = scoutfs_net_client_info(conn);
 	u64 rid = scoutfs_net_client_rid(conn);

 	if (invalid_recover(resp, resp_len))
 		return -EINVAL;

+	if (sci->received_farewell)
+		return 0;
+
 	return scoutfs_lock_server_recover_response(sb, rid, resp);
 }

@@ -3516,9 +3529,11 @@ static int server_farewell(struct super_block *sb,
 			   struct scoutfs_net_connection *conn,
 			   u8 cmd, u64 id, void *arg, u16 arg_len)
 {
+	struct server_client_info *sci = scoutfs_net_client_info(conn);
 	struct server_info *server = SCOUTFS_SB(sb)->server_info;
 	u64 rid = scoutfs_net_client_rid(conn);
 	struct farewell_request *fw;
+	int ret;

 	if (arg_len != 0)
 		return -EINVAL;
@@ -3536,6 +3551,20 @@ static int server_farewell(struct super_block *sb,
 	list_add_tail(&fw->entry, &server->farewell_requests);
 	spin_unlock(&server->farewell_lock);

+	/*
+	 * Tear down client lock server state and set that we recieved farewell
+	 * to ensure that we do not race between client and server trying to process
+	 * lock recovery at the same time (race). We also want to mark that the recovery
+	 * finished so that if client's try to send stuff later; the server doesnt care.
+	 */
+	sci->received_farewell = true;
+	ret = scoutfs_lock_server_farewell(sb, rid);
+	if (ret < 0) {
+		kfree(fw);
+		return ret;
+	}
+	scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS);
+
 	queue_farewell_work(server);

 	/* response will be sent later */
--- a/tests/golden/client-unmount-recovery
+++ b/tests/golden/client-unmount-recovery
@@ -0,0 +1 @@
+== 60s of unmounting non-quorum clients during recovery
--- a/tests/sequence
+++ b/tests/sequence
@@ -33,6 +33,7 @@ resize-devices.sh
 fence-and-reclaim.sh
 orphan-inodes.sh
 mount-unmount-race.sh
+client-unmount-recovery.sh
 createmany-parallel-mounts.sh
 archive-light-cycle.sh
 block-stale-reads.sh
--- a/tests/tests/client-unmount-recovery.sh
+++ b/tests/tests/client-unmount-recovery.sh
@@ -0,0 +1,61 @@
+#
+# Unmount Server and unmount a client as it's replaying to a remaining server
+#
+
+majority_nr=$(t_majority_count)
+quorum_nr=$T_QUORUM
+
+test "$quorum_nr" == "$majority_nr" && \
+        t_skip "all quorum members make up majority, need more mounts to unmount"
+
+test "$T_NR_MOUNTS" -lt "$T_QUORUM" && \
+        t_skip "Need enough non-quorum clients to unmount"
+
+for i in $(t_fs_nrs); do
+        mounted[$i]=1
+done
+
+LENGTH=60
+echo "== ${LENGTH}s of unmounting non-quorum clients during recovery"
+END=$((SECONDS + LENGTH))
+while [ "$SECONDS" -lt "$END" ]; do
+        sv=$(t_server_nr)
+        rid=$(t_mount_rid $sv)
+        echo "sv $sv rid $rid" >> "$T_TMP.log"
+        sync
+        t_umount $sv &
+
+        for i in $(t_fs_nrs); do
+                if [ "$i" -ge "$quorum_nr" ]; then
+                        t_umount $i &
+                        echo "umount $i pid $pid quo $quorum_nr" \
+                                >> $T_TMP.log
+                        mounted[$i]=0
+                fi
+        done
+
+        wait
+
+        t_mount $sv &
+        for i in $(t_fs_nrs); do
+                if [ "${mounted[$i]}" == 0 ]; then
+                        t_mount $i &
+                fi
+        done
+
+        wait
+
+        declare RID_LIST=$(cat /sys/fs/scoutfs/*/rid | sort -u)
+        read -a rid_arr <<< $RID_LIST
+
+        declare LOCK_LIST=$(cut -d' ' -f 5 /sys/kernel/debug/scoutfs/*/server_locks | sort -u)
+        read -a lock_arr <<< $LOCK_LIST
+
+        for i in "${lock_arr[@]}"; do
+                if [[ ! " ${rid_arr[*]} " =~ " $i " ]]; then
+                        t_fail "RID($i): exists when not mounted"
+                fi
+        done
+done
+
+t_pass
				`@@ -0,0 +1 @@`
				`== 60s of unmounting non-quorum clients during recovery`