Handle advance_seq being replayed in new server

As a core principle, all server message processing needs to be safe to replay as servers shut down and requests are resent to new servers. The advance_seq handler got this wrong. It would only try to remove a trans_seq item for the seq sent by the client before inserting a new item for the next seq. This change could be committed before the reply was lost as the server shuts down. The next server would process the resent request but wouldn't find the old item for the seq that the client sent, and would ignore the new item that the previous server inserted. It would then insert another greater seq for the same client. This would leave behind a stale old trans_seq that would be returned as the last_seq which would forever limit the results that could be returned from the seq index walks. This fix is to always remove all previous seq items for the client before inserting a new one. This creates O(clients) server work, but it's minimal. This manifest as occasional simple-inode-index test failures (say 1 in 5?) which would trigger if the unmounts during previous tests would happen to have advance_seq resent across server shutdowns. With this change the test now reliably passes. Signed-off-by: Zach Brown <zab@versity.com>
2026-01-04 11:24:21 +00:00 · 2021-01-26 12:58:02 -08:00
parent 5a90234c94
commit ade539217e
3 changed files with 100 additions and 92 deletions
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -121,16 +121,14 @@ int scoutfs_client_get_roots(struct super_block *sb,
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	__le64 before = cpu_to_le64p(seq);
-	__le64 after;
+	__le64 leseq;
 	int ret;

 	ret = scoutfs_net_sync_request(sb, client->conn,
 				       SCOUTFS_NET_CMD_ADVANCE_SEQ,
-				       &before, sizeof(before),
-				       &after, sizeof(after));
+				       NULL, 0, &leseq, sizeof(leseq));
 	if (ret == 0)
-		*seq = le64_to_cpu(after);
+		*seq = le64_to_cpu(leseq);

 	return ret;
 }
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -1938,31 +1938,27 @@ DEFINE_EVENT(scoutfs_clock_sync_class, scoutfs_recv_clock_sync,
 );

 TRACE_EVENT(scoutfs_trans_seq_advance,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 prev_seq,
-		 u64 next_seq),
+	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

-	TP_ARGS(sb, rid, prev_seq, next_seq),
+	TP_ARGS(sb, rid, trans_seq),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(__u64, s_rid)
-		__field(__u64, prev_seq)
-		__field(__u64, next_seq)
+		__field(__u64, trans_seq)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->s_rid = rid;
-		__entry->prev_seq = prev_seq;
-		__entry->next_seq = next_seq;
+		__entry->trans_seq = trans_seq;
 	),

-	TP_printk(SCSBF" rid %016llx prev_seq %llu next_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->prev_seq,
-		  __entry->next_seq)
+	TP_printk(SCSBF" rid %016llx trans_seq %llu\n",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

-TRACE_EVENT(scoutfs_trans_seq_farewell,
+TRACE_EVENT(scoutfs_trans_seq_remove,
 	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

 	TP_ARGS(sb, rid, trans_seq),
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -649,79 +649,10 @@ static void init_trans_seq_key(struct scoutfs_key *key, u64 seq, u64 rid)
 }

 /*
- * Give the client the next sequence number for their transaction.  They
- * provide their previous transaction sequence number that they've
- * committed.
- *
- * We track the sequence numbers of transactions that clients have open.
- * This limits the transaction sequence numbers that can be returned in
- * the index of inodes by meta and data transaction numbers.  We
- * communicate the largest possible sequence number to clients via an
- * rpc.
- *
- * The transaction sequence tracking is stored in a btree so it is
- * shared across servers.  Final entries are removed when processing a
- * client's farewell or when it's removed.
+ * Remove all trans_seq items owned by the client rid, the caller holds
+ * the seq_rwsem.
 */
-static int server_advance_seq(struct super_block *sb,
-			      struct scoutfs_net_connection *conn,
-			      u8 cmd, u64 id, void *arg, u16 arg_len)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
-	__le64 their_seq;
-	__le64 next_seq;
-	u64 rid = scoutfs_net_client_rid(conn);
-	struct scoutfs_key key;
-	int ret;
-
-	if (arg_len != sizeof(__le64)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	memcpy(&their_seq, arg, sizeof(their_seq));
-
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
-
-	down_write(&server->seq_rwsem);
-
-	if (their_seq != 0) {
-		init_trans_seq_key(&key, le64_to_cpu(their_seq), rid);
-		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
-					   &super->trans_seqs, &key);
-		if (ret < 0 && ret != -ENOENT)
-			goto unlock;
-	}
-
-	next_seq = super->next_trans_seq;
-	le64_add_cpu(&super->next_trans_seq, 1);
-
-	trace_scoutfs_trans_seq_advance(sb, rid, le64_to_cpu(their_seq),
-					le64_to_cpu(next_seq));
-
-	init_trans_seq_key(&key, le64_to_cpu(next_seq), rid);
-	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
-				   &super->trans_seqs, &key, NULL, 0);
-unlock:
-	up_write(&server->seq_rwsem);
-	ret = scoutfs_server_apply_commit(sb, ret);
-
-out:
-	return scoutfs_net_response(sb, conn, cmd, id, ret,
-				    &next_seq, sizeof(next_seq));
-}
-
-/*
- * Remove any transaction sequences owned by the client.  They must have
- * committed any final transaction by the time they get here via sending
- * their farewell message.  This can be called multiple times as the
- * client's farewell is retransmitted so it's OK to not find any
- * entries.  This is called with the server commit rwsem held.
- */
-static int remove_trans_seq(struct super_block *sb, u64 rid)
+static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
 {
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -730,8 +661,6 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 	struct scoutfs_key key;
 	int ret = 0;

-	down_write(&server->seq_rwsem);
-
 	init_trans_seq_key(&key, 0, 0);

 	for (;;) {
@@ -746,17 +675,102 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 		scoutfs_btree_put_iref(&iref);

 		if (le64_to_cpu(key.skts_rid) == rid) {
-			trace_scoutfs_trans_seq_farewell(sb, rid,
+			trace_scoutfs_trans_seq_remove(sb, rid,
 					le64_to_cpu(key.skts_trans_seq));
 			ret = scoutfs_btree_delete(sb, &server->alloc,
 						   &server->wri,
 						   &super->trans_seqs, &key);
-			break;
+			if (ret < 0)
+				break;
 		}

 		scoutfs_key_inc(&key);
 	}

+	return ret;
+}
+
+/*
+ * Give the client the next sequence number for the transaction that
+ * they're opening.
+ *
+ * We track the sequence numbers of transactions that clients have open.
+ * This limits the transaction sequence numbers that can be returned in
+ * the index of inodes by meta and data transaction numbers.  We
+ * communicate the largest possible sequence number to clients via an
+ * rpc.
+ *
+ * The transaction sequence tracking is stored in a btree so it is
+ * shared across servers.  Final entries are removed when processing a
+ * client's farewell or when it's removed.  We can be processent a
+ * resent request that was committed by a previous server before the
+ * reply was lost.  At this point the client has no transactions open
+ * and may or may not have just finished one.  To keep it simple we
+ * always remove any previous seq items, if there are any, and then
+ * insert a new item for the client at the next greatest seq.
+ */
+static int server_advance_seq(struct super_block *sb,
+			      struct scoutfs_net_connection *conn,
+			      u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_key key;
+	__le64 leseq = 0;
+	u64 seq;
+	int ret;
+
+	if (arg_len != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_server_hold_commit(sb);
+	if (ret)
+		goto out;
+
+	down_write(&server->seq_rwsem);
+
+	ret = remove_trans_seq_locked(sb, rid);
+	if (ret < 0)
+		goto unlock;
+
+	seq = le64_to_cpu(super->next_trans_seq);
+	le64_add_cpu(&super->next_trans_seq, 1);
+
+	trace_scoutfs_trans_seq_advance(sb, rid, seq);
+
+	init_trans_seq_key(&key, seq, rid);
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->trans_seqs, &key, NULL, 0);
+	if (ret == 0)
+		leseq = cpu_to_le64(seq);
+unlock:
+	up_write(&server->seq_rwsem);
+	ret = scoutfs_server_apply_commit(sb, ret);
+
+out:
+	return scoutfs_net_response(sb, conn, cmd, id, ret,
+				    &leseq, sizeof(leseq));
+}
+
+/*
+ * Remove any transaction sequences owned by the client who's sent a
+ * farewell They must have committed any final transaction by the time
+ * they get here via sending their farewell message.  This can be called
+ * multiple times as the client's farewell is retransmitted so it's OK
+ * to not find any entries.  This is called with the server commit rwsem
+ * held.
+ */
+static int remove_trans_seq(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	int ret = 0;
+
+	down_write(&server->seq_rwsem);
+	ret = remove_trans_seq_locked(sb, rid);
 	up_write(&server->seq_rwsem);

 	return ret;