From 46640e4ff902b97df486f4d5154ea3c50f6ab86c Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Tue, 16 May 2023 14:28:12 -0700
Subject: [PATCH] Add counter for quorum heartbeat send failures

Add a counter which tracks the number of heartbeat message send attempts
which fail.

Signed-off-by: Zach Brown
---
 kmod/src/counters.h |  1 +
 kmod/src/quorum.c   | 23 +++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/kmod/src/counters.h b/kmod/src/counters.h
index 378fcdc1..4784e73d 100644
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -166,6 +166,7 @@
 	EXPAND_COUNTER(quorum_recv_resignation) \
 	EXPAND_COUNTER(quorum_recv_vote) \
 	EXPAND_COUNTER(quorum_send_heartbeat) \
+	EXPAND_COUNTER(quorum_send_heartbeat_dropped) \
 	EXPAND_COUNTER(quorum_send_resignation) \
 	EXPAND_COUNTER(quorum_send_request) \
 	EXPAND_COUNTER(quorum_send_vote) \
diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c
index e15f4d77..83193bba 100644
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -210,12 +210,16 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
 	return cpu_to_le32(crc32c(~0, qmes, len));
 }
 
-static void send_msg_members(struct super_block *sb, int type, u64 term,
-			     int only)
+/*
+ * Returns the number of failures from sendmsg.
+ */
+static int send_msg_members(struct super_block *sb, int type, u64 term, int only)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_QUORUM_INFO(sb, qinf);
+	int failed = 0;
 	ktime_t now;
+	int ret;
 	int i;
 
 	struct scoutfs_quorum_message qmes = {
@@ -241,7 +245,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 
 	qmes.crc = quorum_message_crc(&qmes);
 
-
 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(&qinf->qconf, i) ||
 		    (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
@@ -249,7 +252,9 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 
 		scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
 		now = ktime_get();
-		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
+		if (ret != kv.iov_len)
+			failed++;
 
 		spin_lock(&qinf->show_lock);
 		qinf->last_send[i].msg.term = term;
@@ -260,6 +265,8 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 		if (i == only)
 			break;
 	}
+
+	return failed;
 }
 
 #define send_msg_to(sb, type, term, nr) send_msg_members(sb, type, term, nr)
@@ -839,8 +846,12 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 		/* leaders regularly send heartbeats to delay elections */
 		if (qst.role == LEADER &&
 		    ktime_after(ktime_get(), qst.timeout)) {
-			send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
-					qst.term);
+			ret = send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, qst.term);
+			if (ret > 0) {
+				scoutfs_add_counter(sb, quorum_send_heartbeat_dropped, ret);
+				ret = 0;
+			}
+
 			qst.timeout = heartbeat_interval();
 			scoutfs_inc_counter(sb, quorum_send_heartbeat);
 		}