Merge pull request #239 from versity/auke/keepalive

Add tcp_keepalive_timeout_ms option, change default to 60s
This commit is contained in:
Zach Brown
2025-10-29 17:15:17 -07:00
committed by GitHub
4 changed files with 60 additions and 9 deletions

View File

@@ -32,6 +32,7 @@
#include "endian_swap.h" #include "endian_swap.h"
#include "tseq.h" #include "tseq.h"
#include "fence.h" #include "fence.h"
#include "options.h"
/* /*
* scoutfs networking delivers requests and responses between nodes. * scoutfs networking delivers requests and responses between nodes.
@@ -998,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle. * The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
* TCP_USER_TIMEOUT only applies if there is unacked written data in the * TCP_USER_TIMEOUT only applies if there is unacked written data in the
* send queue. It doesn't work if the connection is idle. Adding * send queue. It doesn't work if the connection is idle. Adding
* keepalice probes with user_timeout set changes how the keepalive * keepalive probes with user_timeout set changes how the keepalive
* timeout is calculated. CNT no longer matters. Each time * timeout is calculated. CNT no longer matters. Each time
* additional probes (not the first) are sent the user timeout is * additional probes (not the first) are sent the user timeout is
* checked against the last time data was received. If none of the * checked against the last time data was received. If none of the
@@ -1010,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* elapses during the probe timer processing after the unsuccessful * elapses during the probe timer processing after the unsuccessful
* probes. * probes.
*/ */
#define UNRESPONSIVE_TIMEOUT_SECS 10 static int sock_opts_and_names(struct super_block *sb,
#define UNRESPONSIVE_PROBES 3 struct scoutfs_net_connection *conn,
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
struct socket *sock) struct socket *sock)
{ {
struct scoutfs_mount_options opts;
int optval; int optval;
int ret; int ret;
scoutfs_options_read(sb, &opts);
/* we use a keepalive timeout instead of send timeout */ /* we use a keepalive timeout instead of send timeout */
ret = kc_sock_set_sndtimeo(sock, 0); ret = kc_sock_set_sndtimeo(sock, 0);
if (ret) if (ret)
@@ -1030,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret) if (ret)
goto out; goto out;
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS); optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
ret = kc_tcp_sock_set_keepidle(sock, optval); ret = kc_tcp_sock_set_keepidle(sock, optval);
if (ret) if (ret)
goto out; goto out;
@@ -1041,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret) if (ret)
goto out; goto out;
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC; optval = opts.tcp_keepalive_timeout_ms;
ret = kc_tcp_sock_set_user_timeout(sock, optval); ret = kc_tcp_sock_set_user_timeout(sock, optval);
if (ret) if (ret)
goto out; goto out;
@@ -1115,7 +1117,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
continue; continue;
} }
ret = sock_opts_and_names(acc_conn, acc_sock); ret = sock_opts_and_names(sb, acc_conn, acc_sock);
if (ret) { if (ret) {
sock_release(acc_sock); sock_release(acc_sock);
destroy_conn(acc_conn); destroy_conn(acc_conn);
@@ -1186,7 +1188,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
if (ret) if (ret)
goto out; goto out;
ret = sock_opts_and_names(conn, sock); ret = sock_opts_and_names(sb, conn, sock);
if (ret) if (ret)
goto out; goto out;

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms, Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms, Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr, Opt_quorum_slot_nr,
Opt_tcp_keepalive_timeout_ms,
Opt_err, Opt_err,
}; };
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"}, {Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"}, {Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"}, {Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
{Opt_err, NULL} {Opt_err, NULL}
}; };
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL #define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX) #define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (60 * MSEC_PER_SEC)
static void init_default_options(struct scoutfs_mount_options *opts) static void init_default_options(struct scoutfs_mount_options *opts)
{ {
memset(opts, 0, sizeof(*opts)); memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1; opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS; opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1; opts->quorum_slot_nr = -1;
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
} }
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val) static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0; return 0;
} }
/*
 * Validate a parsed tcp_keepalive_timeout_ms mount option.
 *
 * @ret is the result of parsing the option string and @val is the parsed
 * value in milliseconds.  Returns 0 when the value is usable, otherwise
 * logs an error and returns -EINVAL.
 *
 * The socket setup derives the keepalive idle time in whole seconds as
 * (timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES, and that idle time
 * must be at least one second.  Comparing raw milliseconds against
 * UNRESPONSIVE_PROBES * MSEC_PER_SEC would accept values such as 3500
 * which truncate to an idle time of 0 and make socket setup fail, so the
 * smallest accepted value is one full second more than the probe count.
 *
 * NOTE(review): the kernel also caps the keepalive idle time; an upper
 * bound is not enforced here -- confirm whether one is wanted.
 */
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
{
	const long min_ms = (UNRESPONSIVE_PROBES + 1) * MSEC_PER_SEC;

	if (ret < 0) {
		scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
		return -EINVAL;
	}

	if (val < min_ms) {
		scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be at least %ld",
			    val, min_ms);
		return -EINVAL;
	}

	return 0;
}
/* /*
* Parse the option string into our options struct. This can allocate * Parse the option string into our options struct. This can allocate
* memory in the struct. The caller is responsible for always calling * memory in the struct. The caller is responsible for always calling
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr; opts->data_prealloc_contig_only = nr;
break; break;
case Opt_tcp_keepalive_timeout_ms:
ret = match_int(args, &nr);
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_log_merge_wait_timeout_ms: case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr); ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr); ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms); seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0) if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr); seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
return 0; return 0;
} }

View File

@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms; unsigned int orphan_scan_delay_ms;
int quorum_slot_nr; int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms; u64 quorum_heartbeat_timeout_ms;
int tcp_keepalive_timeout_ms;
}; };
#define UNRESPONSIVE_PROBES 3
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts); void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
int scoutfs_options_show(struct seq_file *seq, struct dentry *root); int scoutfs_options_show(struct seq_file *seq, struct dentry *root);

View File

@@ -130,6 +130,23 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs. If the number assigned when the filesystem was created with mkfs. If the number assigned
doesn't match a number created during mkfs then the mount will fail. doesn't match a number created during mkfs then the mount will fail.
.TP
.B tcp_keepalive_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that a client
connection will wait for active TCP packets, before deciding that
the connection is dead. This setting is per-mount and only changes
the behavior of that mount.
.sp
The default value of this setting is 60000 ms (60s). Precision finer
than a whole second is unlikely to be meaningful due to the nature of
the TCP keepalive mechanisms in the Linux kernel. Valid values are any
value higher than 3000 (3s).
.sp
The TCP keepalive mechanism is complex and observing a lost connection
quickly is important to maintain cluster stability. If the local
network suffers from intermittent outages this option may provide
some respite to overcome these outages without the cluster becoming
desynchronized.
.SH VOLUME OPTIONS .SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume. block in the metadata device and which apply to all mounts of the volume.