Add tcp_keepalive_timeout_ms option.

The default TCP keepalive value is currently 10s, resulting in clients
being disconnected after 10 seconds of not replying to a TCP keepalive
packet. These keepalive values are reasonable most of the time, but
we've seen client disconnects where this timeout has been exceeded,
resulting in fencing. The cause for this is unknown at this time, but it
is suspected that network intermissions are happening.

This change adds a configurable value for this specific client socket
timeout. It enforces that its value is above UNRESPONSIVE_PROBES, whose
value remains unchanged.

The default value of 10000ms (10s) remains the trusted value. It is
entirely unclear and untested what values are reasonable and which
ones are not.  Since the value of this setting can and will interact
with other timeout values, care must be taken to not exceed certain
other timeout values.  I've tested this only briefly with values of
5000 and 25000. Outside that range is likely problematic.

Signed-off-by: Auke Kok <auke.kok@versity.com>
This commit is contained in:
Auke Kok
2025-09-09 09:58:25 -07:00
parent 65ea250de9
commit ad79ee94f9
4 changed files with 61 additions and 9 deletions

View File

@@ -32,6 +32,7 @@
#include "endian_swap.h"
#include "tseq.h"
#include "fence.h"
#include "options.h"
/*
* scoutfs networking delivers requests and responses between nodes.
@@ -998,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
* send queue. It doesn't work if the connection is idle. Adding
* keepalice probes with user_timeout set changes how the keepalive
* keepalive probes with user_timeout set changes how the keepalive
* timeout is calculated. CNT no longer matters. Each time
* additional probes (not the first) are sent the user timeout is
* checked against the last time data was received. If none of the
@@ -1010,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
* elapses during the probe timer processing after the unsuccessful
* probes.
*/
#define UNRESPONSIVE_TIMEOUT_SECS 10
#define UNRESPONSIVE_PROBES 3
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
static int sock_opts_and_names(struct super_block *sb,
struct scoutfs_net_connection *conn,
struct socket *sock)
{
struct scoutfs_mount_options opts;
int optval;
int ret;
scoutfs_options_read(sb, &opts);
/* we use a keepalive timeout instead of send timeout */
ret = kc_sock_set_sndtimeo(sock, 0);
if (ret)
@@ -1030,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
ret = kc_tcp_sock_set_keepidle(sock, optval);
if (ret)
goto out;
@@ -1041,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
if (ret)
goto out;
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
optval = opts.tcp_keepalive_timeout_ms;
ret = kc_tcp_sock_set_user_timeout(sock, optval);
if (ret)
goto out;
@@ -1109,7 +1111,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
continue;
}
ret = sock_opts_and_names(acc_conn, acc_sock);
ret = sock_opts_and_names(sb, acc_conn, acc_sock);
if (ret) {
sock_release(acc_sock);
destroy_conn(acc_conn);
@@ -1180,7 +1182,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
if (ret)
goto out;
ret = sock_opts_and_names(conn, sock);
ret = sock_opts_and_names(sb, conn, sock);
if (ret)
goto out;

View File

@@ -39,6 +39,7 @@ enum {
Opt_orphan_scan_delay_ms,
Opt_quorum_heartbeat_timeout_ms,
Opt_quorum_slot_nr,
Opt_tcp_keepalive_timeout_ms,
Opt_err,
};
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
{Opt_err, NULL}
};
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (10 * MSEC_PER_SEC)
static void init_default_options(struct scoutfs_mount_options *opts)
{
memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
return 0;
}
/*
 * Validate a parsed tcp_keepalive_timeout_ms mount option.
 *
 * @ret is the result of parsing the option string and @val is the
 * parsed value in milliseconds.  Returns 0 if the value is usable and
 * -EINVAL, after logging an error, if parsing failed or the value is
 * too small.
 *
 * The socket setup derives the keepalive idle time in whole seconds as
 * (val / MSEC_PER_SEC) - UNRESPONSIVE_PROBES.  Linux rejects a
 * TCP_KEEPIDLE of less than one second, so the timeout must cover at
 * least one full second beyond the probes.  Only checking that val is
 * larger than UNRESPONSIVE_PROBES seconds would let values such as
 * 3500ms through, which truncate to an idle time of 0.
 */
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
{
	const long min_ms = (long)(UNRESPONSIVE_PROBES + 1) * MSEC_PER_SEC;

	if (ret < 0) {
		scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
		return -EINVAL;
	}

	/* also rejects zero and negative values */
	if (val < min_ms) {
		scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be at least %ld",
			    val, min_ms);
		return -EINVAL;
	}

	return 0;
}
/*
* Parse the option string into our options struct. This can allocate
* memory in the struct. The caller is responsible for always calling
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_tcp_keepalive_timeout_ms:
ret = match_int(args, &nr);
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
if (opts.quorum_slot_nr >= 0)
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
return 0;
}

View File

@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;
u64 quorum_heartbeat_timeout_ms;
int tcp_keepalive_timeout_ms;
};
#define UNRESPONSIVE_PROBES 3
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);

View File

@@ -130,6 +130,24 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs. If the number assigned
doesn't match a number created during mkfs then the mount will fail.
.TP
.B tcp_keepalive_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that a client
connection will wait for active TCP packets, before deciding that
the connection is dead. This setting is per-mount and only changes
the behavior of that mount.
.sp
The default value of this setting is 10000msec (10s). Any precision
beyond a whole second is likely unrealistic due to the nature of
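TCP keepalive mechanisms in the Linux kernel. Valid values are any
value higher than 3000 (3s). Because the timeout is converted to whole
seconds and three of those seconds are reserved for keepalive probes,
values below 4000 (4s) leave no idle time before probing begins.
Values that are higher than 30000msec
(30s) will likely interfere with other embedded timeout values.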
.sp
The TCP keepalive mechanism is complex and observing a lost connection
quickly is important to maintain cluster stability. If the local
network suffers from intermittent outages this option may provide
some respite to overcome these outages without the cluster becoming
desynchronized.
.SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume.