diff --git a/kmod/src/net.c b/kmod/src/net.c
index 47a7059f..7e633e44 100644
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -32,6 +32,7 @@
 #include "endian_swap.h"
 #include "tseq.h"
 #include "fence.h"
+#include "options.h"
 
 /*
  * scoutfs networking delivers requests and responses between nodes.
@@ -998,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
  * The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
  * TCP_USER_TIMEOUT only applies if there is unacked written data in the
  * send queue. It doesn't work if the connection is idle. Adding
- * keepalice probes with user_timeout set changes how the keepalive
+ * keepalive probes with user_timeout set changes how the keepalive
  * timeout is calculated. CNT no longer matters. Each time
  * additional probes (not the first) are sent the user timeout is
  * checked against the last time data was received. If none of the
@@ -1010,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
  * elapses during the probe timer processing after the unsuccessful
  * probes.
  */
-#define UNRESPONSIVE_TIMEOUT_SECS 10
-#define UNRESPONSIVE_PROBES 3
-static int sock_opts_and_names(struct scoutfs_net_connection *conn,
+static int sock_opts_and_names(struct super_block *sb,
+			       struct scoutfs_net_connection *conn,
 			       struct socket *sock)
 {
+	struct scoutfs_mount_options opts;
 	int optval;
 	int ret;
 
+	scoutfs_options_read(sb, &opts);
+
 	/* we use a keepalive timeout instead of send timeout */
 	ret = kc_sock_set_sndtimeo(sock, 0);
 	if (ret)
@@ -1030,8 +1033,10 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 	if (ret)
 		goto out;
 
-	BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
-	optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
+	optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
+	/* timeouts just above the minimum truncate to 0s, keepidle needs at least 1 */
+	if (optval < 1)
+		optval = 1;
 	ret = kc_tcp_sock_set_keepidle(sock, optval);
 	if (ret)
 		goto out;
@@ -1041,7 +1046,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 	if (ret)
 		goto out;
 
-	optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
+	optval = opts.tcp_keepalive_timeout_ms;
 	ret = kc_tcp_sock_set_user_timeout(sock, optval);
 	if (ret)
 		goto out;
@@ -1115,7 +1120,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
 			continue;
 		}
 
-		ret = sock_opts_and_names(acc_conn, acc_sock);
+		ret = sock_opts_and_names(sb, acc_conn, acc_sock);
 		if (ret) {
 			sock_release(acc_sock);
 			destroy_conn(acc_conn);
@@ -1186,7 +1191,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
 	if (ret)
 		goto out;
 
-	ret = sock_opts_and_names(conn, sock);
+	ret = sock_opts_and_names(sb, conn, sock);
 	if (ret)
 		goto out;
 
diff --git a/kmod/src/options.c b/kmod/src/options.c
index 8ce78067..b72344a1 100644
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -39,6 +39,7 @@ enum {
 	Opt_orphan_scan_delay_ms,
 	Opt_quorum_heartbeat_timeout_ms,
 	Opt_quorum_slot_nr,
+	Opt_tcp_keepalive_timeout_ms,
 	Opt_err,
 };
 
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
 	{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
 	{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
+	{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
 	{Opt_err, NULL}
 };
 
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
 #define MIN_DATA_PREALLOC_BLOCKS 1ULL
 #define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
 
+#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (60 * MSEC_PER_SEC)
+
 static void init_default_options(struct scoutfs_mount_options *opts)
 {
 	memset(opts, 0, sizeof(*opts));
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
+	opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
 }
 
 static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
@@ -168,6 +173,25 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
 	return 0;
 }
 
+/*
+ * The socket's keepidle is the timeout in whole seconds minus
+ * UNRESPONSIVE_PROBES, so the timeout must exceed that many seconds.
+ */
+static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
+{
+	if (ret < 0) {
+		scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
+		return -EINVAL;
+	}
+	if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
+		scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %ld",
+			    val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /*
  * Parse the option string into our options struct. This can allocate
  * memory in the struct. The caller is responsible for always calling
@@ -218,6 +242,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;
 
+		case Opt_tcp_keepalive_timeout_ms:
+			ret = match_int(args, &nr);
+			ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
+			if (ret < 0)
+				return ret;
+			opts->tcp_keepalive_timeout_ms = nr;
+			break;
+
 		case Opt_log_merge_wait_timeout_ms:
 			ret = match_int(args, &nr);
 			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -371,6 +403,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
 	if (opts.quorum_slot_nr >= 0)
 		seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
+	seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
 
 	return 0;
 }
diff --git a/kmod/src/options.h b/kmod/src/options.h
index 4eebd669..540c82b8 100644
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
 	u64 quorum_heartbeat_timeout_ms;
+	int tcp_keepalive_timeout_ms;
 };
 
+#define UNRESPONSIVE_PROBES 3
+
 void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
 int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
 
diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5
index f57d8fe8..d8cca732 100644
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -130,6 +130,23 @@ the server for the filesystem if it is elected leader. The assigned number
 must match one of the slots defined with \-Q options when the filesystem
 was created with mkfs. If the number assigned doesn't match a number
 created during mkfs then the mount will fail.
+.TP
+.B tcp_keepalive_timeout_ms=
+This option sets the amount of time, in milliseconds, that a client
+connection will wait for active TCP packets before deciding that
+the connection is dead. This setting is per-mount and only changes
+the behavior of that mount.
+.sp
+The default value of this setting is 60000msec (60s). Any precision
+beyond a whole second is likely unrealistic due to the nature of
+TCP keepalive mechanisms in the Linux kernel. Valid values are any
+value higher than 3000 (3s).
+.sp
+The TCP keepalive mechanism is complex and observing a lost connection
+quickly is important to maintain cluster stability. If the local
+network suffers from intermittent outages this option may provide
+some respite to overcome these outages without the cluster becoming
+desynchronized.
 .SH VOLUME OPTIONS
 Volume options are persistent options which are stored in the super
 block in the metadata device and which apply to all mounts of the volume.