mirror of
https://github.com/versity/scoutfs.git
synced 2025-12-23 13:35:18 +00:00
Add tcp_keepalive_timeout_ms option, change default to 60s
The default TCP keepalive timeout is currently 10s, resulting in clients being disconnected after 10 seconds of not replying to a TCP keepalive packet. This timeout is reasonable most of the time, but we've seen client disconnects where it has been exceeded, resulting in fencing. The cause is unknown at this time, but it is suspected that intermittent network interruptions are happening. This change adds a configurable value for this specific client socket timeout. It enforces that the value is above UNRESPONSIVE_PROBES seconds, a constant whose value remains unchanged. The default value of 10000ms (10s) is changed to 60000ms (60s). We assume this value is much better suited for customers; it has been briefly trialed, showing that it may help connections survive network-level interruptions. Signed-off-by: Auke Kok <auke.kok@versity.com>
This commit is contained in:
@@ -32,6 +32,7 @@
|
|||||||
#include "endian_swap.h"
|
#include "endian_swap.h"
|
||||||
#include "tseq.h"
|
#include "tseq.h"
|
||||||
#include "fence.h"
|
#include "fence.h"
|
||||||
|
#include "options.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* scoutfs networking delivers requests and responses between nodes.
|
* scoutfs networking delivers requests and responses between nodes.
|
||||||
@@ -998,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
|
|||||||
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
|
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
|
||||||
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
|
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
|
||||||
* send queue. It doesn't work if the connection is idle. Adding
|
* send queue. It doesn't work if the connection is idle. Adding
|
||||||
* keepalice probes with user_timeout set changes how the keepalive
|
* keepalive probes with user_timeout set changes how the keepalive
|
||||||
* timeout is calculated. CNT no longer matters. Each time
|
* timeout is calculated. CNT no longer matters. Each time
|
||||||
* additional probes (not the first) are sent the user timeout is
|
* additional probes (not the first) are sent the user timeout is
|
||||||
* checked against the last time data was received. If none of the
|
* checked against the last time data was received. If none of the
|
||||||
@@ -1010,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
|
|||||||
* elapses during the probe timer processing after the unsuccessful
|
* elapses during the probe timer processing after the unsuccessful
|
||||||
* probes.
|
* probes.
|
||||||
*/
|
*/
|
||||||
#define UNRESPONSIVE_TIMEOUT_SECS 10
|
static int sock_opts_and_names(struct super_block *sb,
|
||||||
#define UNRESPONSIVE_PROBES 3
|
struct scoutfs_net_connection *conn,
|
||||||
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|
||||||
struct socket *sock)
|
struct socket *sock)
|
||||||
{
|
{
|
||||||
|
struct scoutfs_mount_options opts;
|
||||||
int optval;
|
int optval;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
scoutfs_options_read(sb, &opts);
|
||||||
|
|
||||||
/* we use a keepalive timeout instead of send timeout */
|
/* we use a keepalive timeout instead of send timeout */
|
||||||
ret = kc_sock_set_sndtimeo(sock, 0);
|
ret = kc_sock_set_sndtimeo(sock, 0);
|
||||||
if (ret)
|
if (ret)
|
||||||
@@ -1030,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
|
optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
|
||||||
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
|
|
||||||
ret = kc_tcp_sock_set_keepidle(sock, optval);
|
ret = kc_tcp_sock_set_keepidle(sock, optval);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
@@ -1041,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
|
optval = opts.tcp_keepalive_timeout_ms;
|
||||||
ret = kc_tcp_sock_set_user_timeout(sock, optval);
|
ret = kc_tcp_sock_set_user_timeout(sock, optval);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
@@ -1115,7 +1117,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = sock_opts_and_names(acc_conn, acc_sock);
|
ret = sock_opts_and_names(sb, acc_conn, acc_sock);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
sock_release(acc_sock);
|
sock_release(acc_sock);
|
||||||
destroy_conn(acc_conn);
|
destroy_conn(acc_conn);
|
||||||
@@ -1186,7 +1188,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
ret = sock_opts_and_names(conn, sock);
|
ret = sock_opts_and_names(sb, conn, sock);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ enum {
|
|||||||
Opt_orphan_scan_delay_ms,
|
Opt_orphan_scan_delay_ms,
|
||||||
Opt_quorum_heartbeat_timeout_ms,
|
Opt_quorum_heartbeat_timeout_ms,
|
||||||
Opt_quorum_slot_nr,
|
Opt_quorum_slot_nr,
|
||||||
|
Opt_tcp_keepalive_timeout_ms,
|
||||||
Opt_err,
|
Opt_err,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
|
|||||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||||
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
||||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||||
|
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
|
||||||
{Opt_err, NULL}
|
{Opt_err, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
|
|||||||
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
||||||
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
||||||
|
|
||||||
|
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (60 * MSEC_PER_SEC)
|
||||||
|
|
||||||
static void init_default_options(struct scoutfs_mount_options *opts)
|
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||||
{
|
{
|
||||||
memset(opts, 0, sizeof(*opts));
|
memset(opts, 0, sizeof(*opts));
|
||||||
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
|||||||
opts->orphan_scan_delay_ms = -1;
|
opts->orphan_scan_delay_ms = -1;
|
||||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||||
opts->quorum_slot_nr = -1;
|
opts->quorum_slot_nr = -1;
|
||||||
|
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
||||||
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
|
||||||
|
{
|
||||||
|
if (ret < 0) {
|
||||||
|
scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
|
||||||
|
scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %lu",
|
||||||
|
val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Parse the option string into our options struct. This can allocate
|
* Parse the option string into our options struct. This can allocate
|
||||||
* memory in the struct. The caller is responsible for always calling
|
* memory in the struct. The caller is responsible for always calling
|
||||||
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
|||||||
opts->data_prealloc_contig_only = nr;
|
opts->data_prealloc_contig_only = nr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case Opt_tcp_keepalive_timeout_ms:
|
||||||
|
ret = match_int(args, &nr);
|
||||||
|
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
opts->tcp_keepalive_timeout_ms = nr;
|
||||||
|
break;
|
||||||
|
|
||||||
case Opt_log_merge_wait_timeout_ms:
|
case Opt_log_merge_wait_timeout_ms:
|
||||||
ret = match_int(args, &nr);
|
ret = match_int(args, &nr);
|
||||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
||||||
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
|||||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||||
if (opts.quorum_slot_nr >= 0)
|
if (opts.quorum_slot_nr >= 0)
|
||||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||||
|
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
|
|||||||
unsigned int orphan_scan_delay_ms;
|
unsigned int orphan_scan_delay_ms;
|
||||||
int quorum_slot_nr;
|
int quorum_slot_nr;
|
||||||
u64 quorum_heartbeat_timeout_ms;
|
u64 quorum_heartbeat_timeout_ms;
|
||||||
|
int tcp_keepalive_timeout_ms;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define UNRESPONSIVE_PROBES 3
|
||||||
|
|
||||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,23 @@ the server for the filesystem if it is elected leader.
|
|||||||
The assigned number must match one of the slots defined with \-Q options
|
The assigned number must match one of the slots defined with \-Q options
|
||||||
when the filesystem was created with mkfs. If the number assigned
|
when the filesystem was created with mkfs. If the number assigned
|
||||||
doesn't match a number created during mkfs then the mount will fail.
|
doesn't match a number created during mkfs then the mount will fail.
|
||||||
|
.TP
|
||||||
|
.B tcp_keepalive_timeout_ms=<number>
|
||||||
|
This option sets the amount of time, in milliseconds, that a client
|
||||||
|
connection will wait without receiving any TCP packets before deciding that
|
||||||
|
the connection is dead. This setting is per-mount and only changes
|
||||||
|
the behavior of that mount.
|
||||||
|
.sp
|
||||||
|
The default value of this setting is 60000msec (60s). Any precision
|
||||||
|
beyond a whole second is likely unrealistic due to the nature of
|
||||||
|
TCP keepalive mechanisms in the Linux kernel. Valid values are any
|
||||||
|
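value higher than 3000 (3s); values below 4000 (4s) leave no whole-second
idle period before keepalive probing starts and should be avoided.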
|
||||||
|
.sp
|
||||||
|
The TCP keepalive mechanism is complex and observing a lost connection
|
||||||
|
quickly is important to maintain cluster stability. If the local
|
||||||
|
network suffers from intermittent outages this option may provide
|
||||||
|
some respite to overcome these outages without the cluster becoming
|
||||||
|
desynchronized.
|
||||||
.SH VOLUME OPTIONS
|
.SH VOLUME OPTIONS
|
||||||
Volume options are persistent options which are stored in the super
|
Volume options are persistent options which are stored in the super
|
||||||
block in the metadata device and which apply to all mounts of the volume.
|
block in the metadata device and which apply to all mounts of the volume.
|
||||||
|
|||||||
Reference in New Issue
Block a user