mirror of
https://github.com/versity/scoutfs.git
synced 2025-12-23 05:25:18 +00:00
Merge pull request #239 from versity/auke/keepalive
Add tcp_keepalive_timeout_ms option, change default to 60s
This commit is contained in:
@@ -32,6 +32,7 @@
|
|||||||
#include "endian_swap.h"
|
#include "endian_swap.h"
|
||||||
#include "tseq.h"
|
#include "tseq.h"
|
||||||
#include "fence.h"
|
#include "fence.h"
|
||||||
|
#include "options.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* scoutfs networking delivers requests and responses between nodes.
|
* scoutfs networking delivers requests and responses between nodes.
|
||||||
@@ -998,7 +999,7 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
|
|||||||
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
|
* The TCP_KEEP* and TCP_USER_TIMEOUT option interaction is subtle.
|
||||||
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
|
* TCP_USER_TIMEOUT only applies if there is unacked written data in the
|
||||||
* send queue. It doesn't work if the connection is idle. Adding
|
* send queue. It doesn't work if the connection is idle. Adding
|
||||||
* keepalice probes with user_timeout set changes how the keepalive
|
* keepalive probes with user_timeout set changes how the keepalive
|
||||||
* timeout is calculated. CNT no longer matters. Each time
|
* timeout is calculated. CNT no longer matters. Each time
|
||||||
* additional probes (not the first) are sent the user timeout is
|
* additional probes (not the first) are sent the user timeout is
|
||||||
* checked against the last time data was received. If none of the
|
* checked against the last time data was received. If none of the
|
||||||
@@ -1010,14 +1011,16 @@ static void destroy_conn(struct scoutfs_net_connection *conn)
|
|||||||
* elapses during the probe timer processing after the unsuccessful
|
* elapses during the probe timer processing after the unsuccessful
|
||||||
* probes.
|
* probes.
|
||||||
*/
|
*/
|
||||||
#define UNRESPONSIVE_TIMEOUT_SECS 10
|
static int sock_opts_and_names(struct super_block *sb,
|
||||||
#define UNRESPONSIVE_PROBES 3
|
struct scoutfs_net_connection *conn,
|
||||||
static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|
||||||
struct socket *sock)
|
struct socket *sock)
|
||||||
{
|
{
|
||||||
|
struct scoutfs_mount_options opts;
|
||||||
int optval;
|
int optval;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
scoutfs_options_read(sb, &opts);
|
||||||
|
|
||||||
/* we use a keepalive timeout instead of send timeout */
|
/* we use a keepalive timeout instead of send timeout */
|
||||||
ret = kc_sock_set_sndtimeo(sock, 0);
|
ret = kc_sock_set_sndtimeo(sock, 0);
|
||||||
if (ret)
|
if (ret)
|
||||||
@@ -1030,8 +1033,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
BUILD_BUG_ON(UNRESPONSIVE_PROBES >= UNRESPONSIVE_TIMEOUT_SECS);
|
optval = (opts.tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES;
|
||||||
optval = UNRESPONSIVE_TIMEOUT_SECS - (UNRESPONSIVE_PROBES);
|
|
||||||
ret = kc_tcp_sock_set_keepidle(sock, optval);
|
ret = kc_tcp_sock_set_keepidle(sock, optval);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
@@ -1041,7 +1043,7 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
optval = UNRESPONSIVE_TIMEOUT_SECS * MSEC_PER_SEC;
|
optval = opts.tcp_keepalive_timeout_ms;
|
||||||
ret = kc_tcp_sock_set_user_timeout(sock, optval);
|
ret = kc_tcp_sock_set_user_timeout(sock, optval);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
@@ -1115,7 +1117,7 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = sock_opts_and_names(acc_conn, acc_sock);
|
ret = sock_opts_and_names(sb, acc_conn, acc_sock);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
sock_release(acc_sock);
|
sock_release(acc_sock);
|
||||||
destroy_conn(acc_conn);
|
destroy_conn(acc_conn);
|
||||||
@@ -1186,7 +1188,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
ret = sock_opts_and_names(conn, sock);
|
ret = sock_opts_and_names(sb, conn, sock);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ enum {
|
|||||||
Opt_orphan_scan_delay_ms,
|
Opt_orphan_scan_delay_ms,
|
||||||
Opt_quorum_heartbeat_timeout_ms,
|
Opt_quorum_heartbeat_timeout_ms,
|
||||||
Opt_quorum_slot_nr,
|
Opt_quorum_slot_nr,
|
||||||
|
Opt_tcp_keepalive_timeout_ms,
|
||||||
Opt_err,
|
Opt_err,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -52,6 +53,7 @@ static const match_table_t tokens = {
|
|||||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||||
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
||||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||||
|
{Opt_tcp_keepalive_timeout_ms, "tcp_keepalive_timeout_ms=%s"},
|
||||||
{Opt_err, NULL}
|
{Opt_err, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -126,6 +128,8 @@ static void free_options(struct scoutfs_mount_options *opts)
|
|||||||
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
||||||
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
||||||
|
|
||||||
|
#define DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS (60 * MSEC_PER_SEC)
|
||||||
|
|
||||||
static void init_default_options(struct scoutfs_mount_options *opts)
|
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||||
{
|
{
|
||||||
memset(opts, 0, sizeof(*opts));
|
memset(opts, 0, sizeof(*opts));
|
||||||
@@ -136,6 +140,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
|||||||
opts->orphan_scan_delay_ms = -1;
|
opts->orphan_scan_delay_ms = -1;
|
||||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||||
opts->quorum_slot_nr = -1;
|
opts->quorum_slot_nr = -1;
|
||||||
|
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
||||||
@@ -168,6 +173,21 @@ static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Validate a parsed tcp_keepalive_timeout_ms mount option value.
 *
 * @sb:  super block, used only as context for error logging
 * @ret: result of parsing the option argument; negative means the
 *       argument could not be parsed as an integer
 * @val: the parsed timeout in milliseconds
 *
 * Returns 0 when the value is acceptable.  Returns -EINVAL, after
 * logging an error, when parsing failed or when val is not strictly
 * greater than UNRESPONSIVE_PROBES * MSEC_PER_SEC.
 *
 * NOTE(review): sock_opts_and_names() derives the keepidle value as
 * (tcp_keepalive_timeout_ms / MSEC_PER_SEC) - UNRESPONSIVE_PROBES using
 * integer division.  Values that pass this check but are less than one
 * full second above the minimum therefore produce a keepidle of 0.
 * Confirm that kc_tcp_sock_set_keepidle() accepts 0, or tighten the
 * lower bound here.  There is also no upper bound check -- confirm
 * whether the socket option setters limit the range.
 */
static int verify_tcp_keepalive_timeout_ms(struct super_block *sb, int ret, int val)
|
||||||
|
{
|
||||||
|
if (ret < 0) {
|
||||||
|
scoutfs_err(sb, "failed to parse tcp_keepalive_timeout_ms value");
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
if (val <= (UNRESPONSIVE_PROBES * MSEC_PER_SEC)) {
|
||||||
|
scoutfs_err(sb, "invalid tcp_keepalive_timeout_ms value %d, must be larger than %lu",
|
||||||
|
val, (UNRESPONSIVE_PROBES * MSEC_PER_SEC));
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Parse the option string into our options struct. This can allocate
|
* Parse the option string into our options struct. This can allocate
|
||||||
* memory in the struct. The caller is responsible for always calling
|
* memory in the struct. The caller is responsible for always calling
|
||||||
@@ -218,6 +238,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
|||||||
opts->data_prealloc_contig_only = nr;
|
opts->data_prealloc_contig_only = nr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case Opt_tcp_keepalive_timeout_ms:
|
||||||
|
ret = match_int(args, &nr);
|
||||||
|
ret = verify_tcp_keepalive_timeout_ms(sb, ret, nr);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
opts->tcp_keepalive_timeout_ms = nr;
|
||||||
|
break;
|
||||||
|
|
||||||
case Opt_log_merge_wait_timeout_ms:
|
case Opt_log_merge_wait_timeout_ms:
|
||||||
ret = match_int(args, &nr);
|
ret = match_int(args, &nr);
|
||||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
||||||
@@ -371,6 +399,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
|||||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||||
if (opts.quorum_slot_nr >= 0)
|
if (opts.quorum_slot_nr >= 0)
|
||||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||||
|
seq_printf(seq, ",tcp_keepalive_timeout_ms=%d", opts.tcp_keepalive_timeout_ms);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,8 +13,11 @@ struct scoutfs_mount_options {
|
|||||||
unsigned int orphan_scan_delay_ms;
|
unsigned int orphan_scan_delay_ms;
|
||||||
int quorum_slot_nr;
|
int quorum_slot_nr;
|
||||||
u64 quorum_heartbeat_timeout_ms;
|
u64 quorum_heartbeat_timeout_ms;
|
||||||
|
int tcp_keepalive_timeout_ms;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define UNRESPONSIVE_PROBES 3
|
||||||
|
|
||||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,23 @@ the server for the filesystem if it is elected leader.
|
|||||||
The assigned number must match one of the slots defined with \-Q options
|
The assigned number must match one of the slots defined with \-Q options
|
||||||
when the filesystem was created with mkfs. If the number assigned
|
when the filesystem was created with mkfs. If the number assigned
|
||||||
doesn't match a number created during mkfs then the mount will fail.
|
doesn't match a number created during mkfs then the mount will fail.
|
||||||
|
.TP
|
||||||
|
.B tcp_keepalive_timeout_ms=<number>
|
||||||
|
This option sets the amount of time, in milliseconds, that a client
|
||||||
|
connection will wait for active TCP packets before deciding that
|
||||||
|
the connection is dead. This setting is per-mount and only changes
|
||||||
|
the behavior of that mount.
|
||||||
|
.sp
|
||||||
|
The default value of this setting is 60000msec (60s). Any precision
|
||||||
|
beyond a whole second is likely unrealistic due to the nature of
|
||||||
|
TCP keepalive mechanisms in the Linux kernel. Any value greater than
|
||||||
|
3000 (3s) is valid.
|
||||||
|
.sp
|
||||||
|
The TCP keepalive mechanism is complex, and observing a lost connection
|
||||||
|
quickly is important to maintain cluster stability. If the local
|
||||||
|
network suffers from intermittent outages, this option may provide
|
||||||
|
some respite to overcome these outages without the cluster becoming
|
||||||
|
desynchronized.
|
||||||
.SH VOLUME OPTIONS
|
.SH VOLUME OPTIONS
|
||||||
Volume options are persistent options which are stored in the super
|
Volume options are persistent options which are stored in the super
|
||||||
block in the metadata device and which apply to all mounts of the volume.
|
block in the metadata device and which apply to all mounts of the volume.
|
||||||
|
|||||||
Reference in New Issue
Block a user