diff --git a/kmod/src/counters.h b/kmod/src/counters.h
index dd291816..42deed1d 100644
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -145,6 +145,7 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
+	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
 	EXPAND_COUNTER(net_send_error)				\
diff --git a/kmod/src/options.c b/kmod/src/options.c
index b7a1148b..2520b75a 100644
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -33,6 +33,7 @@ enum {
 	Opt_acl,
 	Opt_data_prealloc_blocks,
 	Opt_data_prealloc_contig_only,
+	Opt_log_merge_wait_timeout_ms,
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
 	{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
+	{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
 	kfree(opts->metadev_path);
 }
 
+#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS	100UL
+#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS	500
+#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS	(60 * MSEC_PER_SEC)
+
 #define MIN_ORPHAN_SCAN_DELAY_MS	100UL
 #define DEFAULT_ORPHAN_SCAN_DELAY_MS	(10 * MSEC_PER_SEC)
 #define MAX_ORPHAN_SCAN_DELAY_MS	(60 * MSEC_PER_SEC)
@@ -126,11 +132,32 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 
 	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
 	opts->data_prealloc_contig_only = 1;
+	opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
 }
 
+/*
+ * The minimum is an unsigned long constant.  Compare against the limits
+ * as ints so that a negative value isn't converted to a huge unsigned
+ * value that slips past the lower bound check.
+ */
+static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
+{
+	if (ret < 0) {
+		scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
+		return -EINVAL;
+	}
+	if (val < (int)MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > (int)MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
+		scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
+			    val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
 {
 	if (ret < 0) {
@@ -196,6 +223,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;
 
+		case Opt_log_merge_wait_timeout_ms:
+			ret = match_int(args, &nr);
+			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
+			if (ret < 0)
+				return ret;
+			opts->log_merge_wait_timeout_ms = nr;
+			break;
+
 		case Opt_metadev_path:
 			ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
 			if (ret < 0)
@@ -422,6 +457,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
 }
 SCOUTFS_ATTR_RW(data_prealloc_contig_only);
 
+static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+					      char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
+}
+static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+					       const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[30]; /* more than enough for octal -U64_MAX */
+	int val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtoint(nullterm, 0, &val);
+	ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
+	if (ret == 0) {
+		write_seqlock(&optinf->seqlock);
+		optinf->opts.log_merge_wait_timeout_ms = val;
+		write_sequnlock(&optinf->seqlock);
+		ret = count;
+	}
+
+	return ret;
+}
+SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
+
 static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -525,6 +597,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
 static struct attribute *options_attrs[] = {
 	SCOUTFS_ATTR_PTR(data_prealloc_blocks),
 	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
+	SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
 	SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
diff --git a/kmod/src/options.h b/kmod/src/options.h
index 639f3882..4eebd669 100644
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -8,6 +8,7 @@
 struct scoutfs_mount_options {
 	u64 data_prealloc_blocks;
 	bool data_prealloc_contig_only;
+	unsigned int log_merge_wait_timeout_ms;
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
diff --git a/kmod/src/server.c b/kmod/src/server.c
index 698b76bd..1b6952f9 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -1052,21 +1052,13 @@ static int next_log_merge_item(struct super_block *sb,
  * abandoned log btree finalized. If it takes too long each client has
  * a change to make forward progress before being asked to commit again.
  *
- * We're waiting on heavy state that is protected by mutexes and
- * transaction machinery. It's tricky to recreate that state for
- * lightweight condition tests that don't change task state. Instead of
- * trying to get that right, particularly as we unwind after success or
- * after timeouts, waiters use an unsatisfying poll. Short enough to
- * not add terrible latency, given how heavy and infrequent this already
- * is, and long enough to not melt the cpu. This could be tuned if it
- * becomes a problem.
- *
  * This can end up finalizing a new empty log btree if a new mount
  * happens to arrive at just the right time. That's fine, merging will
  * ignore and tear down the empty input.
  */
-#define FINALIZE_POLL_MS	(11)
-#define FINALIZE_TIMEOUT_MS	(MSEC_PER_SEC / 2)
+#define FINALIZE_POLL_MIN_DELAY_MS	5U
+#define FINALIZE_POLL_MAX_DELAY_MS	100U
+#define FINALIZE_POLL_DELAY_GROWTH_PCT	150U
 static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
 					u64 rid, struct commit_hold *hold)
 {
@@ -1074,8 +1066,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
 	struct scoutfs_log_merge_status stat;
 	struct scoutfs_log_merge_range rng;
+	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
 	struct scoutfs_log_trees fin;
+	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
 	bool others_active;
@@ -1083,10 +1077,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	bool ours_visible;
 	struct scoutfs_key key;
 	char *err_str = NULL;
+	ktime_t start;
 	int ret;
 	int err;
 
-	timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
+	scoutfs_options_read(sb, &opts);
+	timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
+	delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
+	start = ktime_get_raw();
 
 	for (;;) {
 		/* nothing to do if there's already a merge in flight */
@@ -1201,13 +1199,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			if (ret < 0)
 				err_str = "applying commit before waiting for finalized";
 
-			msleep(FINALIZE_POLL_MS);
+			msleep(delay_ms);
+			delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
+				       FINALIZE_POLL_MAX_DELAY_MS);
 
 			server_hold_commit(sb, hold);
 			mutex_lock(&server->logs_mutex);
 
 			/* done if we timed out */
 			if (time_after(jiffies, timeo)) {
+				scoutfs_inc_counter(sb, log_merge_wait_timeout);
 				ret = 0;
 				break;
 			}
diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5
index 6078bb74..fb56d19d 100644
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
 different regions) and wasted space isn't an issue (perhaps because the
 file population contains few small files).
 .TP
+.B log_merge_wait_timeout_ms=
+This option sets the amount of time, in milliseconds, that log merge
+creation can wait before timing out. This setting is per-mount, only
+changes the behavior of that mount, and only affects the server when it
+is running in that mount.
+.sp
+This determines how long it may take for mounts to synchronize
+committing their log trees to create a log merge operation. Setting it
+too high can create long latencies in the event that a mount takes a
+long time to commit their log. Setting it too low can result in the
+creation of excessive numbers of log trees that are never merged. The
+default is 500 and it cannot be less than 100 nor greater than 60000.
+.TP
 .B metadev_path=
 The metadev_path option specifies the path to the block device that
 contains the filesystem's metadata.