From 3c0616524a56ceee8856b2c62d096560a60fb923 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 17 Jan 2024 14:14:02 -0800 Subject: [PATCH 1/7] Only search last log_trees per rid for finalizing During get_log_trees the server checks log_trees items to see if it should start a log merge operation. It did this by iterating over all log_trees items and there can be quite a lot of them. It doesn't need to see all of the items. It only needs to see the most recent log_trees item for each mount. That's enough to make the decisions that start the log merging process. Signed-off-by: Zach Brown --- kmod/src/server.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 2593ef1a..5d5eec50 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -938,22 +938,24 @@ static int find_log_trees_item(struct super_block *sb, } /* - * Find the next log_trees item from the key. Fills the caller's log_trees and sets - * the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each - * item, and -errno on fatal errors. + * Find the log_trees item with the greatest nr for each rid. Fills the + * caller's log_trees and sets the key before the returned log_trees for + * the next iteration. Returns 0 when done, > 0 for each item, and + * -errno on fatal errors. */ -static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *key, struct scoutfs_log_trees *lt) +static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root, + struct scoutfs_key *key, struct scoutfs_log_trees *lt) { SCOUTFS_BTREE_ITEM_REF(iref); int ret; - ret = scoutfs_btree_next(sb, root, key, &iref); + ret = scoutfs_btree_prev(sb, root, key, &iref); if (ret == 0) { if (iref.val_len == sizeof(struct scoutfs_log_trees)) { memcpy(lt, iref.val, iref.val_len); *key = *iref.key; - scoutfs_key_inc(key); + key->sklt_nr = 0; + scoutfs_key_dec(key); ret = 1; } else { ret = -EIO; @@ -1099,8 +1101,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l saw_finalized = false; others_active = false; ours_visible = false; - scoutfs_key_init_log_trees(&key, 0, 0); - while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { + scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX); + while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) saw_finalized = true; @@ -1132,9 +1134,9 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l } /* send sync requests soon to give time to commit */ - scoutfs_key_init_log_trees(&key, 0, 0); + scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX); while (others_active && - (ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { + (ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) || (le64_to_cpu(each_lt.rid) == rid)) From 50168a2d2a282bb3f04477667bf92ff146fb3df6 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 17 Jan 2024 10:04:38 -0800 Subject: [PATCH 2/7] Check each client's last log item for stable seq The server was checking all client log_trees items to search for the lowest commit seq that was still open. This can be expensive when there are a lot of finalized log_trees items that won't have open seqs. 
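(To make the per-rid reverse iteration from the previous patch concrete, here is a minimal standalone sketch of the key stepping; the toy key type and helpers below are illustrative stand-ins, not the scoutfs ones. With items sorted by (rid, nr), a backwards search from the iteration key returns the greatest nr for some rid, and clamping nr to zero before decrementing skips every older item for that rid.)

struct toy_key { unsigned long long rid; unsigned long long nr; };

/* decrement with borrow, like a multi-word key decrement */
static void toy_key_dec(struct toy_key *k)
{
	if (k->nr-- == 0)
		k->rid--;
}

/*
 * A backwards search returned the item at *found, the greatest nr for
 * found->rid.  Step the iteration key to (found->rid - 1, max nr) so
 * the next backwards search lands on the previous rid's newest item.
 */
static void toy_step_to_prev_rid(struct toy_key *key, const struct toy_key *found)
{
	*key = *found;
	key->nr = 0;
	toy_key_dec(key);
}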
Only the last log_trees item for each client rid can be open, and the items are sorted by rid and nr, so we can easily only check the last item for each client rid. Signed-off-by: Zach Brown --- kmod/src/server.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 5d5eec50..75bdb08e 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1785,43 +1785,29 @@ out: * Give the caller the last seq before outstanding client commits. All * seqs up to and including this are stable, new client transactions can * only have greater seqs. + * + * For each rid, only its greatest log trees nr can be an open commit. + * We look at the last log_trees item for each client rid and record its + * trans seq if it hasn't been committed. */ static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret) { struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb); DECLARE_SERVER_INFO(sb, server); - SCOUTFS_BTREE_ITEM_REF(iref); - struct scoutfs_log_trees *lt; + struct scoutfs_log_trees lt; struct scoutfs_key key; u64 last_seq = 0; int ret; last_seq = scoutfs_server_seq(sb) - 1; - scoutfs_key_init_log_trees(&key, 0, 0); mutex_lock(&server->logs_mutex); - for (;; scoutfs_key_inc(&key)) { - ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref); - if (ret == 0) { - if (iref.val_len == sizeof(*lt)) { - lt = iref.val; - if ((le64_to_cpu(lt->get_trans_seq) > - le64_to_cpu(lt->commit_trans_seq)) && - le64_to_cpu(lt->get_trans_seq) <= last_seq) { - last_seq = le64_to_cpu(lt->get_trans_seq) - 1; - } - key = *iref.key; - } else { - ret = -EIO; - } - scoutfs_btree_put_iref(&iref); - } - if (ret < 0) { - if (ret == -ENOENT) { - ret = 0; - break; - } + scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX); + while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, <)) > 0) { + if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) && + le64_to_cpu(lt.get_trans_seq) <= last_seq) { + last_seq = le64_to_cpu(lt.get_trans_seq) - 1; } } From f654fa0fdaeab2ccbfa1a45296ba3be706e52884 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 17 Jan 2024 15:17:55 -0800 Subject: [PATCH 3/7] Send syncs once when starting to merge The server sends sync requests to clients when it sees that they have open log trees that need to be committed for log merging to proceed. These are currently sent in the context of each client's get_log_trees request, resulting in sync requests queued for one client from all clients. Depending on message delivery and commit latencies, this can create a sync storm. The server's sends are reliable and the open commits are marked with the seq when they opened. It's easy for us to record having sent syncs to all open commits so that future attempts can be avoided. Later open commits will have higher seqs and will get a new round of syncs sent. 
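A rough standalone sketch of that bookkeeping (the toy_* names and types below are assumptions, not the server's interfaces): the server remembers the seq it saw when it last sent a round of syncs, skips any open commit at or below that seq, and later opens, which get greater seqs, trigger a fresh round.

struct toy_client { unsigned long long open_seq; };

static void toy_send_sync(struct toy_client *c) { /* queue a sync request to the client */ }

static void toy_send_syncs_once(struct toy_client *clients, int nr,
				unsigned long long *sent_seq,
				unsigned long long server_seq)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* commits opened before our last round were already asked to sync */
		if (clients[i].open_seq <= *sent_seq)
			continue;
		toy_send_sync(&clients[i]);
	}

	/* remember this round; only later opens will be asked again */
	*sent_seq = server_seq;
}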
Signed-off-by: Zach Brown --- kmod/src/server.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 75bdb08e..698b76bd 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -148,6 +148,8 @@ struct server_info { struct scoutfs_quorum_config qconf; /* a running server maintains a private dirty super */ struct scoutfs_super_block dirty_super; + + u64 finalize_sent_seq; }; #define DECLARE_SERVER_INFO(sb, name) \ @@ -1139,7 +1141,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l (ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) || - (le64_to_cpu(each_lt.rid) == rid)) + (le64_to_cpu(each_lt.rid) == rid) || + (le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq)) continue; ret = scoutfs_net_submit_request_node(sb, server->conn, @@ -1159,6 +1162,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l break; } + server->finalize_sent_seq = scoutfs_server_seq(sb); + /* Finalize ours if it's visible to others */ if (ours_visible) { fin = *lt; @@ -4286,6 +4291,7 @@ static void scoutfs_server_worker(struct work_struct *work) scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin)); scoutfs_block_writer_init(sb, &server->wri); + server->finalize_sent_seq = 0; /* first make sure no other servers are still running */ ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term); From 90a4c82363dc9bfa30f35d15e98cd8d011987f0c Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 19 Jan 2024 13:11:48 -0800 Subject: [PATCH 4/7] Make log merge wait timeout tunable Add a mount option for the amount of time that log merge creation can wait before giving up. We add some counters so we can see how often the timeout is being hit and what the average successfull wait time is. 
Signed-off-by: Zach Brown --- kmod/src/counters.h | 1 + kmod/src/options.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ kmod/src/options.h | 1 + kmod/src/server.c | 27 +++++++++--------- utils/man/scoutfs.5 | 13 +++++++++ 5 files changed, 97 insertions(+), 13 deletions(-) diff --git a/kmod/src/counters.h b/kmod/src/counters.h index dd291816..42deed1d 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -145,6 +145,7 @@ EXPAND_COUNTER(lock_shrink_work) \ EXPAND_COUNTER(lock_unlock) \ EXPAND_COUNTER(lock_wait) \ + EXPAND_COUNTER(log_merge_wait_timeout) \ EXPAND_COUNTER(net_dropped_response) \ EXPAND_COUNTER(net_send_bytes) \ EXPAND_COUNTER(net_send_error) \ diff --git a/kmod/src/options.c b/kmod/src/options.c index b7a1148b..2520b75a 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -33,6 +33,7 @@ enum { Opt_acl, Opt_data_prealloc_blocks, Opt_data_prealloc_contig_only, + Opt_log_merge_wait_timeout_ms, Opt_metadev_path, Opt_noacl, Opt_orphan_scan_delay_ms, @@ -45,6 +46,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"}, {Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"}, + {Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"}, {Opt_metadev_path, "metadev_path=%s"}, {Opt_noacl, "noacl"}, {Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"}, @@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts) kfree(opts->metadev_path); } +#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL +#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500 +#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC) + #define MIN_ORPHAN_SCAN_DELAY_MS 100UL #define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC) #define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC) @@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts) opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS; opts->data_prealloc_contig_only = 1; + opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS; opts->orphan_scan_delay_ms = -1; opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS; opts->quorum_slot_nr = -1; } +static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val) +{ + if (ret < 0) { + scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value"); + return -EINVAL; + } + if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) { + scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu", + val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS); + return -EINVAL; + } + + return 0; +} + static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val) { if (ret < 0) { @@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m opts->data_prealloc_contig_only = nr; break; + case Opt_log_merge_wait_timeout_ms: + ret = match_int(args, &nr); + ret = verify_log_merge_wait_timeout_ms(sb, ret, nr); + if (ret < 0) + return ret; + opts->log_merge_wait_timeout_ms = nr64; + break; + case Opt_metadev_path: ret = parse_bdev_path(sb, &args[0], &opts->metadev_path); if (ret < 0) @@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj } SCOUTFS_ATTR_RW(data_prealloc_contig_only); +static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct 
scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms); +} +static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + DECLARE_OPTIONS_INFO(sb, optinf); + char nullterm[30]; /* more than enough for octal -U64_MAX */ + int val; + int len; + int ret; + + len = min(count, sizeof(nullterm) - 1); + memcpy(nullterm, buf, len); + nullterm[len] = '\0'; + + ret = kstrtoint(nullterm, 0, &val); + ret = verify_log_merge_wait_timeout_ms(sb, ret, val); + if (ret == 0) { + write_seqlock(&optinf->seqlock); + optinf->opts.log_merge_wait_timeout_ms = val; + write_sequnlock(&optinf->seqlock); + ret = count; + } + + return ret; +} +SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms); + static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); @@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr); static struct attribute *options_attrs[] = { SCOUTFS_ATTR_PTR(data_prealloc_blocks), SCOUTFS_ATTR_PTR(data_prealloc_contig_only), + SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms), SCOUTFS_ATTR_PTR(metadev_path), SCOUTFS_ATTR_PTR(orphan_scan_delay_ms), SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms), diff --git a/kmod/src/options.h b/kmod/src/options.h index 639f3882..4eebd669 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -8,6 +8,7 @@ struct scoutfs_mount_options { u64 data_prealloc_blocks; bool data_prealloc_contig_only; + unsigned int log_merge_wait_timeout_ms; char *metadev_path; unsigned int orphan_scan_delay_ms; int quorum_slot_nr; diff --git a/kmod/src/server.c b/kmod/src/server.c index 698b76bd..1b6952f9 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1052,21 +1052,13 @@ static int next_log_merge_item(struct super_block *sb, * abandoned log btree finalized. If it takes too long each client has * a change to make forward progress before being asked to commit again. * - * We're waiting on heavy state that is protected by mutexes and - * transaction machinery. It's tricky to recreate that state for - * lightweight condition tests that don't change task state. Instead of - * trying to get that right, particularly as we unwind after success or - * after timeouts, waiters use an unsatisfying poll. Short enough to - * not add terrible latency, given how heavy and infrequent this already - * is, and long enough to not melt the cpu. This could be tuned if it - * becomes a problem. - * * This can end up finalizing a new empty log btree if a new mount * happens to arrive at just the right time. That's fine, merging will * ignore and tear down the empty input. 
*/ -#define FINALIZE_POLL_MS (11) -#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2) +#define FINALIZE_POLL_MIN_DELAY_MS 5U +#define FINALIZE_POLL_MAX_DELAY_MS 100U +#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt, u64 rid, struct commit_hold *hold) { @@ -1074,8 +1066,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb); struct scoutfs_log_merge_status stat; struct scoutfs_log_merge_range rng; + struct scoutfs_mount_options opts; struct scoutfs_log_trees each_lt; struct scoutfs_log_trees fin; + unsigned int delay_ms; unsigned long timeo; bool saw_finalized; bool others_active; @@ -1083,10 +1077,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l bool ours_visible; struct scoutfs_key key; char *err_str = NULL; + ktime_t start; int ret; int err; - timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS); + scoutfs_options_read(sb, &opts); + timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms); + delay_ms = FINALIZE_POLL_MIN_DELAY_MS; + start = ktime_get_raw(); for (;;) { /* nothing to do if there's already a merge in flight */ @@ -1201,13 +1199,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l if (ret < 0) err_str = "applying commit before waiting for finalized"; - msleep(FINALIZE_POLL_MS); + msleep(delay_ms); + delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100, + FINALIZE_POLL_MAX_DELAY_MS); server_hold_commit(sb, hold); mutex_lock(&server->logs_mutex); /* done if we timed out */ if (time_after(jiffies, timeo)) { + scoutfs_inc_counter(sb, log_merge_wait_timeout); ret = 0; break; } diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5 index 6078bb74..fb56d19d 100644 --- a/utils/man/scoutfs.5 +++ b/utils/man/scoutfs.5 @@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to different regions) and wasted space isn't an issue (perhaps because the file population contains few small files). .TP +.B log_merge_wait_timeout_ms= +This option sets the amount of time, in milliseconds, that log merge +creation can wait before timing out. This setting is per-mount, only +changes the behavior of that mount, and only affects the server when it +is running in that mount. +.sp +This determines how long it may take for mounts to synchronize +committing their log trees to create a log merge operation. Setting it +too high can create long latencies in the event that a mount takes a +long time to commit their log. Setting it too low can result in the +creation of excessive numbers of log trees that are never merged. The +default is 500 and it can not be less than 100 nor greater than 60000. +.TP .B metadev_path= The metadev_path option specifies the path to the block device that contains the filesystem's metadata. 
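Besides the timeout option, the patch above also replaces the fixed 11ms finalize poll with a growing delay between retries. A small standalone sketch of that backoff, reusing the constants from the patch (the helper name is illustrative):

#define FINALIZE_POLL_MIN_DELAY_MS	5U
#define FINALIZE_POLL_MAX_DELAY_MS	100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT	150U

/* starting from the minimum, successive delays are 5, 7, 10, 15, 22, 33, 49, 73, 100, 100, ... ms */
static unsigned int toy_next_poll_delay(unsigned int delay_ms)
{
	unsigned int next = delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100;

	return next > FINALIZE_POLL_MAX_DELAY_MS ? FINALIZE_POLL_MAX_DELAY_MS : next;
}

As for the option itself, a mount would pass something like log_merge_wait_timeout_ms=1000 among its mount options, or write a new value to the option's read-write sysfs attribute at runtime; the exact sysfs path depends on the mount's options directory and isn't shown in the patch.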
From b5630f540ddb1d0802f96c631fdc10c102c5f37a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 25 Jan 2024 10:09:13 -0800 Subject: [PATCH 5/7] Add tracing of the log merge finalizing decision Signed-off-by: Zach Brown --- kmod/src/scoutfs_trace.h | 65 ++++++++++++++++++++++++++++++++++++++++ kmod/src/server.c | 9 ++++++ 2 files changed, 74 insertions(+) diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 838fe13f..a5f41b78 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2075,6 +2075,71 @@ TRACE_EVENT(scoutfs_trans_seq_last, SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq) ); +TRACE_EVENT(scoutfs_server_finalize_items, + TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags, + u64 item_get_trans_seq), + + TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, c_rid) + __field(__u64, item_rid) + __field(__u64, item_nr) + __field(__u64, item_flags) + __field(__u64, item_get_trans_seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->c_rid = rid; + __entry->item_rid = item_rid; + __entry->item_nr = item_nr; + __entry->item_flags = item_flags; + __entry->item_get_trans_seq = item_get_trans_seq; + ), + + TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu", + SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr, + __entry->item_flags, __entry->item_get_trans_seq) +); + +TRACE_EVENT(scoutfs_server_finalize_decision, + TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active, + bool ours_visible, bool finalize_ours, unsigned int delay_ms, + u64 finalize_sent_seq), + + TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms, + finalize_sent_seq), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, c_rid) + __field(bool, saw_finalized) + __field(bool, others_active) + __field(bool, ours_visible) + __field(bool, finalize_ours) + __field(unsigned int, delay_ms) + __field(__u64, finalize_sent_seq) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->c_rid = rid; + __entry->saw_finalized = saw_finalized; + __entry->others_active = others_active; + __entry->ours_visible = ours_visible; + __entry->finalize_ours = finalize_ours; + __entry->delay_ms = delay_ms; + __entry->finalize_sent_seq = finalize_sent_seq; + ), + + TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu", + SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active, + __entry->ours_visible, __entry->finalize_ours, __entry->delay_ms, + __entry->finalize_sent_seq) +); + TRACE_EVENT(scoutfs_get_log_merge_status, TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key, u64 nr_requests, u64 nr_complete, u64 seq), diff --git a/kmod/src/server.c b/kmod/src/server.c index 1b6952f9..34d0b54d 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1104,6 +1104,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX); while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { + trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid), + le64_to_cpu(each_lt.nr), + le64_to_cpu(each_lt.flags), + le64_to_cpu(each_lt.get_trans_seq)); + if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) saw_finalized = true; else 
if (le64_to_cpu(each_lt.rid) != rid) @@ -1127,6 +1132,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l finalize_ours = (lt->item_root.height > 2) || (le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW); + trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active, + ours_visible, finalize_ours, delay_ms, + server->finalize_sent_seq); + /* done if we're not finalizing and there's no finalized */ if (!finalize_ours && !saw_finalized) { ret = 0; From 91bbf90f716e32c6611a7a8960210794d7fd4c85 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 24 Jan 2024 14:10:54 -0800 Subject: [PATCH 6/7] Don't pin input btrees when merging The btree_merge code was pinning leaf blocks for all input btrees as it iterated over them. This doesn't work when there are a very large number of input btrees. It can run out of memory trying to hold a reference to a 64KiB leaf block for each input root. This reworks the btree merging code. It reads a window of blocks from all input trees to get a set of merged items. It can take multiple passes to complete the merge but by setting the merge window large enough this overhead is reduced. Merging now consumes a fixed amount of memory rather than using memory proportional to the number of input btrees. Signed-off-by: Zach Brown --- kmod/src/btree.c | 439 +++++++++++++++++++++++---------------- kmod/src/btree.h | 2 +- kmod/src/forest.c | 3 +- kmod/src/scoutfs_trace.h | 42 ++-- 4 files changed, 288 insertions(+), 198 deletions(-) diff --git a/kmod/src/btree.c b/kmod/src/btree.c index 5c522820..7a21afc7 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb, key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL); } -struct merge_pos { +struct merged_range { + struct scoutfs_key start; + struct scoutfs_key end; + struct rb_root root; + int size; +}; + +struct merged_item { struct rb_node node; - struct scoutfs_btree_root *root; - struct scoutfs_block *bl; - struct scoutfs_btree_block *bt; - struct scoutfs_avl_node *avl; - struct scoutfs_key *key; + struct scoutfs_key key; u64 seq; u8 flags; unsigned int val_len; - u8 *val; + u8 val[0]; }; -static struct merge_pos *first_mpos(struct rb_root *root) +static inline struct merged_item *mitem_container(struct rb_node *node) { - struct rb_node *node = rb_first(root); - if (node) - return container_of(node, struct merge_pos, node); + return node ? container_of(node, struct merged_item, node) : NULL; +} + +static inline struct merged_item *first_mitem(struct rb_root *root) +{ + return mitem_container(rb_first(root)); +} + +static inline struct merged_item *last_mitem(struct rb_root *root) +{ + return mitem_container(rb_last(root)); +} + +static inline struct merged_item *next_mitem(struct merged_item *mitem) +{ + return mitem_container(mitem ? rb_next(&mitem->node) : NULL); +} + +static inline struct merged_item *prev_mitem(struct merged_item *mitem) +{ + return mitem_container(mitem ? 
rb_prev(&mitem->node) : NULL); +} + +static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key, + struct rb_node **parent_ret, struct rb_node ***link_ret) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct merged_item *mitem; + int cmp; + + while (*node) { + parent = *node; + mitem = container_of(*node, struct merged_item, node); + + cmp = scoutfs_key_compare(key, &mitem->key); + + if (cmp < 0) { + node = &(*node)->rb_left; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } else { + *parent_ret = NULL; + *link_ret = NULL; + return mitem; + } + } + + *parent_ret = parent; + *link_ret = node; return NULL; } -static struct merge_pos *next_mpos(struct merge_pos *mpos) +static void insert_mitem(struct merged_range *rng, struct merged_item *mitem, + struct rb_node *parent, struct rb_node **link) { - struct rb_node *node; - - if (mpos && (node = rb_next(&mpos->node))) - return container_of(node, struct merge_pos, node); - else - return NULL; + rb_link_node(&mitem->node, parent, link); + rb_insert_color(&mitem->node, &rng->root); + rng->size += item_len_bytes(mitem->val_len); } -static void free_mpos(struct super_block *sb, struct merge_pos *mpos) +static void replace_mitem(struct merged_range *rng, struct merged_item *victim, + struct merged_item *new) { - scoutfs_block_put(sb, mpos->bl); - kfree(mpos); + rb_replace_node(&victim->node, &new->node, &rng->root); + RB_CLEAR_NODE(&victim->node); + rng->size -= item_len_bytes(victim->val_len); + rng->size += item_len_bytes(new->val_len); } -static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins) +static void free_mitem(struct merged_range *rng, struct merged_item *mitem) { - struct rb_node **node = &pos_root->rb_node; - struct rb_node *parent = NULL; - struct merge_pos *mpos; - int cmp; + if (IS_ERR_OR_NULL(mitem)) + return; - parent = NULL; - while (*node) { - parent = *node; - mpos = container_of(*node, struct merge_pos, node); - - /* sort merge items by key then newest to oldest */ - cmp = scoutfs_key_compare(ins->key, mpos->key) ?: - -scoutfs_cmp(ins->seq, mpos->seq); - - if (cmp < 0) - node = &(*node)->rb_left; - else - node = &(*node)->rb_right; + if (!RB_EMPTY_NODE(&mitem->node)) { + rng->size -= item_len_bytes(mitem->val_len); + rb_erase(&mitem->node, &rng->root); } - rb_link_node(&ins->node, parent, node); - rb_insert_color(&ins->node, pos_root); + kfree(mitem); +} + +static void trim_range_size(struct merged_range *rng, int merge_window) +{ + struct merged_item *mitem; + struct merged_item *tmp; + + mitem = last_mitem(&rng->root); + while (mitem && rng->size > merge_window) { + + rng->end = mitem->key; + scoutfs_key_dec(&rng->end); + + tmp = mitem; + mitem = prev_mitem(mitem); + free_mitem(rng, tmp); + } +} + +static void trim_range_end(struct merged_range *rng) +{ + struct merged_item *mitem; + struct merged_item *tmp; + + mitem = last_mitem(&rng->root); + while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) { + tmp = mitem; + mitem = prev_mitem(mitem); + free_mitem(rng, tmp); + } } /* - * Find the next item in the merge_pos root in the caller's range and - * insert it into the rbtree sorted by key and version so that merging - * can find the next newest item at the front of the rbtree. We free - * the mpos on error or if there are no more items in the range. + * Record and combine logged items from log roots for merging with the + * writable destination root. 
The caller is responsible for trimming + * the range if it gets too large or if the key range shrinks. */ -static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos, - struct scoutfs_key *start, struct scoutfs_key *end) +static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags, + void *val, int val_len, void *arg) { - struct scoutfs_btree_item *item; - struct scoutfs_avl_node *next; - struct btree_walk_key_range kr; - struct scoutfs_key walk_key; - int ret = 0; + struct merged_range *rng = arg; + struct merged_item *mitem; + struct merged_item *found; + struct rb_node *parent; + struct rb_node **link; + int ret; - /* always erase before freeing or inserting */ - if (!RB_EMPTY_NODE(&mpos->node)) { - rb_erase(&mpos->node, pos_root); - RB_CLEAR_NODE(&mpos->node); - } - - /* - * advance to next item via the avl tree. The caller's pos is - * only ever incremented past the last key so we can use next to - * iterate rather than using search to skip past multiple items. - */ - if (mpos->avl) - mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl); - - /* find the next leaf with the key if we run out of items */ - walk_key = *start; - while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) { - scoutfs_block_put(sb, mpos->bl); - mpos->bl = NULL; - ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key, - 0, &mpos->bl, &kr, NULL); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - free_mpos(sb, mpos); + found = find_mitem(&rng->root, key, &parent, &link); + if (found) { + ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len); + if (ret < 0) + goto out; + if (ret > 0) { + if (ret == SCOUTFS_DELTA_COMBINED) { + scoutfs_inc_counter(sb, btree_merge_delta_combined); + } else if (ret == SCOUTFS_DELTA_COMBINED_NULL) { + scoutfs_inc_counter(sb, btree_merge_delta_null); + free_mitem(rng, found); + } + ret = 0; goto out; } - mpos->bt = mpos->bl->data; - mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item, - start, NULL, NULL, &next, NULL) ?: next; - if (mpos->avl == NULL) - walk_key = kr.iter_next; + if (found->seq >= seq) { + ret = 0; + goto out; + } } - /* see if we're out of items within the range */ - item = node_item(mpos->avl); - if (!item || scoutfs_key_compare(item_key(item), end) > 0) { - free_mpos(sb, mpos); - ret = 0; + mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS); + if (!mitem) { + ret = -ENOMEM; goto out; } - /* insert the next item within range at its version */ - mpos->key = item_key(item); - mpos->seq = le64_to_cpu(item->seq); - mpos->flags = item->flags; - mpos->val_len = item_val_len(item); - mpos->val = item_val(mpos->bt, item); + mitem->key = *key; + mitem->seq = seq; + mitem->flags = flags; + mitem->val_len = val_len; + if (val_len) + memcpy(mitem->val, val, val_len); + + if (found) { + replace_mitem(rng, found, mitem); + free_mitem(rng, found); + } else { + insert_mitem(rng, mitem, parent, link); + } - insert_mpos(pos_root, mpos); ret = 0; out: return ret; } /* - * The caller has reset all the merge positions for all the input log - * btree roots and wants the next logged item it should try and merge - * with the items in the fs_root. + * Read a range of merged items. The caller has set the key bounds of + * the range. We read a merge window's worth of items from blocks in + * each input btree. * - * We look ahead in the logged item stream to see if we should merge any - * older logged delta items into one result for the caller. 
We also - * take this opportunity to skip and reset the mpos for any older - * versions of the first item. + * The caller can only use the smallest range that overlaps with all the + * blocks that we read. We start reading from the range's start key so + * it will always be present and we don't need to adjust it. The final + * block we read from each input might not cover the range's end so it + * needs to be adjusted. + * + * The end range can also shrink if we have to drop items because the + * items exceeded the merge window size. */ -static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root, - struct scoutfs_key *end, struct merge_pos **mpos_ret) +static int read_merged_range(struct super_block *sb, struct merged_range *rng, + struct list_head *inputs, int merge_window) { - struct merge_pos *mpos; - struct merge_pos *next; + struct scoutfs_btree_root_head *rhead; + struct scoutfs_key start; + struct scoutfs_key end; struct scoutfs_key key; int ret = 0; + int i; - while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) && - !scoutfs_key_compare(mpos->key, next->key)) { + list_for_each_entry(rhead, inputs, head) { + key = rng->start; - ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len, - next->val, next->val_len); - if (ret < 0) - break; - - /* reset advances to the next item */ - key = *mpos->key; - scoutfs_key_inc(&key); - - /* always skip next combined or older version */ - ret = reset_mpos(sb, pos_root, next, &key, end); - if (ret < 0) - break; - - if (ret == SCOUTFS_DELTA_COMBINED) { - scoutfs_inc_counter(sb, btree_merge_delta_combined); - } else if (ret == SCOUTFS_DELTA_COMBINED_NULL) { - scoutfs_inc_counter(sb, btree_merge_delta_null); - /* if merging resulted in no info, skip current */ - ret = reset_mpos(sb, pos_root, mpos, &key, end); + for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) { + start = key; + end = rng->end; + ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end, + merge_read_item, rng); if (ret < 0) + goto out; + + if (scoutfs_key_compare(&end, &rng->end) >= 0) break; + + key = end; + scoutfs_key_inc(&key); } + + if (scoutfs_key_compare(&end, &rng->end) < 0) { + rng->end = end; + trim_range_end(rng); + } + + if (rng->size > merge_window) + trim_range_size(rng, merge_window); } - *mpos_ret = mpos; + trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size); + ret = 0; +out: return ret; } @@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root, * to allocators running low or needing to join/split the parent. * *next_ret is set to the next key which hasn't been merged so that the * caller can retry with a new allocator and subtree. + * + * The number of input roots can be immense. The merge_window specifies + * the size of the set of merged items that we'll maintain as we iterate + * over all the input roots. Once we've merged items into the window + * from all the input roots the merged input items are then merged to + * the writable destination root. It may take multiple passes of + * windows of merged items to cover the input key range. 
*/ int scoutfs_btree_merge(struct super_block *sb, struct scoutfs_alloc *alloc, @@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb, struct scoutfs_key *next_ret, struct scoutfs_btree_root *root, struct list_head *inputs, - bool subtree, int dirty_limit, int alloc_low) + bool subtree, int dirty_limit, int alloc_low, int merge_window) { - struct scoutfs_btree_root_head *rhead; - struct rb_root pos_root = RB_ROOT; struct scoutfs_btree_item *item; struct scoutfs_btree_block *bt; struct scoutfs_block *bl = NULL; struct btree_walk_key_range kr; struct scoutfs_avl_node *par; - struct scoutfs_key next; - struct merge_pos *mpos; - struct merge_pos *tmp; + struct merged_item *mitem; + struct merged_item *tmp; + struct merged_range rng; int walk_val_len; int walk_flags; bool is_del; @@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb, trace_scoutfs_btree_merge(sb, root, start, end); scoutfs_inc_counter(sb, btree_merge); - list_for_each_entry(rhead, inputs, head) { - mpos = kzalloc(sizeof(*mpos), GFP_NOFS); - if (!mpos) { - ret = -ENOMEM; - goto out; - } - - RB_CLEAR_NODE(&mpos->node); - mpos->root = &rhead->root; - - ret = reset_mpos(sb, &pos_root, mpos, start, end); - if (ret < 0) - goto out; - } - walk_flags = BTW_DIRTY; if (subtree) walk_flags |= BTW_SUBTREE; walk_val_len = 0; - while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) { + rng.start = *start; + rng.end = *end; + rng.root = RB_ROOT; + rng.size = 0; + + ret = read_merged_range(sb, &rng, inputs, merge_window); + if (ret < 0) + goto out; + + for (;;) { + /* read next window as it empties (and it is possible to read an empty range) */ + mitem = first_mitem(&rng.root); + if (!mitem) { + /* done if the read range hit the end */ + if (scoutfs_key_compare(&rng.end, end) >= 0) + break; + + /* read next batch of merged items */ + rng.start = rng.end; + scoutfs_key_inc(&rng.start); + rng.end = *end; + ret = read_merged_range(sb, &rng, inputs, merge_window); + if (ret < 0) + break; + continue; + } if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) { scoutfs_inc_counter(sb, btree_merge_dirty_limit); ret = -ERANGE; - *next_ret = *mpos->key; + *next_ret = mitem->key; goto out; } if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) { scoutfs_inc_counter(sb, btree_merge_alloc_low); ret = -ERANGE; - *next_ret = *mpos->key; + *next_ret = mitem->key; goto out; } scoutfs_block_put(sb, bl); bl = NULL; ret = btree_walk(sb, alloc, wri, root, walk_flags, - mpos->key, walk_val_len, &bl, &kr, NULL); + &mitem->key, walk_val_len, &bl, &kr, NULL); if (ret < 0) { if (ret == -ERANGE) - *next_ret = *mpos->key; + *next_ret = mitem->key; goto out; } bt = bl->data; @@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb, continue; } - while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) { - + while (mitem) { /* walk to new leaf if we exceed parent ref key */ - if (scoutfs_key_compare(mpos->key, &kr.end) > 0) + if (scoutfs_key_compare(&mitem->key, &kr.end) > 0) break; /* see if there's an existing item */ - item = leaf_item_hash_search(sb, bt, mpos->key); - is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION); + item = leaf_item_hash_search(sb, bt, &mitem->key); + is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION); /* see if we're merging delta items */ if (item && !is_del) - delta = scoutfs_forest_combine_deltas(mpos->key, + delta = scoutfs_forest_combine_deltas(&mitem->key, item_val(bt, item), item_val_len(item), - mpos->val, mpos->val_len); + mitem->val, 
mitem->val_len); else delta = 0; if (delta < 0) { @@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb, scoutfs_inc_counter(sb, btree_merge_delta_null); } - trace_scoutfs_btree_merge_items(sb, mpos->root, - mpos->key, mpos->val_len, + trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len, item ? root : NULL, item ? item_key(item) : NULL, item ? item_val_len(item) : 0, is_del); /* rewalk and split if ins/update needs room */ - if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) { + if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) { walk_flags |= BTW_INSERT; - walk_val_len = mpos->val_len; + walk_val_len = mitem->val_len; break; } /* insert missing non-deletion merge items */ if (!item && !is_del) { - scoutfs_avl_search(&bt->item_root, - cmp_key_item, mpos->key, + scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key, &cmp, &par, NULL, NULL); - create_item(bt, mpos->key, mpos->seq, mpos->flags, - mpos->val, mpos->val_len, par, cmp); + create_item(bt, &mitem->key, mitem->seq, mitem->flags, + mitem->val, mitem->val_len, par, cmp); scoutfs_inc_counter(sb, btree_merge_insert); } /* update existing items */ if (item && !is_del && !delta) { - item->seq = cpu_to_le64(mpos->seq); - item->flags = mpos->flags; - update_item_value(bt, item, mpos->val, mpos->val_len); + item->seq = cpu_to_le64(mitem->seq); + item->flags = mitem->flags; + update_item_value(bt, item, mitem->val, mitem->val_len); scoutfs_inc_counter(sb, btree_merge_update); } /* update combined delta item seq */ if (delta == SCOUTFS_DELTA_COMBINED) { - item->seq = cpu_to_le64(mpos->seq); + item->seq = cpu_to_le64(mitem->seq); } /* @@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb, walk_flags &= ~(BTW_INSERT | BTW_DELETE); walk_val_len = 0; - /* finished with this key, skip any older items */ - next = *mpos->key; - scoutfs_key_inc(&next); - ret = reset_mpos(sb, &pos_root, mpos, &next, end); - if (ret < 0) - goto out; + /* finished with this merged item */ + tmp = mitem; + mitem = next_mitem(mitem); + free_mitem(&rng, tmp); } } ret = 0; out: scoutfs_block_put(sb, bl); - rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) { - free_mpos(sb, mpos); - } + rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node) + free_mitem(&rng, mitem); return ret; } diff --git a/kmod/src/btree.h b/kmod/src/btree.h index 0ff5599c..8656dd65 100644 --- a/kmod/src/btree.h +++ b/kmod/src/btree.h @@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb, struct scoutfs_key *next_ret, struct scoutfs_btree_root *root, struct list_head *input_list, - bool subtree, int dirty_limit, int alloc_low); + bool subtree, int dirty_limit, int alloc_low, int merge_window); int scoutfs_btree_free_blocks(struct super_block *sb, struct scoutfs_alloc *alloc, diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 062d3713..f08724d9 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end, &next, &comp.root, &inputs, !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)), - SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10); + SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10, + (2 * 1024 * 1024)); if (ret == -ERANGE) { comp.remain = next; le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index a5f41b78..1edbee9f 100644 --- a/kmod/src/scoutfs_trace.h +++ 
b/kmod/src/scoutfs_trace.h @@ -1746,21 +1746,41 @@ TRACE_EVENT(scoutfs_btree_merge, sk_trace_args(end)) ); +TRACE_EVENT(scoutfs_btree_merge_read_range, + TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end, + int size), + + TP_ARGS(sb, start, end, size), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + sk_trace_define(start) + sk_trace_define(end) + __field(int, size) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + sk_trace_assign(start, start); + sk_trace_assign(end, end); + __entry->size = size; + ), + + TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d", + SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size) +); + TRACE_EVENT(scoutfs_btree_merge_items, TP_PROTO(struct super_block *sb, - struct scoutfs_btree_root *m_root, struct scoutfs_key *m_key, int m_val_len, struct scoutfs_btree_root *f_root, struct scoutfs_key *f_key, int f_val_len, int is_del), - TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del), + TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(__u64, m_root_blkno) - __field(__u64, m_root_seq) - __field(__u8, m_root_height) sk_trace_define(m_key) __field(int, m_val_len) __field(__u64, f_root_blkno) @@ -1773,10 +1793,6 @@ TRACE_EVENT(scoutfs_btree_merge_items, TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->m_root_blkno = m_root ? - le64_to_cpu(m_root->ref.blkno) : 0; - __entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0; - __entry->m_root_height = m_root ? m_root->height : 0; sk_trace_assign(m_key, m_key); __entry->m_val_len = m_val_len; __entry->f_root_blkno = f_root ? @@ -1788,11 +1804,9 @@ TRACE_EVENT(scoutfs_btree_merge_items, __entry->is_del = !!is_del; ), - TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d", - SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq, - __entry->m_root_height, sk_trace_args(m_key), - __entry->m_val_len, __entry->f_root_blkno, - __entry->f_root_seq, __entry->f_root_height, + TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d", + SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len, + __entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height, sk_trace_args(f_key), __entry->f_val_len, __entry->is_del) ); From e9ad61b444e732fe1b93edc762d59f1edeb024fe Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 22 Jan 2024 13:18:39 -0800 Subject: [PATCH 7/7] Delete multiple log trees items per server commit server_log_merge_free_work() is responsible for freeing all the input log trees for a log merge operation that has finished. It looks for the next item to free, frees the log btree it references, and then deletes the item. It was doing this with a full server commit for each item which can take an agonizingly long time. This changes it perform multiple deletions in a commit as long as there's plenty of alloc space. The moment the commit gets low it applies the commit and opens a new one. This sped up the deletion of a few hundred thousand log tree items from taking hours to seconds. 
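A rough standalone sketch of the batching pattern this moves to (simplified; the toy_* helpers and the budget constant are assumptions standing in for the server commit and allocator interfaces):

#include <stdbool.h>

#define TOY_COMMIT_BUDGET 4096	/* assumed stand-in for the commit's alloc budget */

/* assumed stand-ins for the server commit and allocator interfaces */
bool toy_have_items_to_free(void);
void toy_hold_commit(void);
int toy_free_one_item(void);
unsigned int toy_alloc_used_since_hold(void);
int toy_apply_commit(void);

/*
 * Delete items under one held commit until roughly half of the
 * commit's allocator budget has been consumed, then apply the commit
 * and start a new hold, instead of paying a full commit per item.
 */
int toy_free_items_batched(void)
{
	bool held = false;
	int ret = 0;

	while (toy_have_items_to_free()) {
		if (!held) {
			toy_hold_commit();
			held = true;
		}

		ret = toy_free_one_item();
		if (ret < 0)
			break;

		if (toy_alloc_used_since_hold() >= TOY_COMMIT_BUDGET / 2) {
			ret = toy_apply_commit();
			held = false;
			if (ret < 0)
				break;
		}
	}

	/* apply whatever is still held once the loop finishes cleanly */
	if (held && ret == 0)
		ret = toy_apply_commit();

	return ret;
}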
Signed-off-by: Zach Brown --- kmod/src/server.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 34d0b54d..b49b6cd8 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -415,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold) wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold)); } +/* + * Return the higher of the avail or freed used by the active commit + * since this holder joined the commit. This is *not* the amount used + * by the holder, we don't track per-holder alloc use. + */ +static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold) +{ + DECLARE_SERVER_INFO(sb, server); + u32 avail_used; + u32 freed_used; + u32 avail_now; + u32 freed_now; + + scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now); + + avail_used = hold->avail - avail_now; + freed_used = hold->freed - freed_now; + + return max(avail_used, freed_used); +} + /* * This is called while holding the commit and returns once the commit * is successfully written. Many holders can all wait for all holders @@ -2474,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work) while (!server_is_stopping(server)) { - server_hold_commit(sb, &hold); - mutex_lock(&server->logs_mutex); - commit = true; + if (!commit) { + server_hold_commit(sb, &hold); + mutex_lock(&server->logs_mutex); + commit = true; + } ret = next_log_merge_item(sb, &super->log_merge, SCOUTFS_LOG_MERGE_FREEING_ZONE, @@ -2523,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work) /* freed blocks are in allocator, we *have* to update fr */ BUG_ON(ret < 0); - mutex_unlock(&server->logs_mutex); - ret = server_apply_commit(sb, &hold, ret); - commit = false; - if (ret < 0) { - err_str = "looping commit del/upd freeing item"; - break; + if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) { + mutex_unlock(&server->logs_mutex); + ret = server_apply_commit(sb, &hold, ret); + commit = false; + if (ret < 0) { + err_str = "looping commit del/upd freeing item"; + break; + } } }
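The windowed merge introduced in patch 6 ("Don't pin input btrees when merging") can be sketched roughly as below. Everything here is an illustrative stand-in (the toy_* types and helpers are assumptions); the real code tracks merged items in an rbtree, combines delta items, and walks the destination btree leaf by leaf.

typedef unsigned long long toy_key_t;

struct toy_item;	/* a merged item: key, seq, flags, value */
struct toy_input;	/* one read-only input btree */
struct toy_dest;	/* the writable destination btree */

struct toy_window {
	struct toy_item *items;		/* merged items sorted by key, newest seq wins */
	unsigned long bytes;		/* memory currently held by the window */
	toy_key_t start, end;		/* key range the window covers */
};

/* assumed helpers: fill a window from all inputs, apply it to the destination */
int toy_read_window(struct toy_input *inputs, int nr, struct toy_window *win,
		    unsigned long limit);
int toy_apply_window(struct toy_dest *dest, struct toy_window *win);

/*
 * Merge any number of inputs into dest over [start, end] while only
 * holding one window's worth of merged items in memory.  When a window
 * fills, its end key shrinks and the next pass resumes just past it.
 */
int toy_merge(struct toy_input *inputs, int nr, struct toy_dest *dest,
	      toy_key_t start, toy_key_t end, unsigned long window_bytes)
{
	struct toy_window win = { .start = start, .end = end };
	int ret;

	for (;;) {
		ret = toy_read_window(inputs, nr, &win, window_bytes);
		if (ret < 0)
			return ret;

		ret = toy_apply_window(dest, &win);
		if (ret < 0)
			return ret;

		if (win.end >= end)
			return 0;

		win.start = win.end + 1;
		win.end = end;
	}
}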