diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 8f45124e..34297cbf 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -642,7 +642,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) scoutfs_alloc_init(&alloc, &req.meta_avail, &req.meta_freed); scoutfs_block_writer_init(sb, &wri); - /* find finalized input log trees up to last_seq */ + /* find finalized input log trees within the input seq */ for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) { if (!rhead) { @@ -658,10 +658,9 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work) if (iref.val_len == sizeof(*lt)) { key = *iref.key; lt = iref.val; - if ((le64_to_cpu(lt->flags) & - SCOUTFS_LOG_TREES_FINALIZED) && - (le64_to_cpu(lt->max_item_seq) <= - le64_to_cpu(req.last_seq))) { + if (lt->item_root.ref.blkno != 0 && + (le64_to_cpu(lt->flags) & SCOUTFS_LOG_TREES_FINALIZED) && + (le64_to_cpu(lt->finalize_seq) < le64_to_cpu(req.input_seq))) { rhead->root = lt->item_root; list_add_tail(&rhead->head, &inputs); rhead = NULL; diff --git a/kmod/src/format.h b/kmod/src/format.h index 160ebd17..39a303b7 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -457,6 +457,7 @@ struct scoutfs_log_trees { __le64 data_alloc_zone_blocks; __le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S]; __le64 max_item_seq; + __le64 finalize_seq; __le64 rid; __le64 nr; __le64 flags; @@ -508,7 +509,6 @@ struct scoutfs_log_merge_status { struct scoutfs_key next_range_key; __le64 nr_requests; __le64 nr_complete; - __le64 last_seq; __le64 seq; }; @@ -525,7 +525,7 @@ struct scoutfs_log_merge_request { struct scoutfs_btree_root root; struct scoutfs_key start; struct scoutfs_key end; - __le64 last_seq; + __le64 input_seq; __le64 rid; __le64 seq; __le64 flags; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index b92471fd..e1e53acd 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -406,21 +406,24 @@ TRACE_EVENT(scoutfs_sync_fs, ); TRACE_EVENT(scoutfs_trans_write_func, - TP_PROTO(struct super_block *sb, unsigned long dirty), + TP_PROTO(struct super_block *sb, u64 dirty_block_bytes, u64 dirty_item_pages), - TP_ARGS(sb, dirty), + TP_ARGS(sb, dirty_block_bytes, dirty_item_pages), TP_STRUCT__entry( SCSB_TRACE_FIELDS - __field(unsigned long, dirty) + __field(__u64, dirty_block_bytes) + __field(__u64, dirty_item_pages) ), TP_fast_assign( SCSB_TRACE_ASSIGN(sb); - __entry->dirty = dirty; + __entry->dirty_block_bytes = dirty_block_bytes; + __entry->dirty_item_pages = dirty_item_pages; ), - TP_printk(SCSBF" dirty %lu", SCSB_TRACE_ARGS, __entry->dirty) + TP_printk(SCSBF" dirty_block_bytes %llu dirty_item_pages %llu", + SCSB_TRACE_ARGS, __entry->dirty_block_bytes, __entry->dirty_item_pages) ); DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class, @@ -2045,9 +2048,9 @@ TRACE_EVENT(scoutfs_trans_seq_last, TRACE_EVENT(scoutfs_get_log_merge_status, TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key, - u64 nr_requests, u64 nr_complete, u64 last_seq, u64 seq), + u64 nr_requests, u64 nr_complete, u64 seq), - TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, last_seq, seq), + TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, seq), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -2055,7 +2058,6 @@ TRACE_EVENT(scoutfs_get_log_merge_status, sk_trace_define(next_range_key) __field(__u64, nr_requests) __field(__u64, nr_complete) - __field(__u64, last_seq) __field(__u64, seq) ), @@ -2065,21 +2067,20 @@ TRACE_EVENT(scoutfs_get_log_merge_status, 
sk_trace_assign(next_range_key, next_range_key); __entry->nr_requests = nr_requests; __entry->nr_complete = nr_complete; - __entry->last_seq = last_seq; __entry->seq = seq; ), - TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu last_seq %llu seq %llu", + TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu seq %llu", SCSB_TRACE_ARGS, __entry->s_rid, sk_trace_args(next_range_key), - __entry->nr_requests, __entry->nr_complete, __entry->last_seq, __entry->seq) + __entry->nr_requests, __entry->nr_complete, __entry->seq) ); TRACE_EVENT(scoutfs_get_log_merge_request, TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_btree_root *root, struct scoutfs_key *start, - struct scoutfs_key *end, u64 last_seq, u64 seq), + struct scoutfs_key *end, u64 input_seq, u64 seq), - TP_ARGS(sb, rid, root, start, end, last_seq, seq), + TP_ARGS(sb, rid, root, start, end, input_seq, seq), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -2089,7 +2090,7 @@ TRACE_EVENT(scoutfs_get_log_merge_request, __field(__u8, root_height) sk_trace_define(start) sk_trace_define(end) - __field(__u64, last_seq) + __field(__u64, input_seq) __field(__u64, seq) ), @@ -2101,14 +2102,14 @@ TRACE_EVENT(scoutfs_get_log_merge_request, __entry->root_height = root->height; sk_trace_assign(start, start); sk_trace_assign(end, end); - __entry->last_seq = last_seq; + __entry->input_seq = input_seq; __entry->seq = seq; ), - TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" last_seq %llu seq %llu", + TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" input_seq %llu seq %llu", SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno, __entry->root_seq, __entry->root_height, - sk_trace_args(start), sk_trace_args(end), __entry->last_seq, + sk_trace_args(start), sk_trace_args(end), __entry->input_seq, __entry->seq) ); diff --git a/kmod/src/server.c b/kmod/src/server.c index e506d5ac..7f8ae4a4 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -695,6 +695,321 @@ static int find_log_trees_item(struct super_block *sb, return ret; } +/* + * Find the next log_trees item from the key. Fills the caller's log_trees and sets + * the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each + * item, and -errno on fatal errors. + */ +static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root, + struct scoutfs_key *key, struct scoutfs_log_trees *lt) +{ + SCOUTFS_BTREE_ITEM_REF(iref); + int ret; + + ret = scoutfs_btree_next(sb, root, key, &iref); + if (ret == 0) { + if (iref.val_len == sizeof(struct scoutfs_log_trees)) { + memcpy(lt, iref.val, iref.val_len); + *key = *iref.key; + scoutfs_key_inc(key); + ret = 1; + } else { + ret = -EIO; + } + scoutfs_btree_put_iref(&iref); + } else if (ret == -ENOENT) { + ret = 0; + } + + return ret; +} + +/* + * Log merge range items are stored at the starting fs key of the range. + * The only fs key field that doesn't hold information is the zone, so + * we use the zone to differentiate all types that we store in the log + * merge tree. 
+ */ +static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first, + u64 second) +{ + *key = (struct scoutfs_key) { + .sk_zone = zone, + ._sk_first = cpu_to_le64(first), + ._sk_second = cpu_to_le64(second), + }; +} + +static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_root *root, + u8 zone, struct scoutfs_key *key, void *val, size_t val_len) +{ + SCOUTFS_BTREE_ITEM_REF(iref); + int ret; + + ret = scoutfs_btree_next(sb, root, key, &iref); + if (ret == 0) { + if (iref.key->sk_zone != zone) + ret = -ENOENT; + else if (iref.val_len != val_len) + ret = -EIO; + else + memcpy(val, iref.val, val_len); + scoutfs_btree_put_iref(&iref); + } + + return ret; +} + +static int next_log_merge_item(struct super_block *sb, + struct scoutfs_btree_root *root, + u8 zone, u64 first, u64 second, + void *val, size_t val_len) +{ + struct scoutfs_key key; + + init_log_merge_key(&key, zone, first, second); + return next_log_merge_item_key(sb, root, zone, &key, val, val_len); +} + +/* + * Finalizing the log btrees for merging needs to be done carefully so + * that items don't appear to go backwards in time. + * + * This can happen if an older version of an item happens to be present + * in a log btree that is seeing activity without growing. It will + * never be merged, while another growing tree with a newer version + * gets finalized and merged. The older version in the active log btree + * will take precedence over the new item in the fs root. + * + * To avoid this without examining the overlap of all item key + * ranges in all log btrees, we need to create a strict discontinuity in + * item versions between all the finalized log btrees and all the active + * log btrees. Since active log btrees can get new item versions from + * new locks, we can't naively finalize individual log btrees as they + * grow. It's almost guaranteed that some existing tree will have + * older items than the finalizing tree, and will get new locks with + * greater seqs. Existing log btrees always naturally have seq ranges + * that overlap with individually finalized log btrees. + * + * So we have the server perform a hard coordinated finalization of all + * client log btrees once any of them is naturally finalized -- either + * by growing or being cleaned up (via unmount or fencing). Each + * client's get_log_trees waits for everyone else to arrive and finalize + * before any of them returns its next log btree. This ensures that + * the trans seq and all lock seqs of all the new log btrees will be + * greater than the seqs of all the items in all the previous, finalized + * log btrees. + * + * This creates a bubble in the pipeline. We don't wait forever for an + * active log btree to be finalized because we could be waiting for a + * series of timeouts before a missing client is fenced and has its + * abandoned log btree finalized. If it takes too long, each client has + * a chance to make forward progress before being asked to commit again. + * + * We're waiting on heavy state that is protected by mutexes and + * transaction machinery. It's tricky to recreate that state for + * lightweight condition tests that don't change task state. Instead of + * trying to get that right, particularly as we unwind after success or + * after timeouts, waiters use an unsatisfying poll: short enough to + * not add terrible latency, given how heavy and infrequent this already + * is, and long enough to not melt the cpu. This could be tuned if it + * becomes a problem.
+ * + * This can end up finalizing a new empty log btree if a new mount + * happens to arrive at just the right time. That's fine, merging will + * ignore and tear down the empty input. + */ +#define FINALIZE_POLL_MS (11) +#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2) +static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt, + u64 rid) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_log_merge_status stat; + struct scoutfs_log_merge_range rng; + struct scoutfs_log_trees each_lt; + struct scoutfs_log_trees fin; + unsigned long timeo; + bool saw_finalized; + bool others_active; + bool finalize_ours; + bool ours_visible; + struct scoutfs_key key; + char *err_str = NULL; + int ret; + int err; + + timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS); + + for (;;) { + /* nothing to do if there's already a merge in flight */ + ret = next_log_merge_item(sb, &super->log_merge, + SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0, + &stat, sizeof(stat)); + if (ret != -ENOENT) { + if (ret < 0) + err_str = "checking merge status item to finalize"; + break; + } + + /* look for finalized and other active log btrees */ + saw_finalized = false; + others_active = false; + ours_visible = false; + scoutfs_key_init_log_trees(&key, 0, 0); + while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { + + if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) + saw_finalized = true; + else if (le64_to_cpu(each_lt.rid) != rid) + others_active = true; + else if (each_lt.nr == lt->nr) + ours_visible = true; + } + if (ret < 0) { + err_str = "searching finalized flags in log_trees items"; + break; + } + + /* + * We'll first finalize our log btree when it has enough + * leaf blocks to allow some degree of merging + * concurrency. Smaller btrees are also finalized when + * meta was low so that deleted items are merged + * promptly and freed blocks can bring the client out of + * enospc. 
+ */ + finalize_ours = (lt->item_root.height > 2) || + (le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW); + + /* done if we're not finalizing and nothing is finalized */ + if (!finalize_ours && !saw_finalized) { + ret = 0; + break; + } + + /* send sync requests now to give mounts time to commit */ + scoutfs_key_init_log_trees(&key, 0, 0); + while (others_active && + (ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) { + + if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) || + (le64_to_cpu(each_lt.rid) == rid)) + continue; + + ret = scoutfs_net_submit_request_node(sb, server->conn, + le64_to_cpu(each_lt.rid), + SCOUTFS_NET_CMD_SYNC_LOG_TREES, + NULL, 0, NULL, NULL, NULL); + if (ret < 0) { + /* fine if they're not here, they'll reconnect or be fenced */ + if (ret == -ENOTCONN) + ret = 0; + else + break; + } + } + if (ret < 0) { + err_str = "sending sync log tree request"; + break; + } + + /* finalize ours if it's visible to others */ + if (ours_visible) { + fin = *lt; + memset(&fin.meta_avail, 0, sizeof(fin.meta_avail)); + memset(&fin.meta_freed, 0, sizeof(fin.meta_freed)); + memset(&fin.data_avail, 0, sizeof(fin.data_avail)); + memset(&fin.data_freed, 0, sizeof(fin.data_freed)); + memset(&fin.srch_file, 0, sizeof(fin.srch_file)); + le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED); + fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + + scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid), + le64_to_cpu(fin.nr)); + ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, + &super->logs_root, &key, &fin, + sizeof(fin)); + if (ret < 0) { + err_str = "updating finalized log_trees"; + break; + } + + memset(&lt->item_root, 0, sizeof(lt->item_root)); + memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref)); + lt->max_item_seq = 0; + lt->finalize_seq = 0; + le64_add_cpu(&lt->nr, 1); + lt->flags = 0; + } + + /* wait a bit for mounts to arrive */ + if (others_active) { + mutex_unlock(&server->logs_mutex); + ret = scoutfs_server_apply_commit(sb, 0); + if (ret < 0) + err_str = "applying commit before waiting for finalized"; + + msleep(FINALIZE_POLL_MS); + + scoutfs_server_hold_commit(sb); + mutex_lock(&server->logs_mutex); + + /* bail on commit errors, done if we timed out */ + if (ret < 0 || time_after(jiffies, timeo)) { + /* ret is 0 unless the commit failed */ + break; + } + + /* rescan items now that we've reacquired the lock */ + continue; + } + + /* we can add the merge item under the lock once everyone's finalized */ + + /* add an initial full-range */ + scoutfs_key_set_zeros(&rng.start); + scoutfs_key_set_ones(&rng.end); + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, + &super->log_merge, &key, &rng, sizeof(rng)); + if (ret < 0) { + err_str = "inserting new merge range item"; + break; + } + + /* and add the merge status item, deleting the range if insertion fails */ + scoutfs_key_set_zeros(&stat.next_range_key); + stat.nr_requests = 0; + stat.nr_complete = 0; + stat.seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + + init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0); + ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, + &super->log_merge, &key, + &stat, sizeof(stat)); + if (ret < 0) { + err_str = "inserting new merge status item"; + key = rng.start; + key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; + err = scoutfs_btree_delete(sb, &server->alloc, &server->wri, + &super->log_merge, &key); + BUG_ON(err); /* inconsistent */ + } + + /* we're done, caller can make forward progress */ + break; + } +
+ if (ret < 0) + scoutfs_err(sb, "error %d finalizing log trees for rid %016llx: %s", + ret, rid, err_str); + + return ret; +} + /* * Give the client roots to all the trees that they'll use to build * their transaction. @@ -718,10 +1033,8 @@ static int server_get_log_trees(struct super_block *sb, __le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S]; __le64 vacant[SCOUTFS_DATA_ALLOC_ZONE_LE64S]; struct alloc_extent_cb_args cba; - struct scoutfs_log_trees fin; struct scoutfs_log_trees lt; struct scoutfs_key key; - bool have_fin = false; bool unlock_alloc = false; u64 data_zone_blocks; char *err_str = NULL; @@ -737,18 +1050,8 @@ mutex_lock(&server->logs_mutex); - /* see if we have already have a finalized root from the rid */ - ret = find_log_trees_item(sb, &super->logs_root, true, rid, 0, &lt); - if (ret < 0 && ret != -ENOENT) { - err_str = "finding first log trees"; - goto unlock; - } - if (ret == 0 && le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) - have_fin = true; - /* use the last non-finalized root, or start a new one */ - ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, - &lt); + ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &lt); if (ret < 0 && ret != -ENOENT) { err_str = "finding last log trees"; goto unlock; } @@ -767,39 +1070,10 @@ lt.nr = cpu_to_le64(nr); } - /* - * Finalize the client log btree when it has enough leaf blocks - * to allow some degree of merging concurrency. Smaller btrees - * are also finalized when meta was low so that deleted items - * are merged promptly and freed blocks can bring the client out - * of enospc. - */ - if (!have_fin && ((lt.item_root.height > 2) || - (le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) { - fin = lt; - memset(&fin.meta_avail, 0, sizeof(fin.meta_avail)); - memset(&fin.meta_freed, 0, sizeof(fin.meta_freed)); - memset(&fin.data_avail, 0, sizeof(fin.data_avail)); - memset(&fin.data_freed, 0, sizeof(fin.data_freed)); - memset(&fin.srch_file, 0, sizeof(fin.srch_file)); - le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED); - - scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid), - le64_to_cpu(fin.nr)); - ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, - &super->logs_root, &key, &fin, - sizeof(fin)); - if (ret < 0) { - err_str = "updating finalized log trees"; - goto unlock; - } - - memset(&lt.item_root, 0, sizeof(lt.item_root)); - memset(&lt.bloom_ref, 0, sizeof(lt.bloom_ref)); - lt.max_item_seq = 0; - le64_add_cpu(&lt.nr, 1); - lt.flags = 0; - } + /* drops and re-acquires the mutex and commit if it has to wait */ + ret = finalize_and_start_log_merge(sb, &lt, rid); + if (ret < 0) + goto unlock; if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) { ret = get_data_alloc_zone_bits(sb, rid, exclusive, vacant, data_zone_blocks); @@ -1085,6 +1359,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid) /* the mount is no longer writing to the zones */ zero_data_alloc_zone_bits(&lt); le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED); + lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); err = scoutfs_btree_update(sb, &server->alloc, &server->wri, &super->logs_root, &key, &lt, sizeof(lt)); @@ -1488,145 +1763,6 @@ out: return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); } -/* - * Log merge range items are stored at the starting fs key of the range.
- * The only fs key field that doesn't hold information is the zone, so - * we use the zone to differentiate all types that we store in the log - * merge tree. - */ -static void init_log_merge_key(struct scoutfs_key *key, u8 zone, u64 first, - u64 second) -{ - *key = (struct scoutfs_key) { - .sk_zone = zone, - ._sk_first = cpu_to_le64(first), - ._sk_second = cpu_to_le64(second), - }; -} - -static int next_log_merge_item_key(struct super_block *sb, struct scoutfs_btree_root *root, - u8 zone, struct scoutfs_key *key, void *val, size_t val_len) -{ - SCOUTFS_BTREE_ITEM_REF(iref); - int ret; - - ret = scoutfs_btree_next(sb, root, key, &iref); - if (ret == 0) { - if (iref.key->sk_zone != zone) - ret = -ENOENT; - else if (iref.val_len != val_len) - ret = -EIO; - else - memcpy(val, iref.val, val_len); - scoutfs_btree_put_iref(&iref); - } - - return ret; -} - -static int next_log_merge_item(struct super_block *sb, - struct scoutfs_btree_root *root, - u8 zone, u64 first, u64 second, - void *val, size_t val_len) -{ - struct scoutfs_key key; - - init_log_merge_key(&key, zone, first, second); - return next_log_merge_item_key(sb, root, zone, &key, val, val_len); -} - -/* - * We start a log merge operation if there are any finalized log trees - * whose greatest seq is within the last stable seq. This is called by - * every client's get_log_merge handler at a relatively low frequency - * until a merge starts. - */ -static int start_log_merge(struct super_block *sb, - struct scoutfs_super_block *super, - struct scoutfs_log_merge_status *stat_ret) -{ - struct server_info *server = SCOUTFS_SB(sb)->server_info; - struct scoutfs_log_merge_status stat; - struct scoutfs_log_merge_range rng; - SCOUTFS_BTREE_ITEM_REF(iref); - struct scoutfs_log_trees *lt; - struct scoutfs_key key; - u64 last_seq; - bool start; - int ret; - int err; - - scoutfs_key_init_log_trees(&key, 0, 0); - - ret = get_stable_trans_seq(sb, &last_seq); - if (ret < 0) - goto out; - - scoutfs_key_init_log_trees(&key, 0, 0); - for (start = false; !start; scoutfs_key_inc(&key)) { - ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref); - if (ret == 0) { - if (iref.val_len == sizeof(*lt)) { - key = *iref.key; - lt = iref.val; - if ((le64_to_cpu(lt->flags) & - SCOUTFS_LOG_TREES_FINALIZED) && - (le64_to_cpu(lt->max_item_seq) <= - last_seq)) { - start = true; - } - } else { - ret = -EIO; - } - scoutfs_btree_put_iref(&iref); - } - if (ret < 0) - goto out; - } - - if (!start) { - ret = -ENOENT; - goto out; - } - - /* add an initial full-range */ - scoutfs_key_set_zeros(&rng.start); - scoutfs_key_set_ones(&rng.end); - key = rng.start; - key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; - ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, - &super->log_merge, &key, &rng, sizeof(rng)); - if (ret < 0) - goto out; - - /* and add the merge status item */ - scoutfs_key_set_zeros(&stat.next_range_key); - stat.nr_requests = 0; - stat.nr_complete = 0; - stat.last_seq = cpu_to_le64(last_seq); - stat.seq = cpu_to_le64(scoutfs_server_next_seq(sb)); - - init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0); - ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri, - &super->log_merge, &key, - &stat, sizeof(stat)); - if (ret < 0) { - key = rng.start; - key.sk_zone = SCOUTFS_LOG_MERGE_RANGE_ZONE; - err = scoutfs_btree_delete(sb, &server->alloc, &server->wri, - &super->log_merge, &key); - BUG_ON(err); /* inconsistent */ - } - - /* queue free to see if there's lingering items to process */ - if (ret == 0) - queue_work(server->wq, 
&server->log_merge_free_work); -out: - if (ret == 0) - *stat_ret = stat; - return ret; -} - /* Requests drain once we get this many completions to splice */ #define LOG_MERGE_SPLICE_BATCH 8 @@ -1839,9 +1975,8 @@ static int splice_log_merge_completions(struct super_block *sb, } /* only free the inputs to the log merge that just finished */ - if (!(le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) || - (le64_to_cpu(lt.max_item_seq) > - le64_to_cpu(stat->last_seq))) + if (!((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) && + (le64_to_cpu(lt.finalize_seq) < le64_to_cpu(stat->seq)))) continue; fr.root = lt.item_root; @@ -1914,15 +2049,14 @@ static int next_least_log_item(struct super_block *sb, for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) { - /* find the next finalized log root within the merge last_seq */ + /* find the next finalized log root within the merge */ ret = scoutfs_btree_next(sb, logs_root, &key, &iref); if (ret == 0) { if (iref.val_len == sizeof(*lt)) { key = *iref.key; lt = iref.val; - if ((le64_to_cpu(lt->flags) & - SCOUTFS_LOG_TREES_FINALIZED) && - (le64_to_cpu(lt->max_item_seq) <= seq)) + if ((le64_to_cpu(lt->flags) & SCOUTFS_LOG_TREES_FINALIZED) && + (le64_to_cpu(lt->finalize_seq) < seq)) item_root = lt->item_root; else item_root.ref.blkno = 0; @@ -2095,21 +2229,19 @@ restart: del_req = false; upd_stat = false; - /* get the status item, maybe creating a new one */ + /* get the status item */ ret = next_log_merge_item(sb, &super->log_merge, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0, &stat, sizeof(stat)); - if (ret == -ENOENT) - ret = start_log_merge(sb, super, &stat); if (ret < 0) { - err_str = "finding merge status item"; + if (ret != -ENOENT) + err_str = "finding merge status item"; goto out; } trace_scoutfs_get_log_merge_status(sb, rid, &stat.next_range_key, le64_to_cpu(stat.nr_requests), le64_to_cpu(stat.nr_complete), - le64_to_cpu(stat.last_seq), le64_to_cpu(stat.seq)); /* find the next range, always checking for splicing */ @@ -2149,8 +2281,7 @@ restart: } /* find the next logged item in the next range */ - ret = next_least_log_item(sb, &super->logs_root, - le64_to_cpu(stat.last_seq), + ret = next_least_log_item(sb, &super->logs_root, le64_to_cpu(stat.seq), &rng.start, &rng.end, &next_key); if (ret == 0) { break; @@ -2173,7 +2304,7 @@ restart: /* start to build the request that's saved and sent to the client */ req.logs_root = super->logs_root; - req.last_seq = stat.last_seq; + req.input_seq = stat.seq; req.rid = cpu_to_le64(rid); req.seq = cpu_to_le64(scoutfs_server_next_seq(sb)); req.flags = 0; @@ -2254,7 +2385,7 @@ restart: trace_scoutfs_get_log_merge_request(sb, rid, &req.root, &req.start, &req.end, - le64_to_cpu(req.last_seq), + le64_to_cpu(req.input_seq), le64_to_cpu(req.seq)); /* make sure next range avoids ranges for parent in use */ diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 3cc34212..1c17c631 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -190,25 +190,8 @@ void scoutfs_trans_write_func(struct work_struct *work) goto out; } - trace_scoutfs_trans_write_func(sb, - scoutfs_block_writer_dirty_bytes(sb, &tri->wri)); - - if (!scoutfs_block_writer_has_dirty(sb, &tri->wri) && - !scoutfs_item_dirty_pages(sb)) { - if (sbi->trans_deadline_expired) { - /* - * If we're not writing data then we only advance the - * seq at the sync deadline interval. This keeps idle - * mounts from pinning a seq and stopping readers of the - * seq indices but doesn't send a message for every sync - * syscall. 
- */ - ret = scoutfs_client_advance_seq(sb, &trans_seq); - if (ret < 0) - s = "clean advance seq"; - } - goto err; - } + trace_scoutfs_trans_write_func(sb, scoutfs_block_writer_dirty_bytes(sb, &tri->wri), + scoutfs_item_dirty_pages(sb)); if (sbi->trans_deadline_expired) scoutfs_inc_counter(sb, trans_commit_timer); @@ -219,15 +202,12 @@ ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?: (s = "item dirty", scoutfs_item_write_dirty(sb)) ?: (s = "data prepare", scoutfs_data_prepare_commit(sb)) ?: - (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, - &tri->alloc, &tri->wri)) ?: + (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri)) ?: (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri)) ?: (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?: - (s = "commit log trees", commit_btrees(sb)) ?: - scoutfs_item_write_done(sb) ?: - (s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)) ?: - (s = "get log trees", scoutfs_trans_get_log_trees(sb)); -err: + (s = "commit log trees", commit_btrees(sb)) ?: scoutfs_item_write_done(sb) ?: + (s = "get log trees", scoutfs_trans_get_log_trees(sb)) ?: + (s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)); if (ret < 0) scoutfs_err(sb, "critical transaction commit failure: %s, %d", s, ret); diff --git a/utils/src/print.c b/utils/src/print.c index 920393f3..05e884b3 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -290,6 +290,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val, " data_freed: "ALCROOT_F"\n" " srch_file: "SRF_FMT"\n" " max_item_seq: %llu\n" + " finalize_seq: %llu\n" " rid: %016llx\n" " nr: %llu\n" " flags: %llx\n" @@ -306,6 +307,7 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val, ALCROOT_A(&lt->data_freed), SRF_A(&lt->srch_file), le64_to_cpu(lt->max_item_seq), + le64_to_cpu(lt->finalize_seq), le64_to_cpu(lt->rid), le64_to_cpu(lt->nr), le64_to_cpu(lt->flags), @@ -397,12 +399,10 @@ static int print_log_merge_item(struct scoutfs_key *key, void *val, switch (key->sk_zone) { case SCOUTFS_LOG_MERGE_STATUS_ZONE: stat = val; - printf(" status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu" - " last_seq %llu seq %llu\n", + printf(" status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu seq %llu\n", SK_ARG(&stat->next_range_key), le64_to_cpu(stat->nr_requests), le64_to_cpu(stat->nr_complete), - le64_to_cpu(stat->last_seq), le64_to_cpu(stat->seq)); break; case SCOUTFS_LOG_MERGE_RANGE_ZONE: @@ -414,12 +414,12 @@ case SCOUTFS_LOG_MERGE_REQUEST_ZONE: req = val; printf(" request: logs_root "BTROOT_F" logs_root "BTROOT_F" start "SK_FMT - " end "SK_FMT" last_seq %llu rid %016llx seq %llu flags 0x%llx\n", + " end "SK_FMT" input_seq %llu rid %016llx seq %llu flags 0x%llx\n", BTROOT_A(&req->logs_root), BTROOT_A(&req->root), SK_ARG(&req->start), SK_ARG(&req->end), - le64_to_cpu(req->last_seq), + le64_to_cpu(req->input_seq), le64_to_cpu(req->rid), le64_to_cpu(req->seq), le64_to_cpu(req->flags));
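Note on the ordering invariant: the merge status item's seq now doubles as the request's input_seq, and a log btree qualifies as merge input only once it is finalized and its finalize_seq is strictly less than that seq. Below is a freestanding userspace sketch (not part of the patch) of why this works, assuming only that the server hands out a monotonically increasing seq; the names in it (toy_log_tree, next_seq, is_merge_input) are illustrative stand-ins, not scoutfs symbols.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* monotonic counter standing in for scoutfs_server_next_seq() */
static uint64_t server_seq;

static uint64_t next_seq(void)
{
	return ++server_seq;
}

/* a toy log btree: just the seqs that matter for merge eligibility */
struct toy_log_tree {
	uint64_t max_item_seq;	/* greatest seq of any item written */
	uint64_t finalize_seq;	/* taken once, when finalized */
	bool finalized;
};

static void write_item(struct toy_log_tree *lt)
{
	lt->max_item_seq = next_seq();
}

static void finalize(struct toy_log_tree *lt)
{
	lt->finalized = true;
	lt->finalize_seq = next_seq();	/* > every item seq in the tree */
}

/* the eligibility test the patch applies when choosing merge inputs */
static bool is_merge_input(const struct toy_log_tree *lt, uint64_t input_seq)
{
	return lt->finalized && lt->finalize_seq < input_seq;
}

int main(void)
{
	struct toy_log_tree a = {0}, b = {0}, c = {0};

	write_item(&a);
	write_item(&b);

	/* hard coordinated finalization: every tree before any new one */
	finalize(&a);
	finalize(&b);

	/* the status item's seq becomes the merge's input_seq */
	uint64_t input_seq = next_seq();

	/* a new active tree only writes seqs greater than input_seq */
	write_item(&c);

	assert(is_merge_input(&a, input_seq));
	assert(is_merge_input(&b, input_seq));
	assert(!is_merge_input(&c, input_seq));
	assert(c.max_item_seq > a.finalize_seq &&
	       c.max_item_seq > b.finalize_seq);

	printf("inputs: a %d b %d c %d\n",
	       is_merge_input(&a, input_seq),
	       is_merge_input(&b, input_seq),
	       is_merge_input(&c, input_seq));
	return 0;
}

Because finalize_seq is taken after the finalizing tree's last item seq, and the status item's seq is taken after every finalize_seq, the strict less-than comparisons in the forest.c merge worker, splice_log_merge_completions(), and next_least_log_item() all select exactly the same set of input trees.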