diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index 894b92ed..40c33c20 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len) } /* - * An extent modification dirties three distinct leaves of an allocator - * btree as it adds and removes the blkno and size sorted items for the - * old and new lengths of the extent. Dirtying the paths to these - * leaves can grow the tree and grow/shrink neighbours at each level. - * We over-estimate the number of blocks allocated and freed (the paths - * share a root, growth doesn't free) to err on the simpler and safer - * side. The overhead is minimal given the relatively large list blocks - * and relatively short allocator trees. + * Moving an extent between trees can dirty blocks in several ways. This + * function calculates the worst-case number of blocks across these scenarios. + * We treat the alloc and free counts independently, so the values below are + * max(allocated, freed), not the sum. + * + * We track extents with two separate btree items: by block number and by size. + * + * If we're removing an extent from the btree (allocating), we can dirty + * two blocks if the keys are in different leaves. If we wind up merging + * leaves because we fall below the low water mark, we can wind up freeing + * three leaves. 
+ * + * That sequence is as follows, assuming the original keys are removed from + * blocks A and B: + * + * Allocate new dirty A' and B' + * Free old stable A and B + * B' has fallen below the low water mark, so copy B' into A' + * Free B' + * + * An extent insertion (freeing an extent) can dirty up to five distinct items + * in the btree as it adds and removes the blkno and size sorted items for the + * old and new lengths of the extent: + * + * In the by-blkno portion of the btree, we can dirty (allocate for COW) up + * to two blocks- either by merging adjacent extents, which can cause us to + * join leaf blocks; or by an insertion that causes a split. + * + * In the by-size portion, we never merge extents, so normally we just dirty + * a single item with a size insertion. But if we merged adjacent extents in + * the by-blkno portion of the tree, we might be working with three by-size + * items: removing the two old ones that were combined in the merge; and + * adding the new one for the larger, merged size. + * + * Finally, dirtying the paths to these leaves can grow the tree and grow/shrink + * neighbours at each level, so we multiply by the height of the tree after + * accounting for a possible new level. 
*/ static u32 extent_mod_blocks(u32 height) { - return ((1 + height) * 2) * 3; + return ((1 + height) * 3) * 5; } /* diff --git a/kmod/src/block.c b/kmod/src/block.c index 9eaba500..0fa35dcc 100644 --- a/kmod/src/block.c +++ b/kmod/src/block.c @@ -488,7 +488,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp, int ret = 0; if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9); @@ -712,8 +712,8 @@ retry: ret = 0; out: - if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) && - !retried && !block_is_dirty(bp)) { + if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) && + (ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) { retried = true; scoutfs_inc_counter(sb, block_cache_remove_stale); block_remove(sb, bp); @@ -1210,7 +1210,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE); if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) || WARN_ON_ONCE(!op_is_write(opf) && !blk_crc)) diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 18e6daf1..c2bfa89f 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -90,6 +90,7 @@ EXPAND_COUNTER(forest_read_items) \ EXPAND_COUNTER(forest_roots_next_hint) \ EXPAND_COUNTER(forest_set_bloom_bits) \ + EXPAND_COUNTER(inode_deleted) \ EXPAND_COUNTER(item_cache_count_objects) \ EXPAND_COUNTER(item_cache_scan_objects) \ EXPAND_COUNTER(item_clear_dirty) \ @@ -145,6 +146,7 @@ EXPAND_COUNTER(lock_shrink_work) \ EXPAND_COUNTER(lock_unlock) \ EXPAND_COUNTER(lock_wait) \ + EXPAND_COUNTER(log_merge_no_finalized) \ EXPAND_COUNTER(log_merge_wait_timeout) \ EXPAND_COUNTER(net_dropped_response) \ EXPAND_COUNTER(net_send_bytes) \ @@ -181,6 +183,7 @@ EXPAND_COUNTER(quorum_send_vote) \ EXPAND_COUNTER(quorum_server_shutdown) \ EXPAND_COUNTER(quorum_term_follower) \ + 
EXPAND_COUNTER(reclaimed_open_logs) \ EXPAND_COUNTER(server_commit_hold) \ EXPAND_COUNTER(server_commit_queue) \ EXPAND_COUNTER(server_commit_worker) \ diff --git a/kmod/src/format.h b/kmod/src/format.h index 83e6015c..07bb67aa 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -470,7 +470,7 @@ struct scoutfs_srch_compact { * @get_trans_seq, @commit_trans_seq: These pair of sequence numbers * determine if a transaction is currently open for the mount that owns * the log_trees struct. get_trans_seq is advanced by the server as the - * transaction is opened. The server sets comimt_trans_seq equal to + * transaction is opened. The server sets commit_trans_seq equal to * get_ as the transaction is committed. */ struct scoutfs_log_trees { @@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd { EXPAND_NET_ERRNO(ENOMEM) \ EXPAND_NET_ERRNO(EIO) \ EXPAND_NET_ERRNO(ENOSPC) \ - EXPAND_NET_ERRNO(EINVAL) + EXPAND_NET_ERRNO(EINVAL) \ + EXPAND_NET_ERRNO(ENOLINK) #undef EXPAND_NET_ERRNO #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which, diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 5e3f724f..e1ac0e3e 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -1854,6 +1854,9 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) goto out; ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); + if (ret == 0) + scoutfs_inc_counter(sb, inode_deleted); + out: if (clear_trying) clear_bit(bit_nr, ldata->trying); @@ -1962,6 +1965,8 @@ static void iput_worker(struct work_struct *work) while (count-- > 0) iput(inode); + cond_resched(); + /* can't touch inode after final iput */ spin_lock(&inf->iput_lock); diff --git a/kmod/src/net.c b/kmod/src/net.c index 613cf169..8636452e 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -332,7 +332,7 @@ static int submit_send(struct super_block *sb, return -EINVAL; if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; msend = kmalloc(offsetof(struct message_send, nh.data[data_len]), GFP_NOFS); diff --git 
a/kmod/src/quorum.c b/kmod/src/quorum.c index 7308b1dc..1c42188e 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -520,10 +520,10 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool set_quorum_block_event(sb, &blk, event, term); ret = write_quorum_block(sb, blkno, &blk); if (ret < 0) - scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu", + scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu", ret, blkno, event, term); } else { - scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu", + scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu", ret, blkno, event, term); } @@ -822,6 +822,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* followers and candidates start new election on timeout */ if (qst.role != LEADER && + msg.type == SCOUTFS_QUORUM_MSG_INVALID && ktime_after(ktime_get(), qst.timeout)) { /* .. but only if their server has stopped */ if (!scoutfs_server_is_down(sb)) { @@ -982,7 +983,10 @@ static void scoutfs_quorum_worker(struct work_struct *work) } /* record that this slot no longer has an active quorum */ - update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true); + err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true); + if (err < 0 && ret == 0) + ret = err; + out: if (ret < 0) { scoutfs_err(sb, "quorum service saw error %d, shutting down. This mount is no longer participating in quorum. 
It should be remounted to restore service.", diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 930a275a..56da1bff 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -823,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy, ); TRACE_EVENT(scoutfs_xattr_set, - TP_PROTO(struct super_block *sb, size_t name_len, const void *value, - size_t size, int flags), + TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len, + const void *value, size_t size, int flags), - TP_ARGS(sb, name_len, value, size, flags), + TP_ARGS(sb, ino, name_len, value, size, flags), TP_STRUCT__entry( SCSB_TRACE_FIELDS + __field(__u64, ino) __field(size_t, name_len) __field(const void *, value) __field(size_t, size) @@ -838,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set, TP_fast_assign( SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; __entry->name_len = name_len; __entry->value = value; __entry->size = size; __entry->flags = flags; ), - TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x", - SCSB_TRACE_ARGS, __entry->name_len, __entry->value, - __entry->size, __entry->flags) + TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x", + SCSB_TRACE_ARGS, __entry->ino, __entry->name_len, + __entry->value, __entry->size, __entry->flags) ); TRACE_EVENT(scoutfs_advance_dirty_super, @@ -1966,15 +1968,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down, ); DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, - exceeded), + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, 
exceeded), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(int, holding) __field(int, applying) __field(int, nr_holders) + __field(u32, budget) __field(__u32, avail_before) __field(__u32, freed_before) __field(int, committing) @@ -1985,35 +1989,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class, __entry->holding = !!holding; __entry->applying = !!applying; __entry->nr_holders = nr_holders; + __entry->budget = budget; __entry->avail_before = avail_before; __entry->freed_before = freed_before; __entry->committing = !!committing; __entry->exceeded = !!exceeded; ), - TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u", - SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders, - __entry->avail_before, __entry->freed_before, __entry->committing, - __entry->exceeded) + TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u", + SCSB_TRACE_ARGS, __entry->holding, __entry->applying, + __entry->nr_holders, __entry->budget, + __entry->avail_before, __entry->freed_before, + __entry->committing, __entry->exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, 
nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); #define slt_symbolic(mode) \ @@ -2451,6 +2465,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref, __entry->block_blkno, __entry->block_seq) ); +TRACE_EVENT(scoutfs_get_file_block, + TP_PROTO(struct super_block *sb, u64 blkno, int flags), + + TP_ARGS(sb, blkno, flags), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, blkno) + __field(int, flags) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->blkno = blkno; + __entry->flags = flags; + ), + + TP_printk(SCSBF" blkno %llu flags 
0x%x", + SCSB_TRACE_ARGS, __entry->blkno, __entry->flags) +); + TRACE_EVENT(scoutfs_block_stale, TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref, struct scoutfs_block_header *hdr, u32 magic, u32 crc), @@ -3048,6 +3083,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv, TP_ARGS(sb, sc) ); +TRACE_EVENT(scoutfs_ioc_search_xattrs, + TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino), + + TP_ARGS(sb, ino, last_ino), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(u64, ino) + __field(u64, last_ino) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; + __entry->last_ino = last_ino; + ), + + TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS, + __entry->ino, __entry->last_ino) +); + #endif /* _TRACE_SCOUTFS_H */ /* This part must be outside protection */ diff --git a/kmod/src/server.c b/kmod/src/server.c index 9244f8d2..4c9de90f 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -65,6 +65,7 @@ struct commit_users { struct list_head holding; struct list_head applying; unsigned int nr_holders; + u32 budget; u32 avail_before; u32 freed_before; bool committing; @@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers) do { \ __typeof__(cusers) _cusers = (cusers); \ trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \ - !list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \ - _cusers->freed_before, _cusers->committing, _cusers->exceeded); \ + !list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget, \ + _cusers->avail_before, _cusers->freed_before, _cusers->committing, \ + _cusers->exceeded); \ } while (0) struct server_info { @@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv u32 freed_used; u32 avail_now; u32 freed_now; - u32 budget; assert_spin_locked(&cusers->lock); @@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info 
*serv else freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now; - budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET; - if (avail_used <= budget && freed_used <= budget) + if (avail_used <= cusers->budget && freed_used <= cusers->budget) return; exceeded_once = true; cusers->exceeded = cusers->nr_holders; - scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u", - cusers->nr_holders, cusers->avail_before, avail_now, + scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u", + cusers->budget, cusers->avail_before, avail_now, cusers->freed_before, freed_now); list_for_each_entry(hold, &cusers->holding, entry) { @@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, { bool has_room; bool held; - u32 budget; + u32 new_budget; u32 av; u32 fr; @@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, } /* +2 for our additional hold and then for the final commit work the server does */ - budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET; - has_room = av >= budget && fr >= budget; + new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET); + has_room = av >= new_budget && fr >= new_budget; /* checking applying so holders drain once an apply caller starts waiting */ held = !cusers->committing && has_room && list_empty(&cusers->applying); @@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, list_add_tail(&hold->entry, &cusers->holding); cusers->nr_holders++; + cusers->budget = new_budget; } else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) { cusers->committing = true; @@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int list_for_each_entry_safe(hold, tmp, &cusers->applying, entry) list_del_init(&hold->entry); cusers->committing = false; + cusers->budget = 0; 
spin_unlock(&cusers->lock); wake_up(&cusers->waitq); @@ -608,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; goto out; } @@ -1038,6 +1040,101 @@ static int next_log_merge_item(struct super_block *sb, return next_log_merge_item_key(sb, root, zone, &key, val, val_len); } +static int do_finalize_ours(struct super_block *sb, + struct scoutfs_log_trees *lt, + struct commit_hold *hold) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb); + struct scoutfs_key key; + char *err_str = NULL; + u64 rid = le64_to_cpu(lt->rid); + bool more; + int ret; + int err; + + mutex_lock(&server->srch_mutex); + ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri, + &super->srch_root, <->srch_file, true); + mutex_unlock(&server->srch_mutex); + if (ret < 0) { + scoutfs_err(sb, "error rotating srch log for rid %016llx: %d", + rid, ret); + return ret; + } + + do { + more = false; + + /* + * All of these can return errors, perhaps indicating successful + * partial progress, after having modified the allocator trees. + * We always have to update the roots in the log item. 
+ */ + mutex_lock(&server->alloc_mutex); + ret = (err_str = "splice meta_freed to other_freed", + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, server->other_freed, + <->meta_freed)) ?: + (err_str = "splice meta_avail", + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, server->other_freed, + <->meta_avail)) ?: + (err_str = "empty data_avail", + alloc_move_empty(sb, &super->data_alloc, + <->data_avail, + COMMIT_HOLD_ALLOC_BUDGET / 2)) ?: + (err_str = "empty data_freed", + alloc_move_empty(sb, &super->data_alloc, + <->data_freed, + COMMIT_HOLD_ALLOC_BUDGET / 2)); + mutex_unlock(&server->alloc_mutex); + + /* + * only finalize, allowing merging, once the allocators are + * fully freed + */ + if (ret == 0) { + /* the transaction is no longer open */ + le64_add_cpu(<->flags, SCOUTFS_LOG_TREES_FINALIZED); + lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + } + + scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr)); + + err = scoutfs_btree_update(sb, &server->alloc, &server->wri, + &super->logs_root, &key, lt, + sizeof(*lt)); + BUG_ON(err != 0); /* alloc, log, srch items out of sync */ + + if (ret == -EINPROGRESS) { + more = true; + mutex_unlock(&server->logs_mutex); + ret = server_apply_commit(sb, hold, 0); + if (ret < 0) + WARN_ON_ONCE(ret < 0); + server_hold_commit(sb, hold); + mutex_lock(&server->logs_mutex); + } else if (ret == 0) { + memset(<->item_root, 0, sizeof(lt->item_root)); + memset(<->bloom_ref, 0, sizeof(lt->bloom_ref)); + lt->inode_count_delta = 0; + lt->max_item_seq = 0; + lt->finalize_seq = 0; + le64_add_cpu(<->nr, 1); + lt->flags = 0; + } + } while (more); + + if (ret < 0) { + scoutfs_err(sb, + "error %d finalizing log trees for rid %016llx: %s", + ret, rid, err_str); + } + + return ret; +} + /* * Finalizing the log btrees for merging needs to be done carefully so * that items don't appear to go backwards in time. 
@@ -1089,7 +1186,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l struct scoutfs_log_merge_range rng; struct scoutfs_mount_options opts; struct scoutfs_log_trees each_lt; - struct scoutfs_log_trees fin; unsigned int delay_ms; unsigned long timeo; bool saw_finalized; @@ -1160,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l /* done if we're not finalizing and there's no finalized */ if (!finalize_ours && !saw_finalized) { ret = 0; + scoutfs_inc_counter(sb, log_merge_no_finalized); break; } @@ -1194,32 +1291,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l /* Finalize ours if it's visible to others */ if (ours_visible) { - fin = *lt; - memset(&fin.meta_avail, 0, sizeof(fin.meta_avail)); - memset(&fin.meta_freed, 0, sizeof(fin.meta_freed)); - memset(&fin.data_avail, 0, sizeof(fin.data_avail)); - memset(&fin.data_freed, 0, sizeof(fin.data_freed)); - memset(&fin.srch_file, 0, sizeof(fin.srch_file)); - le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED); - fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); - - scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid), - le64_to_cpu(fin.nr)); - ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, - &super->logs_root, &key, &fin, - sizeof(fin)); + ret = do_finalize_ours(sb, lt, hold); if (ret < 0) { - err_str = "updating finalized log_trees"; + err_str = "finalizing ours"; break; } - - memset(<->item_root, 0, sizeof(lt->item_root)); - memset(<->bloom_ref, 0, sizeof(lt->bloom_ref)); - lt->inode_count_delta = 0; - lt->max_item_seq = 0; - lt->finalize_seq = 0; - le64_add_cpu(<->nr, 1); - lt->flags = 0; } /* wait a bit for mounts to arrive */ @@ -1678,8 +1754,8 @@ unlock: ret = server_apply_commit(sb, &hold, ret); if (ret < 0) - scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s", - ret, rid, err_str); + scoutfs_err(sb, "server error %d committing client logs for rid 
%016llx, nr %llu: %s", + ret, rid, le64_to_cpu(lt.nr), err_str); out: WARN_ON_ONCE(ret < 0); return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); @@ -1814,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid) out: mutex_unlock(&server->logs_mutex); + if (ret == 0) + scoutfs_inc_counter(sb, reclaimed_open_logs); + if (ret < 0 && ret != -EINPROGRESS) scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s", ret, rid, err_str); @@ -2531,7 +2610,7 @@ static void server_log_merge_free_work(struct work_struct *work) ret = scoutfs_btree_free_blocks(sb, &server->alloc, &server->wri, &fr.key, - &fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2); + &fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8); if (ret < 0) { err_str = "freeing log btree"; break; @@ -2550,7 +2629,7 @@ static void server_log_merge_free_work(struct work_struct *work) /* freed blocks are in allocator, we *have* to update fr */ BUG_ON(ret < 0); - if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) { + if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) { mutex_unlock(&server->logs_mutex); ret = server_apply_commit(sb, &hold, ret); commit = false; diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 99ea6448..a77c5eff 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -62,7 +62,7 @@ * re-allocated and re-written. Search can restart by checking the * btree for the current set of files. Compaction reads log files which * are protected from other compactions by the persistent busy items - * created by the server. Compaction won't see it's blocks reused out + * created by the server. Compaction won't see its blocks reused out * from under it, but it can encounter stale cached blocks that need to * be invalidated. 
*/ @@ -442,6 +442,10 @@ out: if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks)) sfl->blocks = cpu_to_le64(blk + 1); + if (bl) { + trace_scoutfs_get_file_block(sb, bl->blkno, flags); + } + *bl_ret = bl; return ret; } @@ -533,23 +537,35 @@ out: * the pairs cancel each other out by all readers (the second encoding * looks like deletion) so they aren't visible to the first/last bounds of * the block or file. + * + * We use the same entry repeatedly, so the diff between them will be empty. + * This lets us just emit the two-byte count word, leaving the other bytes + * as zero. + * + * Split the desired total len into two pieces, adding any remainder to the + * first four-bit value. */ -static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk, - struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre) +static void append_padded_entry(struct scoutfs_srch_file *sfl, + struct scoutfs_srch_block *srb, + int len) { - int ret; + int each; + int rem; + u16 lengths = 0; + u8 *buf = srb->entries + le32_to_cpu(srb->entry_bytes); - ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes), - sre, &srb->tail); - if (ret > 0) { - srb->tail = *sre; - le32_add_cpu(&srb->entry_nr, 1); - le32_add_cpu(&srb->entry_bytes, ret); - le64_add_cpu(&sfl->entries, 1); - ret = 0; - } + each = (len - 2) >> 1; + rem = (len - 2) & 1; - return ret; + lengths |= each + rem; + lengths |= each << 4; + + memset(buf, 0, len); + put_unaligned_le16(lengths, buf); + + le32_add_cpu(&srb->entry_nr, 1); + le32_add_cpu(&srb->entry_bytes, len); + le64_add_cpu(&sfl->entries, 1); } /* @@ -560,61 +576,41 @@ static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk, * This is called when there is a single existing entry in the block. * We have the entire block to work with. We encode pairs of matching * entries. This hides them from readers (both searches and merging) as - * they're interpreted as creation and deletion and are deleted. 
We use - * the existing hash value of the first entry in the block but then set - * the inode to an impossibly large number so it doesn't interfere with - * anything. + * they're interpreted as creation and deletion and are deleted. * - * To hit the specific offset we very carefully manage the amount of - * bytes of change between fields in the entry. We know that if we - * change all the byte of the ino and id we end up with a 20 byte - * (2+8+8,2) encoding of the pair of entries. To have the last entry - * start at the _SAFE_POS offset we know that the final 20 byte pair - * encoding needs to end at 2 bytes (second entry encoding) after the - * _SAFE_POS offset. + * For simplicity and to maintain sort ordering within the block, we reuse + * the existing entry. This lets us skip the encoding step, because we know + * the diff will be zero. We can zero-pad the resulting entries to hit the + * target offset exactly. * - * So as we encode pairs we watch the delta of our current offset from - * that desired final offset of 2 past _SAFE_POS. If we're a multiple - * of 20 away then we encode the full 20 byte pairs. If we're not, then - * we drop a byte to encode 19 bytes. That'll slowly change the offset - * to be a multiple of 20 again while encoding large entries. + * Because we can't predict the exact number of entry_bytes when we start, + * we adjust the byte count of subsequent entries until we wind up at a + * multiple of 20 bytes away from our goal and then use that length for + * the remaining entries. + * + * We could just use a single pair of unnaturally large entries to consume + * the needed space, adjusting for an odd number of entry_bytes if necessary. + * The use of 19 or 20 bytes for the entry pair matches what we would see with + * real (non-zero) entries that vary from the existing entry. 
*/ -static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk, +static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, struct scoutfs_srch_block *srb) { - struct scoutfs_srch_entry sre; u32 target; s32 diff; - u64 hash; - u64 ino; - u64 id; - int ret; - - hash = le64_to_cpu(srb->tail.hash); - ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62); - id = le64_to_cpu(srb->tail.id); target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2; while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) { - ino ^= 1ULL << (7 * 8); + append_padded_entry(sfl, srb, 10); if (diff % 20 == 0) { - id ^= 1ULL << (7 * 8); + append_padded_entry(sfl, srb, 10); } else { - id ^= 1ULL << (6 * 8); + append_padded_entry(sfl, srb, 9); } - - sre.hash = cpu_to_le64(hash); - sre.ino = cpu_to_le64(ino); - sre.id = cpu_to_le64(id); - - ret = append_padded_entry(sfl, blk, srb, &sre); - if (ret == 0) - ret = append_padded_entry(sfl, blk, srb, &sre); - BUG_ON(ret != 0); - - diff = target - le32_to_cpu(srb->entry_bytes); } + + WARN_ON_ONCE(diff != 0); } /* @@ -749,14 +745,14 @@ static int search_log_file(struct super_block *sb, for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) { if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, &sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } pos += ret; @@ -859,14 +855,14 @@ static int search_sorted_file(struct super_block *sb, if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, &sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } pos += ret; @@ -972,6 +968,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb, scoutfs_inc_counter(sb, srch_search_xattrs); + trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino); + *done = false; srch_init_rb_root(sroot); @@ -1669,7 
+1667,7 @@ static int kway_merge(struct super_block *sb, /* end sorted block on _SAFE offset for testing */ if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input && scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) { - pad_entries_at_safe(sfl, blk, srb); + pad_entries_at_safe(sfl, srb); scoutfs_block_put(sb, bl); bl = NULL; blk++; @@ -1802,7 +1800,7 @@ static void swap_page_sre(void *A, void *B, int size) * typically, ~10x worst case). * * Because we read and sort all the input files we must perform the full - * compaction in one operation. The server must have given us a + * compaction in one operation. The server must have given us * sufficiently large avail/freed lists, otherwise we'll return ENOSPC. */ static int compact_logs(struct super_block *sb, @@ -1866,14 +1864,14 @@ static int compact_logs(struct super_block *sb, if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; goto out; } prev = *sre; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 10494c43..d131bfa1 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -196,7 +196,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block } if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; break; } @@ -252,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work) } if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; goto out; } diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index 09db9927..7805b364 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_ int ret; int err; - trace_scoutfs_xattr_set(sb, name_len, value, size, flags); + trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags); if (WARN_ON_ONCE(tgs->totl && tgs->indx) || 
WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock)) diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh index 294f9c72..b5a1a3cf 100644 --- a/tests/funcs/filter.sh +++ b/tests/funcs/filter.sh @@ -140,6 +140,9 @@ t_filter_dmesg() re="$re|scoutfs .* error.*server failed to bind to.*" re="$re|scoutfs .* critical transaction commit failure.*" + # ENOLINK (-67) indicates an expected forced unmount error + re="$re|scoutfs .* error -67 .*" + # change-devices causes loop device resizing re="$re|loop: module loaded" re="$re|loop[0-9].* detected capacity change from.*" diff --git a/tests/golden/large-fragmented-free b/tests/golden/large-fragmented-free index 7740849c..58aea5f7 100644 --- a/tests/golden/large-fragmented-free +++ b/tests/golden/large-fragmented-free @@ -1,4 +1,3 @@ -== setting longer hung task timeout == creating fragmented extents == unlink file with moved extents to free extents per block == cleanup diff --git a/tests/golden/offline-extent-waiting b/tests/golden/offline-extent-waiting index 9ea47d4f..7fcd8f19 100644 --- a/tests/golden/offline-extent-waiting +++ b/tests/golden/offline-extent-waiting @@ -49,7 +49,7 @@ offline wating should be empty: 0 == truncating does wait truncate should be waiting for first block: -trunate should no longer be waiting: +truncate should no longer be waiting: 0 == writing waits should be waiting for write diff --git a/tests/tests/format-version-forward-back.sh b/tests/tests/format-version-forward-back.sh index a5dde26b..04d6f907 100644 --- a/tests/tests/format-version-forward-back.sh +++ b/tests/tests/format-version-forward-back.sh @@ -11,8 +11,8 @@ # format version. # -# not supported on el9! 
-if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then +# not supported on el8 or higher +if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then t_skip_permitted "Unsupported OS version" fi diff --git a/tests/tests/large-fragmented-free.sh b/tests/tests/large-fragmented-free.sh index 33f3bef7..e27d1bbe 100644 --- a/tests/tests/large-fragmented-free.sh +++ b/tests/tests/large-fragmented-free.sh @@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600 EXTENTS_PER_LIST_BLOCK=8192 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK)) -# -# This test specifically creates a pathologically sparse file that will -# be as expensive as possible to free. This is usually fine on -# dedicated or reasonable hardware, but trying to run this in -# virtualized debug kernels can take a very long time. This test is -# about making sure that the server doesn't fail, not that the platform -# can handle the scale of work that our btree formats happen to require -# while execution is bogged down with use-after-free memory reference -# tracking. So we give the test a lot more breathing room before -# deciding that its hung. 
-# -echo "== setting longer hung task timeout" -if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then - secs=$(cat /proc/sys/kernel/hung_task_timeout_secs) - test "$secs" -gt 0 || \ - t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs" - restore_hung_task_timeout() - { - echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs - } - trap restore_hung_task_timeout EXIT - echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs -fi - echo "== creating fragmented extents" fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move" diff --git a/tests/tests/offline-extent-waiting.sh b/tests/tests/offline-extent-waiting.sh index 31b04b59..dda3b7d4 100644 --- a/tests/tests/offline-extent-waiting.sh +++ b/tests/tests/offline-extent-waiting.sh @@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:" expect_wait "$DIR/file" "change_size" $ino 0 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES sleep .1 -echo "trunate should no longer be waiting:" +echo "truncate should no longer be waiting:" scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l cat "$DIR/golden" > "$DIR/file" vers=$(scoutfs stat -s data_version "$DIR/file") @@ -168,10 +168,13 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES # overwrite, not truncate+write dd if="$DIR/other" of="$DIR/file" \ bs=$BS count=$BLOCKS conv=notrunc status=none & +pid="$!" 
sleep .1 echo "should be waiting for write" expect_wait "$DIR/file" "write" $ino 0 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES +# wait for the background dd to complete +wait "$pid" 2> /dev/null cmp "$DIR/file" "$DIR/other" echo "== cleanup" diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index 414c747b..2bfddacc 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -67,18 +67,49 @@ t_mount_all while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do sleep .5 done -# wait for orphan scans to run -t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 -# also have to wait for delayed log merge work from mount -C=120 -while (( C-- )); do - brk=1 - for ino in $inos; do - inode_exists $ino && brk=0 - done - test $brk -eq 1 && break + + +sv=$(t_server_nr) + +# wait for reclaim_open_log_tree() to complete for each mount +while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do sleep 1 done + +# wait for finalize_and_start_log_merge() to find no active merges in flight +# and not find any finalized trees +while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do + sleep 1 +done + +# wait for orphan scans to run +t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 +# wait until we see two consecutive orphan scan attempts without +# any inode deletion forward progress in each mount +for nr in $(t_fs_nrs); do + C=0 + LOSA=$(t_counter orphan_scan_attempts $nr) + LDOP=$(t_counter inode_deleted $nr) + + while [ $C -lt 2 ]; do + sleep 1 + + OSA=$(t_counter orphan_scan_attempts $nr) + DOP=$(t_counter inode_deleted $nr) + + if [ $OSA != $LOSA ]; then + if [ $DOP == $LDOP ]; then + (( C++ )) + else + C=0 + fi + fi + + LOSA=$OSA + LDOP=$DOP + done +done + for ino in $inos; do inode_exists $ino && echo "$ino still exists" done diff --git a/utils/sparse.sh b/utils/sparse.sh index 0b93324f..adef573b 100755 --- a/utils/sparse.sh +++ b/utils/sparse.sh @@ -22,6 +22,11 @@ 
RE="$RE|warning: memset with byte count of 4194304" # some sparse versions don't know about some builtins RE="$RE|error: undefined identifier '__builtin_fpclassify'" +# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE +# is defined, and we need that for O_DIRECT. +RE="$RE|note: in included file .through /usr/include/sys/stat.h.:" +RE="$RE|/usr/include/bits/statx.h:30:6: error: " + # # don't filter out 'too many errors' here, it can signify that # sparse doesn't understand something and is throwing a *ton*