diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c
index d556112e..ac9601b8 100644
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
  *
  * Unlike meta allocations, the caller is expected to serialize
  * allocations from the root.
+ *
+ * ENOBUFS is returned if the data allocator ran out of space and we can
+ * probably refill it from the server.  The caller is expected to back
+ * out, commit the transaction, and try again.
+ *
+ * ENOSPC is returned if the data allocator ran out of space but we have
+ * a flag from the server telling us that there's no more space
+ * available.  This is a hard error and should be returned.
  */
 int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
@@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	ret = 0;
 out:
 	if (ret < 0) {
-		/*
-		 * Special retval meaning there wasn't space to alloc from
-		 * this txn.  Doesn't mean filesystem is completely full.
-		 * Maybe upper layers want to try again.
-		 */
-		if (ret == -ENOENT)
-			ret = -ENOBUFS;
+		if (ret == -ENOENT) {
+			if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
+				ret = -ENOSPC;
+			else
+				ret = -ENOBUFS;
+		}
+
 		*blkno_ret = 0;
 		*count_ret = 0;
 	} else {
@@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
 	return lo;
 }
 
+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			     struct scoutfs_alloc *alloc, u32 flag)
+{
+	unsigned int seq;
+	bool set;
+
+	do {
+		seq = read_seqbegin(&alloc->seqlock);
+		set = !!(le32_to_cpu(alloc->avail.flags) & flag);
+	} while (read_seqretry(&alloc->seqlock, seq));
+
+	return set;
+}
+
 /*
  * Call the callers callback for every persistent allocator structure
  * we can find.
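
The two errno values above give data writers a two-level response: ENOBUFS is
recoverable within the mount, ENOSPC is final.  A minimal sketch of the caller
pattern this contract expects (illustrative only; the real argument lists are
elided and commit_and_retry() is a hypothetical stand-in for the caller's
actual back-out/commit path):

	ret = scoutfs_alloc_data(sb, alloc, wri /* , ... */);
	if (ret == -ENOBUFS) {
		/* allocator empty but refillable: commit this
		 * transaction so the server refills it, then retry */
		ret = commit_and_retry(sb);
	} else if (ret == -ENOSPC) {
		/* the server set SCOUTFS_ALLOC_FLAG_LOW in the root:
		 * hard failure, hand it up unchanged */
	}
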
diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h
index 9130d086..5a95d98c 100644
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -38,6 +38,10 @@
 #define SCOUTFS_ALLOC_DATA_LG_THRESH \
 	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
 
+/* the client will force commits if data allocators get too low */
+#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
+	((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+
 /*
  * Fill client alloc roots to the target when they fall below the lo
  * threshold.
@@ -55,6 +59,7 @@
 #define SCOUTFS_SERVER_DATA_FILL_LO \
 	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
 
+
 /*
  * Log merge meta allocations are only used for one request and will
  * never use more than the dirty limit.
@@ -65,16 +70,6 @@
 	((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
 #define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET
 
-/*
- * Each of the server meta_alloc roots will try to keep a minimum amount
- * of free blocks.  The server will swap roots when its current avail
- * falls below the threshold while the freed root is still above it.  It
- * must have room for all the largest allocation attempted in a
- * transaction on the server.
- */
-#define SCOUTFS_SERVER_META_ALLOC_MIN \
-	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
-
 /*
  * A run-time use of a pair of persistent avail/freed roots as a
  * metadata allocator.  It has the machinery needed to lock and avoid
@@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
 
 bool scoutfs_alloc_meta_low(struct super_block *sb,
 			    struct scoutfs_alloc *alloc, u32 nr);
+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			     struct scoutfs_alloc *alloc, u32 flag);
 
 typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb,
 					  void *arg, int owner, u64 id,
diff --git a/kmod/src/data.c b/kmod/src/data.c
index caf26657..4d710496 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 
 	while (iblock <= last) {
 		if (inode)
-			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
-							    true);
+			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 		else
-			ret = scoutfs_hold_trans(sb);
+			ret = scoutfs_hold_trans(sb, false);
 		if (ret)
 			break;
 
@@ -756,8 +755,7 @@ retry:
 		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
 						  true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
-							ind_seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
 	} while (ret > 0);
 	if (ret < 0)
 		goto out;
@@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	while(iblock <= last) {
 
-		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 		if (ret)
 			goto out;
 
@@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	}
 
 	/* we're updating meta_seq with offline block count */
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 	if (ret < 0)
 		goto out;
 
@@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
 		if (ret > 0)
 			continue;
 		if (ret < 0)
@@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
 	return ret;
 }
 
-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
+/*
+ * Return true if the data allocator is lower than the caller's
+ * requirement and we haven't been told by the server that we're out of
+ * free extents.
+ */
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
 {
 	DECLARE_DATA_INFO(sb, datinf);
 
-	return scoutfs_dalloc_total_len(&datinf->dalloc) <<
-		SCOUTFS_BLOCK_SM_SHIFT;
-
+	return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
+	       !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
 }
 
 int scoutfs_data_setup(struct super_block *sb)
diff --git a/kmod/src/data.h b/kmod/src/data.h
index 4f51a8c2..064564f6 100644
--- a/kmod/src/data.h
+++ b/kmod/src/data.h
@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
 void scoutfs_data_get_btrees(struct super_block *sb,
 			     struct scoutfs_log_trees *lt);
 int scoutfs_data_prepare_commit(struct super_block *sb);
-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);
 int scoutfs_data_setup(struct super_block *sb);
 void scoutfs_data_destroy(struct super_block *sb);
diff --git a/kmod/src/dir.c b/kmod/src/dir.c
index 79a276da..c6eb331d 100644
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -712,7 +712,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -869,7 +869,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -969,7 +969,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1641,7 +1641,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 654da558..fb6c1f4f 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head {
 	struct scoutfs_block_ref ref;
 	__le64 total_nr;
 	__le32 first_nr;
-	__u8 __pad[4];
+	__le32 flags;
 };
 
+
 /*
  * While the main allocator uses extent items in btree blocks, metadata
  * allocations for a single transaction are recorded in arrays in
@@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block {
  */
 struct scoutfs_alloc_root {
 	__le64 total_len;
+	__le32 flags;
+	__le32 _pad;
 	struct scoutfs_btree_root root;
 };
 
+/* Shared by _alloc_list_head and _alloc_root */
+#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0)
+
 /* types of allocators, exposed to alloc_detail ioctl */
 #define SCOUTFS_ALLOC_OWNER_NONE	0
 #define SCOUTFS_ALLOC_OWNER_SERVER	1
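
Because both new flags fields occupy bytes that were previously written as
zeroed padding, existing images read back with no flags set and the struct
sizes are unchanged.  A sketch of the compile-time guards such a format
change usually carries (not part of this patch; the 32-byte figure assumes
struct scoutfs_block_ref is a pair of __le64s, as the print.c format macros
later in this series suggest):

	/* hypothetical layout guards, not in this patch: the low flag
	 * relies on flags landing exactly where the padding used to be */
	BUILD_BUG_ON(sizeof(struct scoutfs_alloc_list_head) != 32);
	BUILD_BUG_ON(offsetof(struct scoutfs_alloc_list_head, flags) != 28);
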
diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index 3911b74e..cfabc332 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -358,7 +358,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 	if (ret)
 		return ret;
 
@@ -385,7 +385,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 	LIST_HEAD(ind_locks);
 	int ret;
 
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		return ret;
 
@@ -500,7 +500,7 @@ retry:
 		}
 	}
 
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		goto out;
 
@@ -1213,7 +1213,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
  * Returns > 0 if the seq changed and the locks should be retried.
  */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq)
+				      struct list_head *list, u64 seq, bool allocing)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1229,7 +1229,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 		goto out;
 	}
 
-	ret = scoutfs_hold_trans(sb);
+	ret = scoutfs_hold_trans(sb, allocing);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1243,7 +1243,7 @@ out:
 }
 
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq)
+				  bool set_data_seq, bool allocing)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1253,7 +1253,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode, set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
 	} while (ret > 0);
 
 	return ret;
@@ -1533,7 +1533,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
diff --git a/kmod/src/inode.h b/kmod/src/inode.h
index 805eb237..7cb61b57 100644
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -88,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq);
+				      struct list_head *list, u64 seq, bool allocing);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq);
+				  bool set_data_seq, bool allocing);
 void scoutfs_inode_index_unlock(struct super_block *sb,
 				struct list_head *list);
 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index b323b9a1..cb3f4a4e 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -38,6 +38,7 @@
 #include "hash.h"
 #include "srch.h"
 #include "alloc.h"
+#include "server.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 
 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
 	if (ret)
 		goto unlock;
 
@@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	sfm.rid = sbi->rid;
 	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
 	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
+	sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);
 
 	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
 	if (ret)
diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h
index 5042edfe..446611e9 100644
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more {
 	__u64 committed_seq;
 	__u64 total_meta_blocks;
 	__u64 total_data_blocks;
+	__u64 reserved_meta_blocks;
 };
 
 #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
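
A df-style consumer picks the new field up through the existing statfs_more
ioctl.  A hedged userspace sketch (the "ioctl.h" include path is an
assumption, and any validity/size handshake the interface may require before
the call is elided here):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* the updated scoutfs ioctl.h above */

	int main(int argc, char **argv)
	{
		struct scoutfs_ioctl_statfs_more sfm;
		int fd;

		memset(&sfm, 0, sizeof(sfm));
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, SCOUTFS_IOC_STATFS_MORE, &sfm) < 0) {
			perror("statfs_more");
			return 1;
		}
		printf("reserved_meta_blocks %llu\n",
		       (unsigned long long)sfm.reserved_meta_blocks);
		return 0;
	}
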
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index bc9c4797..b92471fd 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func,
 );
 
 DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
 
-	TP_ARGS(sb, journal_info, holders),
+	TP_ARGS(sb, journal_info, holders, ret),
 
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(unsigned long, journal_info)
 		__field(int, holders)
+		__field(int, ret)
 	),
 
 	TP_fast_assign(
@@ -440,17 +441,18 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
 		__entry->holders = holders;
+		__entry->ret = ret;
 	),
-	TP_printk(SCSBF" journal_info 0x%0lx holders %d",
-		  SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
+	TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
+		  SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
 );
 
-DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
-	TP_ARGS(sb, journal_info, holders)
+DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
+	TP_ARGS(sb, journal_info, holders, ret)
 );
 
 DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
-	TP_ARGS(sb, journal_info, holders)
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
+	TP_ARGS(sb, journal_info, holders, ret)
 );
 
 TRACE_EVENT(scoutfs_ioc_release,
diff --git a/kmod/src/server.c b/kmod/src/server.c
index 9e8307b8..0bcabc45 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 	struct commit_waiter *cw;
 	struct commit_waiter *pos;
 	struct llist_node *node;
+	u64 reserved;
 	int ret;
 
 	trace_scoutfs_server_commit_work_enter(sb, 0, 0);
@@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 	server->other_avail = &super->server_meta_avail[server->other_ind];
 	server->other_freed = &super->server_meta_freed[server->other_ind];
 
-	/* swap avail/free if avail gets low and freed is high */
-	if (le64_to_cpu(server->meta_avail->total_len) <=
-	    SCOUTFS_SERVER_META_ALLOC_MIN &&
-	    le64_to_cpu(server->meta_freed->total_len) >
-	    SCOUTFS_SERVER_META_ALLOC_MIN)
+	/*
+	 * The reserved metadata blocks include the max size of
+	 * outstanding allocators and a server transaction could be
+	 * asked to refill all those allocators from meta_avail.  If our
+	 * meta_avail falls below the reserved count, and freed is still
+	 * above it, then swap so that we don't start returning enospc
+	 * until we're truly low.
+	 */
+	reserved = scoutfs_server_reserved_meta_blocks(sb);
+	if (le64_to_cpu(server->meta_avail->total_len) <= reserved &&
+	    le64_to_cpu(server->meta_freed->total_len) > reserved)
 		swap(server->meta_avail, server->meta_freed);
 
 	ret = 0;
@@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb,
 			       dst, src, le64_to_cpu(src->total_len),
 			       NULL, NULL, 0);
 }
 
+/*
+ * Copy on write transactions need to allocate new dirty blocks as they
+ * make modifications to delete items and eventually free more blocks.
+ * The reserved blocks are meant to keep enough available blocks in
+ * flight to allow servers and clients to perform transactions that
+ * don't consume additional space.  We have quite a few allocators in
+ * flight across the server and various client mechanisms (posix items,
+ * srch compaction, and log merging).  We also want to include
+ * sufficient blocks for client log btrees to grow tall enough to be
+ * finalized and merged.
+ *
+ * The reserved blocks calculation is a policy of the server but it's
+ * exposed to the statfs_more interface so that df isn't misleading.
+ * Requiring this synchronization without explicit protocol
+ * communication isn't great.
+ */
+u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	u64 server_blocks;
+	u64 client_blocks;
+	u64 log_blocks;
+	u64 nr_clients;
+
+	/* server has two meta_avail lists it swaps between */
+	server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
+
+	/*
+	 * Log trees will be compacted once they hit a height of 3.
+	 * That'll be the grandparent, two parents resulting from a
+	 * split, and all their child blocks (roughly calculated,
+	 * overestimating).
+	 */
+	log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE /
+			  (sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref)));
+
+	/*
+	 * Each client can have a meta_avail list, srch compaction
+	 * request, log merge request, and a log btree it's building.
+	 */
+	client_blocks = SCOUTFS_SERVER_META_FILL_TARGET + SCOUTFS_SERVER_META_FILL_TARGET +
+			SCOUTFS_SERVER_MERGE_FILL_TARGET + log_blocks;
+
+	/* we should reserve for voting majority, too */
+	spin_lock(&server->lock);
+	nr_clients = server->nr_clients;
+	spin_unlock(&server->lock);
+
+	return server_blocks + (max(1ULL, nr_clients) * client_blocks);
+}
+
 /*
  * Set all the bits in the destination which overlap with the extent.
  */
@@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb,
 	struct scoutfs_log_trees lt;
 	struct scoutfs_key key;
 	bool have_fin = false;
+	bool unlock_alloc = false;
 	u64 data_zone_blocks;
 	u64 nr;
 	int ret;
@@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb,
 		lt.nr = cpu_to_le64(nr);
 	}
 
-	/* finalize an existing root when large enough and don't have one */
-	if (lt.item_root.height > 2 && !have_fin) {
+	/*
+	 * Finalize the client log btree when it has enough leaf blocks
+	 * to allow some degree of merging concurrency.  Smaller btrees
+	 * are also finalized when meta was low so that deleted items
+	 * are merged promptly and freed blocks can bring the client out
+	 * of enospc.
+	 */
+	if (!have_fin && ((lt.item_root.height > 2) ||
+			  (le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) {
 		fin = lt;
 		memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
 		memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
@@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb,
 		data_zone_blocks = 0;
 	}
 
-	/* return freed to server for emptying, refill avail */
+	/*
+	 * Reclaim the freed meta and data allocators and refill the
+	 * avail allocators, setting low flags if they drop too low.
+	 */
 	mutex_lock(&server->alloc_mutex);
-	ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
-					server->other_freed,
+	unlock_alloc = true;
+
+	ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
 					&lt.meta_freed) ?:
-	      alloc_move_empty(sb, &super->data_alloc, &lt.data_freed) ?:
-	      scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
-				      &lt.meta_avail, server->meta_avail,
-				      SCOUTFS_SERVER_META_FILL_LO,
-				      SCOUTFS_SERVER_META_FILL_TARGET) ?:
-	      alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
-				      SCOUTFS_SERVER_DATA_FILL_LO,
-				      SCOUTFS_SERVER_DATA_FILL_TARGET,
-				      exclusive, vacant, data_zone_blocks);
-	mutex_unlock(&server->alloc_mutex);
+	      alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
 	if (ret < 0)
 		goto unlock;
 
+	ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
+				      &lt.meta_avail, server->meta_avail,
+				      SCOUTFS_SERVER_META_FILL_LO,
+				      SCOUTFS_SERVER_META_FILL_TARGET);
+	if (ret < 0)
+		goto unlock;
+
+	if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
+		lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+	else
+		lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+
+	ret = alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
+				      SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET,
+				      exclusive, vacant, data_zone_blocks);
+	if (ret < 0)
+		goto unlock;
+
+	if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
+		lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+	else
+		lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+
+	mutex_unlock(&server->alloc_mutex);
+	unlock_alloc = false;
+
 	/* record data alloc zone bits */
 	zero_data_alloc_zone_bits(&lt);
 	if (data_zone_blocks != 0) {
@@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb,
 	ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
 				  &super->logs_root, &key, &lt, sizeof(lt));
 unlock:
+	if (unlock_alloc)
+		mutex_unlock(&server->alloc_mutex);
 	mutex_unlock(&server->logs_mutex);
 
 	ret = scoutfs_server_apply_commit(sb, ret);
diff --git a/kmod/src/server.h b/kmod/src/server.h
index 79fcb443..41b808e7 100644
--- a/kmod/src/server.h
+++ b/kmod/src/server.h
@@ -56,6 +56,8 @@ do { \
 	__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
 	__entry->name##_error
 
+u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);
+
 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 				struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
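
To make the reservation policy concrete, here is the same arithmetic as
scoutfs_server_reserved_meta_blocks() as a standalone walk-through with
placeholder numbers (the constants below are illustrative stand-ins, not
scoutfs's real fill targets or item sizes):

	#include <stdint.h>
	#include <stdio.h>

	/* placeholder values, not the real scoutfs constants */
	#define META_FILL_TARGET	8192ULL		/* blocks per refilled avail list */
	#define MERGE_FILL_TARGET	4096ULL		/* blocks per log merge request */
	#define BLOCK_LG_SIZE		65536ULL
	#define ITEM_PLUS_REF		48ULL		/* sizeof(item) + sizeof(ref) */

	int main(void)
	{
		/* the server keeps two meta_avail lists it swaps between */
		uint64_t server_blocks = META_FILL_TARGET * 2;
		/* grandparent + split parents + their child block refs */
		uint64_t log_blocks = 3 + BLOCK_LG_SIZE / ITEM_PLUS_REF;
		/* avail list + srch compaction + log merge + log btree */
		uint64_t client_blocks = META_FILL_TARGET + META_FILL_TARGET +
					 MERGE_FILL_TARGET + log_blocks;
		uint64_t nr_clients = 4;	/* example mount count */

		if (nr_clients < 1)
			nr_clients = 1;		/* mirrors max(1ULL, nr_clients) */
		printf("reserved = %llu meta blocks\n", (unsigned long long)
		       (server_blocks + nr_clients * client_blocks));
		return 0;
	}

With these placeholders that comes to 16384 + 4 * 21848 = 103776 blocks
withheld from df's free count.
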
enospc */ + if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) { scoutfs_inc_counter(sb, trans_commit_data_alloc_low); return true; } @@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri) return false; } -static bool acquired_hold(struct super_block *sb) +/* + * called as a wait_event condition, needs to be careful to not change + * task state and is racing with waking paths that sub_return, test, and + * wake. + */ +static bool holders_no_writer(struct trans_info *tri) { - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - DECLARE_TRANS_INFO(sb, tri); - bool acquired; - - /* if a caller already has a hold we acquire unconditionally */ - if (inc_journal_info_holders()) { - atomic_inc(&tri->holders); - acquired = true; - goto out; - } - - /* wait if the writer is blocking holds */ - if (!inc_holders_unless_writer(tri)) { - dec_journal_info_holders(); - acquired = false; - goto out; - } - - /* wait if we're triggering another commit */ - if (commit_before_hold(sb, tri)) { - release_holders(sb); - queue_trans_work(sbi); - acquired = false; - goto out; - } - - trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders)); - acquired = true; -out: - return acquired; + smp_mb(); /* make sure task in wait_event queue before atomic read */ + return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT); } /* @@ -492,15 +469,64 @@ out: * The writing thread marks itself as a global trans_task which * short-circuits all the hold machinery so it can call code that would * otherwise try to hold transactions while it is writing. + * + * If the caller is adding metadata items that will eventually consume + * free space -- not dirtying existing items or adding deletion items -- + * then we can return enospc if our metadata allocator indicates that + * we're low on space. 
diff --git a/kmod/src/trans.h b/kmod/src/trans.h
index ab42a4cf..51ae1232 100644
--- a/kmod/src/trans.h
+++ b/kmod/src/trans.h
@@ -1,18 +1,13 @@
 #ifndef _SCOUTFS_TRANS_H_
 #define _SCOUTFS_TRANS_H_
 
-/* the server will attempt to fill data allocs for each trans */
-#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
-/* the client will force commits if data allocators get too low */
-#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)
-
 void scoutfs_trans_write_func(struct work_struct *work);
 
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);
-int scoutfs_hold_trans(struct super_block *sb);
+int scoutfs_hold_trans(struct super_block *sb, bool allocing);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
 u64 scoutfs_trans_sample_seq(struct super_block *sb);
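
The allocing argument is what threads this policy through every hold site in
the series: item-creating paths (create, link, rename, xattr set, fallocate)
pass true and can now fail fast with ENOSPC, while paths that only dirty or
delete (truncate, unlink, inode deletion) pass false so they can always run
and free space.  An illustrative fragment of an item-creating call site (not
from this patch):

	/* illustrative only: a path that adds new items opts into
	 * early enospc by holding with allocing == true */
	ret = scoutfs_hold_trans(sb, true);
	if (ret < 0)
		return ret;	/* -ENOSPC once SCOUTFS_ALLOC_FLAG_LOW is set */

	/* ... create the new items under the held transaction ... */

	scoutfs_release_trans(sb);
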
diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c
index 6c00c0c5..fd8acd8e 100644
--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 			       &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));
 
-		ret = scoutfs_hold_trans(sb);
+		ret = scoutfs_hold_trans(sb, false);
 		if (ret < 0)
 			break;
 		release = true;
diff --git a/tests/golden/enospc b/tests/golden/enospc
new file mode 100644
index 00000000..150e5cf9
--- /dev/null
+++ b/tests/golden/enospc
@@ -0,0 +1,8 @@
+== prepare directories and files
+== fallocate until enospc
+== remove all the files and verify free data blocks
+== make small meta fs
+== create large xattrs until we fill up metadata
+== remove files with xattrs after enospc
+== make sure we can create again
+== cleanup small meta fs
diff --git a/tests/sequence b/tests/sequence
index b39ac824..b97e4847 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -7,6 +7,7 @@ simple-release-extents.sh
 setattr_more.sh
 offline-extent-waiting.sh
 move-blocks.sh
+enospc.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 lock-refleak.sh
diff --git a/tests/tests/enospc.sh b/tests/tests/enospc.sh
new file mode 100644
index 00000000..ab042479
--- /dev/null
+++ b/tests/tests/enospc.sh
@@ -0,0 +1,100 @@
+#
+# test hitting enospc by filling with data or metadata and
+# then recovering by removing what we filled.
+#
+
+#     Type Size    Total     Used     Free  Use%
+#MetaData  64KB  1048576    32782  1015794     3
+#     Data  4KB 16777152        0 16777152     0
+free_blocks() {
+	local md="$1"
+	local mnt="$2"
+	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
+}
+
+t_require_commands scoutfs stat fallocate createmany
+
+echo "== prepare directories and files"
+for n in $(t_fs_nrs); do
+	eval path="\$T_D${n}/dir-$n/file-$n"
+	mkdir -p $(dirname $path)
+	touch $path
+done
+sync
+
+echo "== fallocate until enospc"
+before=$(free_blocks Data "$T_M0")
+finished=0
+while [ $finished != 1 ]; do
+	for n in $(t_fs_nrs); do
+		eval path="\$T_D${n}/dir-$n/file-$n"
+		off=$(stat -c "%s" "$path")
+
+		LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
+		err="$?"
+
+		if grep -qi "no space" $T_TMP.fallocate; then
+			finished=1
+			break
+		fi
+		if [ "$err" != "0" ]; then
+			t_fail "fallocate failed with $err"
+		fi
+	done
+done
+
+echo "== remove all the files and verify free data blocks"
+for n in $(t_fs_nrs); do
+	eval dir="\$T_D${n}/dir-$n"
+	rm -rf "$dir"
+done
+sync
+after=$(free_blocks Data "$T_M0")
+# nothing else should be modifying data blocks
+test "$before" == "$after" || \
+	t_fail "$after free data blocks after rm, expected $before"
+
+# XXX this is all pretty manual, would be nice to have helpers
+echo "== make small meta fs"
+# meta device just big enough for reserves and the metadata we'll fill
+scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
+	t_fail "mkfs failed"
+SCR="/mnt/scoutfs.enospc"
+mkdir -p "$SCR"
+mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
+	"$T_EX_DATA_DEV" "$SCR"
+
+echo "== create large xattrs until we fill up metadata"
+mkdir -p "$SCR/xattrs"
+
+for f in $(seq 1 100000); do
+	file="$SCR/xattrs/file-$f"
+	touch "$file"
+
+	LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
+	err="$?"
+
+	if grep -qi "no space" $T_TMP.cxl; then
+		echo "enospc at f $f" >> $T_TMP.cxl
+		break
+	fi
+	if [ "$err" != "0" ]; then
+		t_fail "create_xattr_loop failed with $err"
+	fi
+done
+
+echo "== remove files with xattrs after enospc"
+rm -rf "$SCR/xattrs"
+
+echo "== make sure we can create again"
+file="$SCR/file-after"
+touch $file
+setfattr -n user.scoutfs-enospc -v 1 "$file"
+sync
+rm -f "$file"
+
+echo "== cleanup small meta fs"
+umount "$SCR"
+rmdir "$SCR"
+
+t_pass
diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8
index 09062fb0..abf815dd 100644
--- a/utils/man/scoutfs.8
+++ b/utils/man/scoutfs.8
@@ -36,6 +36,11 @@ A path within a ScoutFS filesystem.
 .sp
 Initialize a new ScoutFS filesystem on the target devices.
 Since ScoutFS uses separate block devices for its metadata and data storage, two are required.
+The internal structures and nature of metadata and data transactions
+lead to minimum viable device sizes.
+.B mkfs
+will check both devices and fail with an error if either is under the
+minimum size.
 .sp
 If
 .B --force
diff --git a/utils/src/df.c b/utils/src/df.c
index 21ea9f04..585d658c 100644
--- a/utils/src/df.c
+++ b/utils/src/df.c
@@ -86,6 +86,11 @@ static int do_df(struct df_args *args)
 			data_free += ade[i].blocks;
 	}
 
+	if (meta_free >= sfm.reserved_meta_blocks)
+		meta_free -= sfm.reserved_meta_blocks;
+	else
+		meta_free = 0;
+
 	snprintf(cells[0][0], CHARS, "Type");
 	snprintf(cells[0][1], CHARS, "Size");
 	snprintf(cells[0][2], CHARS, "Total");
diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c
index bcf07357..0abc9086 100644
--- a/utils/src/mkfs.c
+++ b/utils/src/mkfs.c
@@ -215,12 +215,14 @@ static int do_mkfs(struct mkfs_args *args)
 		goto out;
 	}
 
-	ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
+	/* minimum meta device size to make reserved blocks reasonably large */
+	ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
 			  args->max_meta_size, "meta", &meta_size);
 	if (ret)
 		goto out;
 
-	ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
+	/* .. then arbitrarily the same minimum data device size */
+	ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
 			  args->max_data_size, "data", &data_size);
 	if (ret)
 		goto out;
diff --git a/utils/src/print.c b/utils/src/print.c
index f4b51277..4c79a5fb 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
 	le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)
 
 #define AL_HEAD_F \
-	AL_REF_F" total_nr %llu first_nr %u"
+	AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
 #define AL_HEAD_A(p) \
 	AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
-	le32_to_cpu((p)->first_nr)
+	le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)
 
 #define ALCROOT_F \
-	BTROOT_F" total_len %llu"
+	BTROOT_F" total_len %llu flags 0x%x"
#define ALCROOT_A(ar) \
-	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
+	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)
 
 #define SRE_FMT "%016llx.%llu.%llu"
 #define SRE_A(sre) \