From 73bf9161825eb9623817cc7016bb904269df4803 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 23 Jun 2021 10:49:40 -0700 Subject: [PATCH] Return ENOSPC as space gets low Returning ENOSPC is challenging because we have clients working on allocators which are a fraction of the whole, and we use COW transactions so we need to be able to allocate in order to free. This adds support for returning ENOSPC to client posix allocators as free space gets low. For metadata, we reserve a number of free blocks for making progress with client and server transactions which can free space. The server sets the low flag in a client's allocator if we start to dip into reserved blocks. In the client we add an argument to entering a transaction which indicates if we're allocating new space (as opposed to just modifying existing data or freeing). When an allocating transaction runs low and the server's low flag is set, we return ENOSPC. Adding an argument to transaction holders and having it return ENOSPC gave us the opportunity to clean it up and make it a little clearer. More work is done outside the wait_event function, and when it forces a commit it now specifically waits for a transaction to cycle rather than spinning until the transaction worker acquires the lock and stops it. For data the same pattern applies, except there are no reserved blocks and we don't COW data, so it's a simple case of returning a hard ENOSPC when the data allocator's low flag is set. The server needs to consider the reserved count when refilling the client's meta_avail allocator and when swapping between the two meta_avail and meta_free allocators. We add the reserved metadata block count to statfs_more so that df can subtract it from the free meta blocks and make it clear when enospc is going to be returned for metadata allocations. We increase the minimum device size in mkfs so that small testing devices provide sufficient reserved blocks. And finally we add a little test that makes sure we can fill both metadata and data to ENOSPC and then recover by deleting what we filled. Signed-off-by: Zach Brown --- kmod/src/alloc.c | 36 ++++++++--- kmod/src/alloc.h | 17 +++--- kmod/src/data.c | 26 ++++---- kmod/src/data.h | 2 +- kmod/src/dir.c | 8 +-- kmod/src/format.h | 8 ++- kmod/src/inode.c | 16 ++--- kmod/src/inode.h | 4 +- kmod/src/ioctl.c | 4 +- kmod/src/ioctl.h | 1 + kmod/src/scoutfs_trace.h | 19 +++--- kmod/src/server.c | 129 +++++++++++++++++++++++++++++++++------ kmod/src/server.h | 2 + kmod/src/trans.c | 98 ++++++++++++++++++----------- kmod/src/trans.h | 7 +-- kmod/src/xattr.c | 4 +- tests/golden/enospc | 8 +++ tests/sequence | 1 + tests/tests/enospc.sh | 100 ++++++++++++++++++++++++++++++ utils/man/scoutfs.8 | 5 ++ utils/src/df.c | 5 ++ utils/src/mkfs.c | 6 +- utils/src/print.c | 8 +-- 23 files changed, 389 insertions(+), 125 deletions(-) create mode 100644 tests/golden/enospc create mode 100644 tests/tests/enospc.sh diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index d556112e..ac9601b8 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb, * * Unlike meta allocations, the caller is expected to serialize * allocations from the root. + * + * ENOBUFS is returned if the data allocator ran out of space and we can + * probably refill it from the server. The caller is expected to back + * out, commit the transaction, and try again. 
+ * + * ENOSPC is returned if the data allocator ran out of space but we have + * a flag from the server telling us that there's no more space + * available. This is a hard error and should be returned. */ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, @@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc, ret = 0; out: if (ret < 0) { - /* - * Special retval meaning there wasn't space to alloc from - * this txn. Doesn't mean filesystem is completely full. - * Maybe upper layers want to try again. - */ - if (ret == -ENOENT) - ret = -ENOBUFS; + if (ret == -ENOENT) { + if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW) + ret = -ENOSPC; + else + ret = -ENOBUFS; + } + *blkno_ret = 0; *count_ret = 0; } else { @@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb, return lo; } +bool scoutfs_alloc_test_flag(struct super_block *sb, + struct scoutfs_alloc *alloc, u32 flag) +{ + unsigned int seq; + bool set; + + do { + seq = read_seqbegin(&alloc->seqlock); + set = !!(le32_to_cpu(alloc->avail.flags) & flag); + } while (read_seqretry(&alloc->seqlock, seq)); + + return set; +} + /* * Call the callers callback for every persistent allocator structure * we can find. diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 9130d086..5a95d98c 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -38,6 +38,10 @@ #define SCOUTFS_ALLOC_DATA_LG_THRESH \ (8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) +/* the client will force commits if data allocators get too low */ +#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \ + ((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT) + /* * Fill client alloc roots to the target when they fall below the lo * threshold. @@ -55,6 +59,7 @@ #define SCOUTFS_SERVER_DATA_FILL_LO \ (1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) + /* * Log merge meta allocations are only used for one request and will * never use more than the dirty limit. @@ -65,16 +70,6 @@ ((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4) #define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET -/* - * Each of the server meta_alloc roots will try to keep a minimum amount - * of free blocks. The server will swap roots when its current avail - * falls below the threshold while the freed root is still above it. It - * must have room for all the largest allocation attempted in a - * transaction on the server. - */ -#define SCOUTFS_SERVER_META_ALLOC_MIN \ - (SCOUTFS_SERVER_META_FILL_TARGET * 2) - /* * A run-time use of a pair of persistent avail/freed roots as a * metadata allocator. 
It has the machinery needed to lock and avoid @@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb, bool scoutfs_alloc_meta_low(struct super_block *sb, struct scoutfs_alloc *alloc, u32 nr); +bool scoutfs_alloc_test_flag(struct super_block *sb, + struct scoutfs_alloc *alloc, u32 flag); typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg, int owner, u64 id, diff --git a/kmod/src/data.c b/kmod/src/data.c index caf26657..4d710496 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, while (iblock <= last) { if (inode) - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, - true); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); else - ret = scoutfs_hold_trans(sb); + ret = scoutfs_hold_trans(sb, false); if (ret) break; @@ -756,8 +755,7 @@ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode, true) ?: - scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, - ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true); } while (ret > 0); if (ret < 0) goto out; @@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) while(iblock <= last) { - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret) goto out; @@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, } /* we're updating meta_seq with offline block count */ - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret < 0) goto out; @@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off, ret = scoutfs_inode_index_start(sb, &seq) ?: scoutfs_inode_index_prepare(sb, &locks, from, true) ?: scoutfs_inode_index_prepare(sb, &locks, to, true) ?: - scoutfs_inode_index_try_lock_hold(sb, &locks, seq); + scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false); if (ret > 0) continue; if (ret < 0) @@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb) return ret; } -u64 scoutfs_data_alloc_free_bytes(struct super_block *sb) +/* + * Return true if the data allocator is lower than the caller's + * requirement and we haven't been told by the server that we're out of + * free extents. 
+ */ +bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks) { DECLARE_DATA_INFO(sb, datinf); - return scoutfs_dalloc_total_len(&datinf->dalloc) << - SCOUTFS_BLOCK_SM_SHIFT; - + return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) && + !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW); } int scoutfs_data_setup(struct super_block *sb) diff --git a/kmod/src/data.h b/kmod/src/data.h index 4f51a8c2..064564f6 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb, void scoutfs_data_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); int scoutfs_data_prepare_commit(struct super_block *sb); -u64 scoutfs_data_alloc_free_bytes(struct super_block *sb); +bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 79a276da..c6eb331d 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -712,7 +712,7 @@ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?: scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?: - scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -869,7 +869,7 @@ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -969,7 +969,7 @@ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; if (ret) @@ -1641,7 +1641,7 @@ retry: scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?: (new_inode == NULL ? 
0 : scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) diff --git a/kmod/src/format.h b/kmod/src/format.h index 654da558..fb6c1f4f 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head { struct scoutfs_block_ref ref; __le64 total_nr; __le32 first_nr; - __u8 __pad[4]; + __le32 flags; }; + /* * While the main allocator uses extent items in btree blocks, metadata * allocations for a single transaction are recorded in arrays in @@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block { */ struct scoutfs_alloc_root { __le64 total_len; + __le32 flags; + __le32 _pad; struct scoutfs_btree_root root; }; +/* Shared by _alloc_list_head and _alloc_root */ +#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0) + /* types of allocators, exposed to alloc_detail ioctl */ #define SCOUTFS_ALLOC_OWNER_NONE 0 #define SCOUTFS_ALLOC_OWNER_SERVER 1 diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 3911b74e..cfabc332 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -358,7 +358,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, if (!S_ISREG(inode->i_mode)) return 0; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); if (ret) return ret; @@ -385,7 +385,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock) LIST_HEAD(ind_locks); int ret; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false); if (ret) return ret; @@ -500,7 +500,7 @@ retry: } } - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false); if (ret) goto out; @@ -1213,7 +1213,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq) * Returns > 0 if the seq changed and the locks should be retried. 
*/ int scoutfs_inode_index_try_lock_hold(struct super_block *sb, - struct list_head *list, u64 seq) + struct list_head *list, u64 seq, bool allocing) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct index_lock *ind_lock; @@ -1229,7 +1229,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb, goto out; } - ret = scoutfs_hold_trans(sb); + ret = scoutfs_hold_trans(sb, allocing); if (ret == 0 && seq != sbi->trans_seq) { scoutfs_release_trans(sb); ret = 1; @@ -1243,7 +1243,7 @@ out: } int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, - bool set_data_seq) + bool set_data_seq, bool allocing) { struct super_block *sb = inode->i_sb; int ret; @@ -1253,7 +1253,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, ret = scoutfs_inode_index_start(sb, &seq) ?: scoutfs_inode_index_prepare(sb, list, inode, set_data_seq) ?: - scoutfs_inode_index_try_lock_hold(sb, list, seq); + scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing); } while (ret > 0); return ret; @@ -1533,7 +1533,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; if (ret) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 805eb237..7cb61b57 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -88,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb, struct list_head *list, u64 ino, umode_t mode); int scoutfs_inode_index_try_lock_hold(struct super_block *sb, - struct list_head *list, u64 seq); + struct list_head *list, u64 seq, bool allocing); int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, - bool set_data_seq); + bool set_data_seq, bool allocing); void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list); int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock); diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index b323b9a1..cb3f4a4e 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -38,6 +38,7 @@ #include "hash.h" #include "srch.h" #include "alloc.h" +#include "server.h" #include "scoutfs_trace.h" /* @@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg) /* setting only so we don't see 0 data seq with nonzero data_version */ set_data_seq = sm.data_version != 0 ? 
true : false; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false); if (ret) goto unlock; @@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg) sfm.rid = sbi->rid; sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks); sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks); + sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb); ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq); if (ret) diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 5042edfe..446611e9 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more { __u64 committed_seq; __u64 total_meta_blocks; __u64 total_data_blocks; + __u64 reserved_meta_blocks; }; #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \ diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index bc9c4797..b92471fd 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func, ); DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), - TP_ARGS(sb, journal_info, holders), + TP_ARGS(sb, journal_info, holders, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(unsigned long, journal_info) __field(int, holders) + __field(int, ret) ), TP_fast_assign( @@ -440,17 +441,17 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class, __entry->holders = holders; ), - TP_printk(SCSBF" journal_info 0x%0lx holders %d", - SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders) + TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d", + SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret) ); -DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), - TP_ARGS(sb, journal_info, holders) +DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans, + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), + TP_ARGS(sb, journal_info, holders, ret) ); DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), - TP_ARGS(sb, journal_info, holders) + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), + TP_ARGS(sb, journal_info, holders, ret) ); TRACE_EVENT(scoutfs_ioc_release, diff --git a/kmod/src/server.c b/kmod/src/server.c index 9e8307b8..0bcabc45 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work) struct commit_waiter *cw; struct commit_waiter *pos; struct llist_node *node; + u64 reserved; int ret; trace_scoutfs_server_commit_work_enter(sb, 0, 0); @@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work) server->other_avail = &super->server_meta_avail[server->other_ind]; server->other_freed = &super->server_meta_freed[server->other_ind]; - /* swap avail/free if avail gets low and freed is high */ - if (le64_to_cpu(server->meta_avail->total_len) <= - SCOUTFS_SERVER_META_ALLOC_MIN && - le64_to_cpu(server->meta_freed->total_len) > - SCOUTFS_SERVER_META_ALLOC_MIN) + /* + * The reserved metadata blocks includes the max size of + * outstanding allocators and a server transaction could 
be + * asked to refill all those allocators from meta_avail. If our + * meta_avail falls below the reserved count, and freed is still + * above it, then swap so that we don't start returning enospc + * until we're truly low. + */ + reserved = scoutfs_server_reserved_meta_blocks(sb); + if (le64_to_cpu(server->meta_avail->total_len) <= reserved && + le64_to_cpu(server->meta_freed->total_len) > reserved) swap(server->meta_avail, server->meta_freed); ret = 0; @@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb, dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0); } +/* + * Copy on write transactions need to allocate new dirty blocks as they + * make modifications to delete items and eventually free more blocks. + * The reserved blocks are meant to keep enough available blocks in + * flight to allow servers and clients to perform transactions that + * don't consume additional space. We have quite a few allocators in + * flight across the server and various client mechanisms (posix items, + * srch compaction, and log merging). We also want to include + * sufficient blocks for client log btrees to grow tall enough to be + * finalized and merges. + * + * The reserved blocks calculation is a policy of the server but it's + * exposed to the statfs_more interface so that df isn't misleading. + * Requiring this synchronization without explicit protocol + * communication isn't great. + */ +u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + u64 server_blocks; + u64 client_blocks; + u64 log_blocks; + u64 nr_clients; + + /* server has two meta_avail lists it swaps between */ + server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2; + + /* + * Log trees will be compacted once they hit a height of 3. + * That'll be the grandparent, two parents resulting from a + * split, and all their child blocks (roughly calculated, + * overestimating). + */ + log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE / + (sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref))); + + /* + * Each client can have a meta_avail list, srch compaction + * request, log merge request, and a log btree it's building. + */ + client_blocks = SCOUTFS_SERVER_META_FILL_TARGET + SCOUTFS_SERVER_META_FILL_TARGET + + SCOUTFS_SERVER_MERGE_FILL_TARGET + log_blocks; + + /* we should reserve for voting majority, too */ + spin_lock(&server->lock); + nr_clients = server->nr_clients; + spin_unlock(&server->lock); + + return server_blocks + (max(1ULL, nr_clients) * client_blocks); +} + /* * Set all the bits in the destination which overlap with the extent. */ @@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb, struct scoutfs_log_trees lt; struct scoutfs_key key; bool have_fin = false; + bool unlock_alloc = false; u64 data_zone_blocks; u64 nr; int ret; @@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb, lt.nr = cpu_to_le64(nr); } - /* finalize an existing root when large enough and don't have one */ - if (lt.item_root.height > 2 && !have_fin) { + /* + * Finalize the client log btree when it has enough leaf blocks + * to allow some degree of merging concurrency. Smaller btrees + * are also finalized when meta was low so that deleted items + * are merged promptly and freed blocks can bring the client out + * of enospc. 
+ */ + if (!have_fin && ((lt.item_root.height > 2) || + (le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) { fin = lt; memset(&fin.meta_avail, 0, sizeof(fin.meta_avail)); memset(&fin.meta_freed, 0, sizeof(fin.meta_freed)); @@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb, data_zone_blocks = 0; } - /* return freed to server for emptying, refill avail */ + /* + * Reclaim the freed meta and data allocators and refill the + * avail allocators, setting low flags if they drop too low. + */ mutex_lock(&server->alloc_mutex); - ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, - server->other_freed, + unlock_alloc = true; + + ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed, <.meta_freed) ?: - alloc_move_empty(sb, &super->data_alloc, <.data_freed) ?: - scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, - <.meta_avail, server->meta_avail, - SCOUTFS_SERVER_META_FILL_LO, - SCOUTFS_SERVER_META_FILL_TARGET) ?: - alloc_move_refill_zoned(sb, <.data_avail, &super->data_alloc, - SCOUTFS_SERVER_DATA_FILL_LO, - SCOUTFS_SERVER_DATA_FILL_TARGET, - exclusive, vacant, data_zone_blocks); - mutex_unlock(&server->alloc_mutex); + alloc_move_empty(sb, &super->data_alloc, <.data_freed); if (ret < 0) goto unlock; + ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri, + <.meta_avail, server->meta_avail, + SCOUTFS_SERVER_META_FILL_LO, + SCOUTFS_SERVER_META_FILL_TARGET); + if (ret < 0) + goto unlock; + + if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb)) + lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW); + else + lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW); + + ret = alloc_move_refill_zoned(sb, <.data_avail, &super->data_alloc, + SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET, + exclusive, vacant, data_zone_blocks); + if (ret < 0) + goto unlock; + + if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO) + lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW); + else + lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW); + + mutex_unlock(&server->alloc_mutex); + unlock_alloc = false; + /* record data alloc zone bits */ zero_data_alloc_zone_bits(<); if (data_zone_blocks != 0) { @@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb, ret = scoutfs_btree_force(sb, &server->alloc, &server->wri, &super->logs_root, &key, <, sizeof(lt)); unlock: + if (unlock_alloc) + mutex_unlock(&server->alloc_mutex); mutex_unlock(&server->logs_mutex); ret = scoutfs_server_apply_commit(sb, ret); diff --git a/kmod/src/server.h b/kmod/src/server.h index 79fcb443..41b808e7 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -56,6 +56,8 @@ do { \ __entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \ __entry->name##_error +u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb); + int scoutfs_server_lock_request(struct super_block *sb, u64 rid, struct scoutfs_net_lock *nl); int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id, diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 07eea0fa..9417e39a 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -436,8 +436,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri) return true; } - /* Try to refill data allocator before premature enospc */ - if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) { + /* if we're low and can't refill then alloc could empty and return 
enospc */ + if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) { scoutfs_inc_counter(sb, trans_commit_data_alloc_low); return true; } @@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri) return false; } -static bool acquired_hold(struct super_block *sb) +/* + * called as a wait_event condition, needs to be careful to not change + * task state and is racing with waking paths that sub_return, test, and + * wake. + */ +static bool holders_no_writer(struct trans_info *tri) { - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - DECLARE_TRANS_INFO(sb, tri); - bool acquired; - - /* if a caller already has a hold we acquire unconditionally */ - if (inc_journal_info_holders()) { - atomic_inc(&tri->holders); - acquired = true; - goto out; - } - - /* wait if the writer is blocking holds */ - if (!inc_holders_unless_writer(tri)) { - dec_journal_info_holders(); - acquired = false; - goto out; - } - - /* wait if we're triggering another commit */ - if (commit_before_hold(sb, tri)) { - release_holders(sb); - queue_trans_work(sbi); - acquired = false; - goto out; - } - - trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders)); - acquired = true; -out: - return acquired; + smp_mb(); /* make sure task in wait_event queue before atomic read */ + return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT); } /* @@ -492,15 +469,64 @@ out: * The writing thread marks itself as a global trans_task which * short-circuits all the hold machinery so it can call code that would * otherwise try to hold transactions while it is writing. + * + * If the caller is adding metadata items that will eventually consume + * free space -- not dirtying existing items or adding deletion items -- + * then we can return enospc if our metadata allocator indicates that + * we're low on space. 
*/ -int scoutfs_hold_trans(struct super_block *sb) +int scoutfs_hold_trans(struct super_block *sb, bool allocing) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + DECLARE_TRANS_INFO(sb, tri); + u64 seq; + int ret; if (current == sbi->trans_task) return 0; - return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb)); + for (;;) { + /* if a caller already has a hold we acquire unconditionally */ + if (inc_journal_info_holders()) { + atomic_inc(&tri->holders); + ret = 0; + break; + } + + /* wait until the writer work is finished */ + if (!inc_holders_unless_writer(tri)) { + dec_journal_info_holders(); + ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri)); + if (ret < 0) + break; + continue; + } + + /* return enospc if server is into reserved blocks and we're allocating */ + if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) { + release_holders(sb); + ret = -ENOSPC; + break; + } + + /* see if we need to trigger and wait for a commit before holding */ + if (commit_before_hold(sb, tri)) { + seq = scoutfs_trans_sample_seq(sb); + release_holders(sb); + queue_trans_work(sbi); + ret = wait_event_interruptible(sbi->trans_hold_wq, + scoutfs_trans_sample_seq(sb) != seq); + if (ret < 0) + break; + continue; + } + + ret = 0; + break; + } + + trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret); + return ret; } /* @@ -525,7 +551,7 @@ void scoutfs_release_trans(struct super_block *sb) release_holders(sb); - trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders)); + trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0); } /* diff --git a/kmod/src/trans.h b/kmod/src/trans.h index ab42a4cf..51ae1232 100644 --- a/kmod/src/trans.h +++ b/kmod/src/trans.h @@ -1,18 +1,13 @@ #ifndef _SCOUTFS_TRANS_H_ #define _SCOUTFS_TRANS_H_ -/* the server will attempt to fill data allocs for each trans */ -#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024) -/* the client will force commits if data allocators get too low */ -#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024) - void scoutfs_trans_write_func(struct work_struct *work); int scoutfs_trans_sync(struct super_block *sb, int wait); int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); void scoutfs_trans_restart_sync_deadline(struct super_block *sb); -int scoutfs_hold_trans(struct super_block *sb); +int scoutfs_hold_trans(struct super_block *sb, bool allocing); bool scoutfs_trans_held(void); void scoutfs_release_trans(struct super_block *sb); u64 scoutfs_trans_sample_seq(struct super_block *sb); diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index 6c00c0c5..fd8acd8e 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, &tgs) != 0) memset(&tgs, 0, sizeof(tgs)); - ret = scoutfs_hold_trans(sb); + ret = scoutfs_hold_trans(sb, false); if (ret < 0) break; release = true; diff --git a/tests/golden/enospc b/tests/golden/enospc new file mode 100644 index 00000000..150e5cf9 --- /dev/null +++ b/tests/golden/enospc @@ -0,0 +1,8 @@ +== 
prepare directories and files +== fallocate until enospc +== remove all the files and verify free data blocks +== make small meta fs +== create large xattrs until we fill up metadata +== remove files with xattrs after enospc +== make sure we can create again +== cleanup small meta fs diff --git a/tests/sequence b/tests/sequence index b39ac824..b97e4847 100644 --- a/tests/sequence +++ b/tests/sequence @@ -7,6 +7,7 @@ simple-release-extents.sh setattr_more.sh offline-extent-waiting.sh move-blocks.sh +enospc.sh srch-basic-functionality.sh simple-xattr-unit.sh lock-refleak.sh diff --git a/tests/tests/enospc.sh b/tests/tests/enospc.sh new file mode 100644 index 00000000..ab042479 --- /dev/null +++ b/tests/tests/enospc.sh @@ -0,0 +1,100 @@ +# +# test hititng enospc by filling with data or metadata and +# then recovering by removing what we filled. +# + +# Type Size Total Used Free Use% +#MetaData 64KB 1048576 32782 1015794 3 +# Data 4KB 16777152 0 16777152 0 +free_blocks() { + local md="$1" + local mnt="$2" + scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }' +} + +t_require_commands scoutfs stat fallocate createmany + +echo "== prepare directories and files" +for n in $(t_fs_nrs); do + eval path="\$T_D${n}/dir-$n/file-$n" + mkdir -p $(dirname $path) + touch $path +done +sync + +echo "== fallocate until enospc" +before=$(free_blocks Data "$T_M0") +finished=0 +while [ $finished != 1 ]; do + for n in $(t_fs_nrs); do + eval path="\$T_D${n}/dir-$n/file-$n" + off=$(stat -c "%s" "$path") + + LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1 + err="$?" + + if grep -qi "no space" $T_TMP.fallocate; then + finished=1 + break + fi + if [ "$err" != "0" ]; then + t_fail "fallocate failed with $err" + fi + done +done + +echo "== remove all the files and verify free data blocks" +for n in $(t_fs_nrs); do + eval dir="\$T_D${n}/dir-$n" + rm -rf "$dir" +done +sync +after=$(free_blocks Data "$T_M0") +# nothing else should be modifying data blocks +test "$before" == "$after" || \ + t_fail "$after free data blocks after rm, expected $before" + +# XXX this is all pretty manual, would be nice to have helpers +echo "== make small meta fs" +# meta device just big enough for reserves and the metadata we'll fill +scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \ + t_fail "mkfs failed" +SCR="/mnt/scoutfs.enospc" +mkdir -p "$SCR" +mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \ + "$T_EX_DATA_DEV" "$SCR" + +echo "== create large xattrs until we fill up metadata" +mkdir -p "$SCR/xattrs" + +for f in $(seq 1 100000); do + file="$SCR/xattrs/file-$f" + touch "$file" + + LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1 + err="$?" + + if grep -qi "no space" $T_TMP.cxl; then + echo "enospc at f $f" >> $T_TMP.cxl + break + fi + if [ "$err" != "0" ]; then + t_fail "create_xattr_loop failed with $err" + fi +done + +echo "== remove files with xattrs after enospc" +rm -rf "$SCR/xattrs" + +echo "== make sure we can create again" +file="$SCR/file-after" +touch $file +setfattr -n user.scoutfs-enospc -v 1 "$file" +sync +rm -f "$file" + +echo "== cleanup small meta fs" +umount "$SCR" +rmdir "$SCR" + +t_pass diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index 09062fb0..abf815dd 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -36,6 +36,11 @@ A path within a ScoutFS filesystem. .sp Initialize a new ScoutFS filesystem on the target devices. 
Since ScoutFS uses separate block devices for its metadata and data storage, two are required. +The internal structures and nature of metadata and data transactions +lead to minimum viable device sizes. +.B mkfs +will check both devices and fail with an error if either are under the +minimum size. .sp If .B --force diff --git a/utils/src/df.c b/utils/src/df.c index 21ea9f04..585d658c 100644 --- a/utils/src/df.c +++ b/utils/src/df.c @@ -86,6 +86,11 @@ static int do_df(struct df_args *args) data_free += ade[i].blocks; } + if (meta_free >= sfm.reserved_meta_blocks) + meta_free -= sfm.reserved_meta_blocks; + else + meta_free = 0; + snprintf(cells[0][0], CHARS, "Type"); snprintf(cells[0][1], CHARS, "Size"); snprintf(cells[0][2], CHARS, "Total"); diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index bcf07357..0abc9086 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -215,12 +215,14 @@ static int do_mkfs(struct mkfs_args *args) goto out; } - ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024), + /* minumum meta device size to make reserved blocks reasonably large */ + ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024), args->max_meta_size, "meta", &meta_size); if (ret) goto out; - ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024), + /* .. then arbitrarily the same minimum data device size */ + ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024), args->max_data_size, "data", &data_size); if (ret) goto out; diff --git a/utils/src/print.c b/utils/src/print.c index f4b51277..4c79a5fb 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val, le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq) #define AL_HEAD_F \ - AL_REF_F" total_nr %llu first_nr %u" + AL_REF_F" total_nr %llu first_nr %u flags 0x%x" #define AL_HEAD_A(p) \ AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\ - le32_to_cpu((p)->first_nr) + le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags) #define ALCROOT_F \ - BTROOT_F" total_len %llu" + BTROOT_F" total_len %llu flags 0x%x" #define ALCROOT_A(ar) \ - BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len) + BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags) #define SRE_FMT "%016llx.%llu.%llu" #define SRE_A(sre) \
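
A usage note on the new statfs_more field: the sketch below shows roughly how a userspace tool could read reserved_meta_blocks and report it alongside the metadata total, in the same spirit as the df change above, which subtracts the reserved count from free metadata blocks before printing. This is illustrative only and not part of the patch; the "ioctl.h" include path, the assumption that any descriptor inside the mount works, the omission of any size/validity field initialization, and the minimal error handling are all assumptions -- utils/src/df.c is the authoritative example of driving SCOUTFS_IOC_STATFS_MORE.

/* sketch only: hypothetical helper, not part of this patch */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include "ioctl.h"	/* assumed path to struct scoutfs_ioctl_statfs_more and SCOUTFS_IOC_STATFS_MORE */

int main(int argc, char **argv)
{
	struct scoutfs_ioctl_statfs_more sfm;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path-in-scoutfs-mount>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* the real interface may expect size/validity fields to be set first;
	 * see utils/src/df.c for the exact call sequence */
	memset(&sfm, 0, sizeof(sfm));
	if (ioctl(fd, SCOUTFS_IOC_STATFS_MORE, &sfm) < 0) {
		perror("SCOUTFS_IOC_STATFS_MORE");
		close(fd);
		return 1;
	}

	/* df subtracts the reserved count from free meta blocks, clamping at 0 */
	printf("total_meta_blocks:    %llu\n",
	       (unsigned long long)sfm.total_meta_blocks);
	printf("reserved_meta_blocks: %llu\n",
	       (unsigned long long)sfm.reserved_meta_blocks);

	close(fd);
	return 0;
}

Building something like this against the scoutfs utils tree keeps the ioctl definitions in sync with the kernel module, which matters now that the struct has grown a field.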