diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index d556112e..ac9601b8 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb, * * Unlike meta allocations, the caller is expected to serialize * allocations from the root. + * + * ENOBUFS is returned if the data allocator ran out of space and we can + * probably refill it from the server. The caller is expected to back + * out, commit the transaction, and try again. + * + * ENOSPC is returned if the data allocator ran out of space but we have + * a flag from the server telling us that there's no more space + * available. This is a hard error and should be returned. */ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, @@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc, ret = 0; out: if (ret < 0) { - /* - * Special retval meaning there wasn't space to alloc from - * this txn. Doesn't mean filesystem is completely full. - * Maybe upper layers want to try again. - */ - if (ret == -ENOENT) - ret = -ENOBUFS; + if (ret == -ENOENT) { + if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW) + ret = -ENOSPC; + else + ret = -ENOBUFS; + } + *blkno_ret = 0; *count_ret = 0; } else { @@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb, return lo; } +bool scoutfs_alloc_test_flag(struct super_block *sb, + struct scoutfs_alloc *alloc, u32 flag) +{ + unsigned int seq; + bool set; + + do { + seq = read_seqbegin(&alloc->seqlock); + set = !!(le32_to_cpu(alloc->avail.flags) & flag); + } while (read_seqretry(&alloc->seqlock, seq)); + + return set; +} + /* * Call the callers callback for every persistent allocator structure * we can find. diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 9130d086..5a95d98c 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -38,6 +38,10 @@ #define SCOUTFS_ALLOC_DATA_LG_THRESH \ (8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) +/* the client will force commits if data allocators get too low */ +#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \ + ((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT) + /* * Fill client alloc roots to the target when they fall below the lo * threshold. @@ -55,6 +59,7 @@ #define SCOUTFS_SERVER_DATA_FILL_LO \ (1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT) + /* * Log merge meta allocations are only used for one request and will * never use more than the dirty limit. @@ -65,16 +70,6 @@ ((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4) #define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET -/* - * Each of the server meta_alloc roots will try to keep a minimum amount - * of free blocks. The server will swap roots when its current avail - * falls below the threshold while the freed root is still above it. It - * must have room for all the largest allocation attempted in a - * transaction on the server. - */ -#define SCOUTFS_SERVER_META_ALLOC_MIN \ - (SCOUTFS_SERVER_META_FILL_TARGET * 2) - /* * A run-time use of a pair of persistent avail/freed roots as a * metadata allocator. 
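 *
 * (An aside on scoutfs_alloc_data() above: a minimal sketch of its
 * retry contract, assuming the remaining arguments follow the
 * truncated prototype in the hunk; the commit helper named here is
 * hypothetical, not part of this patch.)
 *
 *	do {
 *		ret = scoutfs_alloc_data(sb, alloc, wri, dalloc, count,
 *					 &blkno, &count_ret);
 *		if (ret != -ENOBUFS)
 *			break;	(success, hard -ENOSPC, or another error)
 *		ret = commit_and_reopen_trans(sb);	(hypothetical: back out,
 *							 commit, let the server refill)
 *	} while (ret == 0);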
It has the machinery needed to lock and avoid @@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb, bool scoutfs_alloc_meta_low(struct super_block *sb, struct scoutfs_alloc *alloc, u32 nr); +bool scoutfs_alloc_test_flag(struct super_block *sb, + struct scoutfs_alloc *alloc, u32 flag); typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg, int owner, u64 id, diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 9e9e9f5e..e12c58d7 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -88,6 +88,7 @@ EXPAND_COUNTER(forest_read_items) \ EXPAND_COUNTER(forest_roots_next_hint) \ EXPAND_COUNTER(forest_set_bloom_bits) \ + EXPAND_COUNTER(inode_evict_intr) \ EXPAND_COUNTER(item_clear_dirty) \ EXPAND_COUNTER(item_create) \ EXPAND_COUNTER(item_delete) \ @@ -151,6 +152,12 @@ EXPAND_COUNTER(net_recv_invalid_message) \ EXPAND_COUNTER(net_recv_messages) \ EXPAND_COUNTER(net_unknown_request) \ + EXPAND_COUNTER(orphan_scan) \ + EXPAND_COUNTER(orphan_scan_cached) \ + EXPAND_COUNTER(orphan_scan_error) \ + EXPAND_COUNTER(orphan_scan_item) \ + EXPAND_COUNTER(orphan_scan_omap_set) \ + EXPAND_COUNTER(orphan_scan_read) \ EXPAND_COUNTER(quorum_elected) \ EXPAND_COUNTER(quorum_fence_error) \ EXPAND_COUNTER(quorum_fence_leader) \ diff --git a/kmod/src/data.c b/kmod/src/data.c index caf26657..4d710496 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, while (iblock <= last) { if (inode) - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, - true); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); else - ret = scoutfs_hold_trans(sb); + ret = scoutfs_hold_trans(sb, false); if (ret) break; @@ -756,8 +755,7 @@ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode, true) ?: - scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, - ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true); } while (ret > 0); if (ret < 0) goto out; @@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) while(iblock <= last) { - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret) goto out; @@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, } /* we're updating meta_seq with offline block count */ - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret < 0) goto out; @@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off, ret = scoutfs_inode_index_start(sb, &seq) ?: scoutfs_inode_index_prepare(sb, &locks, from, true) ?: scoutfs_inode_index_prepare(sb, &locks, to, true) ?: - scoutfs_inode_index_try_lock_hold(sb, &locks, seq); + scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false); if (ret > 0) continue; if (ret < 0) @@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb) return ret; } -u64 scoutfs_data_alloc_free_bytes(struct super_block *sb) +/* + * Return true if the data allocator is lower than the caller's + * requirement and we haven't been told by the server that we're out of + * free extents. 
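+ *
+ * For example (an illustrative caller, not from this patch), the
+ * transaction machinery could pair this check with the
+ * SCOUTFS_ALLOC_DATA_REFILL_THRESH added above to force an early
+ * commit before writers start seeing ENOBUFS:
+ *
+ *	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH))
+ *		queue_trans_commit_work(sb);	(hypothetical helper)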
+ */ +bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks) { DECLARE_DATA_INFO(sb, datinf); - return scoutfs_dalloc_total_len(&datinf->dalloc) << - SCOUTFS_BLOCK_SM_SHIFT; - + return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) && + !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW); } int scoutfs_data_setup(struct super_block *sb) diff --git a/kmod/src/data.h b/kmod/src/data.h index 4f51a8c2..064564f6 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb, void scoutfs_data_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); int scoutfs_data_prepare_commit(struct super_block *sb); -u64 scoutfs_data_alloc_free_bytes(struct super_block *sb); +bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 2223a4ab..c6eb331d 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev, struct scoutfs_lock **dir_lock, struct scoutfs_lock **inode_lock, + struct scoutfs_lock **orph_lock, struct list_head *ind_locks) { struct super_block *sb = dir->i_sb; @@ -701,11 +702,17 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry, if (ret) goto out_unlock; + if (orph_lock) { + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock); + if (ret < 0) + goto out_unlock; + } + retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?: scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?: - scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -725,9 +732,13 @@ out_unlock: if (ret) { scoutfs_inode_index_unlock(sb, ind_locks); scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE); - scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE); *dir_lock = NULL; + scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE); *inode_lock = NULL; + if (orph_lock) { + scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + *orph_lock = NULL; + } inode = ERR_PTR(ret); } @@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); inode = lock_hold_create(dir, dentry, mode, rdev, - &dir_lock, &inode_lock, &ind_locks); + &dir_lock, &inode_lock, NULL, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry, struct super_block *sb = dir->i_sb; struct scoutfs_lock *dir_lock; struct scoutfs_lock *inode_lock = NULL; + struct scoutfs_lock *orph_lock = NULL; LIST_HEAD(ind_locks); - bool del_orphan; + bool del_orphan = false; u64 dir_size; u64 ind_seq; u64 hash; u64 pos; int ret; + int err; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); @@ -843,13 +856,20 @@ static int scoutfs_link(struct dentry *old_dentry, goto out_unlock; dir_size = i_size_read(dir) + dentry->d_name.len; - del_orphan = (inode->i_nlink == 0); + + if (inode->i_nlink == 0) { + del_orphan = true; + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode), + &orph_lock); + if (ret < 0) + goto out_unlock; + } retry: ret = scoutfs_inode_index_start(sb, 
&ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -860,7 +880,7 @@ retry: goto out; if (del_orphan) { - ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode)); + ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock); if (ret) goto out; } @@ -871,8 +891,13 @@ retry: dentry->d_name.name, dentry->d_name.len, scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); - if (ret) + if (ret) { + if (del_orphan) { + err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */ + } goto out; + } update_dentry_info(sb, dentry, hash, pos, dir_lock); i_size_write(dir, dir_size); @@ -880,11 +903,6 @@ retry: inode->i_ctime = dir->i_mtime; inc_nlink(inode); - if (del_orphan) { - ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode)); - WARN_ON_ONCE(ret); - } - scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); @@ -896,6 +914,8 @@ out_unlock: scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + return ret; } @@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec ts = current_kernel_time(); struct scoutfs_lock *inode_lock = NULL; + struct scoutfs_lock *orph_lock = NULL; struct scoutfs_lock *dir_lock = NULL; LIST_HEAD(ind_locks); u64 ind_seq; @@ -937,32 +958,36 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) goto unlock; } + if (should_orphan(inode)) { + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode), + &orph_lock); + if (ret < 0) + goto unlock; + } + retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; if (ret) goto unlock; + if (should_orphan(inode)) { + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + if (ret < 0) + goto out; + } + ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry), dentry_info_pos(dentry), scoutfs_ino(inode), dir_lock, inode_lock); - if (ret) + if (ret) { + if (should_orphan(inode)) + WARN_ON_ONCE(scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock)); /* should have been dirty */ goto out; - - if (should_orphan(inode)) { - /* - * Insert the orphan item before we modify any inode - * metadata so we can gracefully exit should it - * fail.
- */ - ret = scoutfs_orphan_inode(inode); - WARN_ON_ONCE(ret); /* XXX returning error but items deleted */ - if (ret) - goto out; } dir->i_ctime = ts; @@ -984,6 +1009,7 @@ unlock: scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } @@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry, return ret; inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, - &dir_lock, &inode_lock, &ind_locks); + &dir_lock, &inode_lock, NULL, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct scoutfs_lock *new_dir_lock = NULL; struct scoutfs_lock *old_inode_lock = NULL; struct scoutfs_lock *new_inode_lock = NULL; + struct scoutfs_lock *orph_lock = NULL; struct timespec now; bool ins_new = false; bool del_new = false; @@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; + if (should_orphan(new_inode)) { + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode), + &orph_lock); + if (ret < 0) + goto out_unlock; + } + retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?: @@ -1607,7 +1641,7 @@ retry: scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?: (new_inode == NULL ? 0 : scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) @@ -1658,7 +1692,7 @@ retry: ins_old = true; if (should_orphan(new_inode)) { - ret = scoutfs_orphan_inode(new_inode); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock); if (ret) goto out; } @@ -1762,6 +1796,7 @@ out_unlock: scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } @@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod struct inode *inode = NULL; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; + struct scoutfs_lock *orph_lock = NULL; LIST_HEAD(ind_locks); int ret; @@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod return -ENAMETOOLONG; inode = lock_hold_create(dir, dentry, mode, 0, - &dir_lock, &inode_lock, &ind_locks); + &dir_lock, &inode_lock, &orph_lock, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + if (ret < 0) { + iput(inode); + goto out; /* XXX returning error but items created */ + } + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; insert_inode_hash(inode); + ihold(inode); /* d_tmpfile consumes a ref and we still update the inode below */ d_tmpfile(dentry, inode); scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); scoutfs_inode_index_unlock(sb, &ind_locks); + iput(inode); - ret = scoutfs_orphan_inode(inode); - WARN_ON_ONCE(ret); /* XXX returning error but items deleted */ - +out: scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb,
&ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 37be80a0..a2f555d0 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -758,6 +758,16 @@ out: return 0; } +void scoutfs_forest_stop(struct super_block *sb) +{ + DECLARE_FOREST_INFO(sb, finf); + + if (finf && finf->workq) { + cancel_delayed_work_sync(&finf->log_merge_dwork); + destroy_workqueue(finf->workq); + } +} + void scoutfs_forest_destroy(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); @@ -766,11 +776,6 @@ void scoutfs_forest_destroy(struct super_block *sb) if (finf) { scoutfs_block_put(sb, finf->srch_bl); - if (finf->workq) { - cancel_delayed_work_sync(&finf->log_merge_dwork); - destroy_workqueue(finf->workq); - } - kfree(finf); sbi->forest_info = NULL; } diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 3ca50670..7bd4609e 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -39,6 +39,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); int scoutfs_forest_setup(struct super_block *sb); +void scoutfs_forest_stop(struct super_block *sb); void scoutfs_forest_destroy(struct super_block *sb); #endif diff --git a/kmod/src/format.h b/kmod/src/format.h index af2358a0..fb6c1f4f 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head { struct scoutfs_block_ref ref; __le64 total_nr; __le32 first_nr; - __u8 __pad[4]; + __le32 flags; }; + /* * While the main allocator uses extent items in btree blocks, metadata * allocations for a single transaction are recorded in arrays in @@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block { */ struct scoutfs_alloc_root { __le64 total_len; + __le32 flags; + __le32 _pad; struct scoutfs_btree_root root; }; +/* Shared by _alloc_list_head and _alloc_root */ +#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0) + /* types of allocators, exposed to alloc_detail ioctl */ #define SCOUTFS_ALLOC_OWNER_NONE 0 #define SCOUTFS_ALLOC_OWNER_SERVER 1 @@ -570,7 +576,7 @@ struct scoutfs_log_merge_freeing { * Keys are first sorted by major key zones. 
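 *
 * The orphan items introduced by this patch, for example, sort in their
 * new zone as (SCOUTFS_ORPHAN_ZONE, ino, SCOUTFS_ORPHAN_TYPE) keys,
 * matching init_orphan_key() later in this patch.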
*/ #define SCOUTFS_INODE_INDEX_ZONE 1 -#define SCOUTFS_RID_ZONE 2 +#define SCOUTFS_ORPHAN_ZONE 2 #define SCOUTFS_FS_ZONE 3 #define SCOUTFS_LOCK_ZONE 4 /* Items only stored in server btrees */ @@ -592,7 +598,7 @@ struct scoutfs_log_merge_freeing { #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2 #define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */ -/* rid zone (also used in server alloc btree) */ +/* orphan zone, redundant type used for clarity */ #define SCOUTFS_ORPHAN_TYPE 1 /* fs zone */ diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 3c78dc21..15e4014f 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -34,6 +34,7 @@ #include "client.h" #include "cmp.h" #include "omap.h" +#include "forest.h" /* * XXX @@ -54,10 +55,19 @@ struct inode_allocator { }; struct inode_sb_info { + struct super_block *sb; + bool stopped; + spinlock_t writeback_lock; struct rb_root writeback_inodes; struct inode_allocator dir_ino_alloc; struct inode_allocator ino_alloc; + + struct delayed_work orphan_scan_dwork; + + /* serialize multiple inode ->evict trying to delete same ino's items */ + spinlock_t deleting_items_lock; + struct list_head deleting_items_list; }; #define DECLARE_INODE_SB_INFO(sb, name) \ @@ -352,7 +362,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, if (!S_ISREG(inode->i_mode)) return 0; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); if (ret) return ret; @@ -379,7 +389,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock) LIST_HEAD(ind_locks); int ret; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false); if (ret) return ret; @@ -494,7 +504,7 @@ retry: } } - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false); if (ret) goto out; @@ -1207,7 +1217,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq) * Returns > 0 if the seq changed and the locks should be retried. 
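 *
 * A typical caller loops until the seq stops changing, as the helpers
 * in this patch do:
 *
 *	do {
 *		ret = scoutfs_inode_index_start(sb, &seq) ?:
 *		      scoutfs_inode_index_prepare(sb, list, inode, set_data_seq) ?:
 *		      scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
 *	} while (ret > 0);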
*/ int scoutfs_inode_index_try_lock_hold(struct super_block *sb, - struct list_head *list, u64 seq) + struct list_head *list, u64 seq, bool allocing) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct index_lock *ind_lock; @@ -1223,7 +1233,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb, goto out; } - ret = scoutfs_hold_trans(sb); + ret = scoutfs_hold_trans(sb, allocing); if (ret == 0 && seq != sbi->trans_seq) { scoutfs_release_trans(sb); ret = 1; @@ -1237,7 +1247,7 @@ out: } int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, - bool set_data_seq) + bool set_data_seq, bool allocing) { struct super_block *sb = inode->i_sb; int ret; @@ -1247,7 +1257,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, ret = scoutfs_inode_index_start(sb, &seq) ?: scoutfs_inode_index_prepare(sb, list, inode, set_data_seq) ?: - scoutfs_inode_index_try_lock_hold(sb, list, seq); + scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing); } while (ret > 0); return ret; @@ -1437,41 +1447,74 @@ out: return inode; } -static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino) +static void init_orphan_key(struct scoutfs_key *key, u64 ino) { *key = (struct scoutfs_key) { - .sk_zone = SCOUTFS_RID_ZONE, - .sko_rid = cpu_to_le64(rid), - .sk_type = SCOUTFS_ORPHAN_TYPE, + .sk_zone = SCOUTFS_ORPHAN_ZONE, .sko_ino = cpu_to_le64(ino), + .sk_type = SCOUTFS_ORPHAN_TYPE, }; } -int scoutfs_orphan_dirty(struct super_block *sb, u64 ino) +/* + * Create an orphan item. The orphan items are maintained in their own + * zone under a write only lock while the caller has the inode protected + * by a write lock. + */ +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) { - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_lock *lock = sbi->rid_lock; struct scoutfs_key key; - init_orphan_key(&key, sbi->rid, ino); + init_orphan_key(&key, ino); - return scoutfs_item_dirty(sb, &key, lock); + return scoutfs_item_create_force(sb, &key, NULL, 0, lock); } -int scoutfs_orphan_delete(struct super_block *sb, u64 ino) +int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) { - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_lock *lock = sbi->rid_lock; struct scoutfs_key key; - int ret; - init_orphan_key(&key, sbi->rid, ino); + init_orphan_key(&key, ino); - ret = scoutfs_item_delete(sb, &key, lock); - if (ret == -ENOENT) - ret = 0; + return scoutfs_item_delete_force(sb, &key, lock); +} - return ret; +struct deleting_ino_entry { + struct list_head head; + u64 ino; +}; + +static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino) +{ + struct deleting_ino_entry *tmp; + bool added = true; + + spin_lock(&inf->deleting_items_lock); + + list_for_each_entry(tmp, &inf->deleting_items_list, head) { + if (tmp->ino == ino) { + added = false; + break; + } + } + + if (added) { + del->ino = ino; + list_add_tail(&del->head, &inf->deleting_items_list); + } + + spin_unlock(&inf->deleting_items_lock); + + return added; +} + +static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del) +{ + if (del->ino) { + spin_lock(&inf->deleting_items_lock); + list_del_init(&del->head); + spin_unlock(&inf->deleting_items_lock); + } } /* @@ -1482,9 +1525,21 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino) * orphan item will continue triggering attempts to finish previous * partial deletion until all deletion is 
complete and the orphan item * is removed. + * + * Currently this can be called multiple times for multiple cached + * inodes for a given ino number (ilookup avoids freeing inodes to avoid + * cluster lock<->inode flag waiting inversions). Some items are not + * safe to delete concurrently, for example concurrent data truncation + * could free extents multiple times. We use a very silly list of inos + * being deleted. Duplicates just return success. If the first + * deletion ends up failing, orphan deletion will come back around later + * and retry. */ -static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) +static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *orph_lock) { + DECLARE_INODE_SB_INFO(sb, inf); + struct deleting_ino_entry del = {{NULL, }}; struct scoutfs_inode sinode; struct scoutfs_key key; LIST_HEAD(ind_locks); @@ -1494,6 +1549,11 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo u64 size; int ret; + if (!added_deleting_ino(inf, &del, ino)) { + ret = 0; + goto out; + } + init_inode_key(&key, ino); ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), @@ -1531,7 +1591,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?: - scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq); + scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; if (ret) @@ -1553,8 +1613,9 @@ retry: if (ret) goto out; - ret = scoutfs_orphan_delete(sb, ino); + ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock); out: + del_deleting_ino(inf, &del); if (release) scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); @@ -1568,11 +1629,17 @@ out: * tear down. We use locking and open inode number bitmaps to decide if * we should finally destroy an inode that is no longer open nor * reachable through directory entries. + * + * Because lookup ignores freeing inodes we can get here from multiple + * instances of an inode that is being deleted. Orphan scanning in + * particular can race with deletion. delete_inode_items() resolves + * concurrent attempts. */ void scoutfs_evict_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); + struct scoutfs_lock *orph_lock; struct scoutfs_lock *lock; int ret; @@ -1584,14 +1651,21 @@ void scoutfs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); - ret = scoutfs_omap_should_delete(sb, inode, &lock); + ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock); if (ret > 0) { - ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock); + ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock); scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); } - if (ret < 0) + if (ret == -ERESTARTSYS) { + /* task may have a signal pending; the orphan scan can find the inode later */ + scoutfs_inc_counter(sb, inode_evict_intr); + ret = 0; + } + if (ret < 0) { scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.", ret, ino); + } scoutfs_omap_dec(sb, ino); @@ -1626,75 +1700,141 @@ int scoutfs_drop_inode(struct inode *inode) } /* - * Find orphan items and process each one. - * - * Runtime of this will be bounded by the number of orphans, which could - * theoretically be very large.
If that becomes a problem we might want to push - * this work off to a thread. - * - * This only scans orphans for this node. This will need to be covered by - * the rest of node zone cleanup. + * All mounts are performing this work concurrently. We introduce + * significant jitter between them to try and keep them from all + * bunching up and working on the same inodes. */ -int scoutfs_scan_orphans(struct super_block *sb) +static void schedule_orphan_dwork(struct inode_sb_info *inf) { - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_lock *lock = sbi->rid_lock; - struct scoutfs_lock *inode_lock = NULL; - struct scoutfs_key key; +#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC) +#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC) + unsigned long delay; + + if (!inf->stopped) { + delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS + + prandom_u32_max(ORPHAN_SCAN_JITTER_MS)); + schedule_delayed_work(&inf->orphan_scan_dwork, delay); + } +} + +/* + * Find and delete inodes whose only remaining reference is the + * persistent orphan item that was created as they were unlinked. + * + * Orphan items are created as the final directory entry referring to an + * inode is deleted. They're deleted as the final cached inode is + * evicted and the inode items are destroyed. They can linger if all + * the cached inodes pinning the inode fail to delete as they are + * evicted from the cache -- either through crashing or errors. + * + * This work runs in all mounts in the background looking for orphaned + * inodes that should be deleted. + * + * We use the forest hint call to read the persistent forest trees + * looking for orphan items without creating lock contention. Orphan + * items exist for O_TMPFILE users and we don't want to force them to + * commit by trying to acquire a conflicting read lock on the orphan + * zone. There's no rush to reclaim deleted items; eventually they will + * be found in the persistent item btrees. + * + * Once we find candidate orphan items we can first check our local + * inode cache for inodes that are already on their way to eviction and + * can be skipped. Then we ask the server for the open map containing + * the inode. Only if we don't have it cached, and no one else does, do + * we try to read it into our cache and evict it to trigger the final + * inode deletion process. + * + * Orphan items that make it that far should be very rare. They can + * only exist if all the mounts that were using an inode after it had + * been unlinked (or created with O_TMPFILE) didn't unmount cleanly.
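+ *
+ * As a concrete sketch of the open map check described above, the
+ * worker below splits each candidate ino into a map group and bit
+ * (constants assumed from omap.h) before asking the server:
+ *
+ *	group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
+ *	bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
+ *	if (test_bit_le(bit_nr, omap.bits))
+ *		skip it, someone else still has it open or cached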
+ */ +static void inode_orphan_scan_worker(struct work_struct *work) +{ + struct inode_sb_info *inf = container_of(work, struct inode_sb_info, + orphan_scan_dwork.work); + struct super_block *sb = inf->sb; + struct scoutfs_open_ino_map omap; struct scoutfs_key last; + struct scoutfs_key next; + struct scoutfs_key key; + struct inode *inode; + u64 group_nr; + int bit_nr; u64 ino; - int err = 0; int ret; - trace_scoutfs_scan_orphans(sb); + scoutfs_inc_counter(sb, orphan_scan); - init_orphan_key(&key, sbi->rid, 0); - init_orphan_key(&last, sbi->rid, ~0ULL); + init_orphan_key(&last, U64_MAX); + omap.args.group_nr = cpu_to_le64(U64_MAX); - while (1) { - ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock); - if (ret == -ENOENT) /* No more orphan items */ - break; - if (ret < 0) + for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) { + if (inf->stopped) { + ret = 0; goto out; - - ino = le64_to_cpu(key.sko_ino); - - ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock); - if (ret == 0) { - ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock); - scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); } - if (ret && ret != -ENOENT && !err) - err = ret; - if (le64_to_cpu(key.sko_ino) == U64_MAX) { - ret = -ENOENT; + /* find the next orphan item */ + init_orphan_key(&key, ino); + ret = scoutfs_forest_next_hint(sb, &key, &next); + if (ret < 0) { + if (ret == -ENOENT) + break; + goto out; + } + + if (scoutfs_key_compare(&next, &last) > 0) break; + + scoutfs_inc_counter(sb, orphan_scan_item); + ino = le64_to_cpu(next.sko_ino); + + /* locally cached inodes will already be deleted */ + inode = scoutfs_ilookup(sb, ino); + if (inode) { + scoutfs_inc_counter(sb, orphan_scan_cached); + iput(inode); + continue; } - le64_add_cpu(&key.sko_ino, 1); + + /* get an omap that covers the orphaned ino */ + group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT; + bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK; + + if (le64_to_cpu(omap.args.group_nr) != group_nr) { + ret = scoutfs_client_open_ino_map(sb, group_nr, &omap); + if (ret < 0) + goto out; + } + + /* don't need to evict if someone else has it open (cached) */ + if (test_bit_le(bit_nr, omap.bits)) { + scoutfs_inc_counter(sb, orphan_scan_omap_set); + continue; + } + + /* try to cache and evict the unused inode to delete it; can be racing */ + inode = scoutfs_iget(sb, ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) + continue; + else + goto out; + } + + scoutfs_inc_counter(sb, orphan_scan_read); + SCOUTFS_I(inode)->drop_invalidated = true; + iput(inode); } ret = 0; + out: - return err ?
err : ret; -} + if (ret < 0) + scoutfs_inc_counter(sb, orphan_scan_error); -int scoutfs_orphan_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_lock *lock = sbi->rid_lock; - struct scoutfs_key key; - int ret; - - trace_scoutfs_orphan_inode(sb, inode); - - init_orphan_key(&key, sbi->rid, scoutfs_ino(inode)); - - ret = scoutfs_item_create(sb, &key, NULL, 0, lock); - - return ret; + schedule_orphan_dwork(inf); } /* @@ -1803,16 +1943,43 @@ int scoutfs_inode_setup(struct super_block *sb) if (!inf) return -ENOMEM; + inf->sb = sb; spin_lock_init(&inf->writeback_lock); inf->writeback_inodes = RB_ROOT; spin_lock_init(&inf->dir_ino_alloc.lock); spin_lock_init(&inf->ino_alloc.lock); + INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker); + spin_lock_init(&inf->deleting_items_lock); + INIT_LIST_HEAD(&inf->deleting_items_list); sbi->inode_sb_info = inf; return 0; } +/* + * Our inode subsystem is set up pretty early but orphan scanning uses + * many other subsystems like networking and the server. We only kick + * it off once everything is ready. + */ +int scoutfs_inode_start(struct super_block *sb) +{ + DECLARE_INODE_SB_INFO(sb, inf); + + schedule_orphan_dwork(inf); + return 0; +} + +void scoutfs_inode_stop(struct super_block *sb) +{ + DECLARE_INODE_SB_INFO(sb, inf); + + if (inf) { + inf->stopped = true; + cancel_delayed_work_sync(&inf->orphan_scan_dwork); + } +} + void scoutfs_inode_destroy(struct super_block *sb) { struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info; diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 070d6492..7cb61b57 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -75,7 +75,6 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb); void scoutfs_destroy_inode(struct inode *inode); int scoutfs_drop_inode(struct inode *inode); void scoutfs_evict_inode(struct inode *inode); -int scoutfs_orphan_inode(struct inode *inode); struct inode *scoutfs_iget(struct super_block *sb, u64 ino); struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); @@ -89,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb, struct list_head *list, u64 ino, umode_t mode); int scoutfs_inode_index_try_lock_hold(struct super_block *sb, - struct list_head *list, u64 seq); + struct list_head *list, u64 seq, bool allocing); int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list, - bool set_data_seq); + bool set_data_seq, bool allocing); void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list); int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock); @@ -120,9 +119,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int scoutfs_setattr(struct dentry *dentry, struct iattr *attr); -int scoutfs_scan_orphans(struct super_block *sb); -int scoutfs_orphan_dirty(struct super_block *sb, u64 ino); -int scoutfs_orphan_delete(struct super_block *sb, u64 ino); +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); +int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); void scoutfs_inode_queue_writeback(struct inode *inode); int scoutfs_inode_walk_writeback(struct super_block *sb, bool write); @@ -133,6 +131,8 @@ void scoutfs_inode_exit(void); int scoutfs_inode_init(void); int scoutfs_inode_setup(struct super_block *sb); +int scoutfs_inode_start(struct super_block *sb); +void scoutfs_inode_stop(struct
super_block *sb); void scoutfs_inode_destroy(struct super_block *sb); #endif diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index b323b9a1..cb3f4a4e 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -38,6 +38,7 @@ #include "hash.h" #include "srch.h" #include "alloc.h" +#include "server.h" #include "scoutfs_trace.h" /* @@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg) /* setting only so we don't see 0 data seq with nonzero data_version */ set_data_seq = sm.data_version != 0 ? true : false; - ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq); + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false); if (ret) goto unlock; @@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg) sfm.rid = sbi->rid; sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks); sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks); + sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb); ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq); if (ret) diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 5042edfe..446611e9 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more { __u64 committed_seq; __u64 total_meta_blocks; __u64 total_data_blocks; + __u64 reserved_meta_blocks; }; #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \ diff --git a/kmod/src/item.c b/kmod/src/item.c index d9cc2b2f..549bfcec 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -95,7 +95,7 @@ struct item_cache_info { /* written by page readers, read by shrink */ spinlock_t active_lock; - struct rb_root active_root; + struct list_head active_list; }; #define DECLARE_ITEM_CACHE_INFO(sb, name) \ @@ -127,6 +127,7 @@ struct cached_page { unsigned long lru_time; struct list_head dirty_list; struct list_head dirty_head; + u64 max_liv_seq; struct page *page; unsigned int page_off; unsigned int erased_bytes; @@ -385,6 +386,14 @@ static void put_pg(struct super_block *sb, struct cached_page *pg) } } +static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item) +{ + u64 liv_seq = le64_to_cpu(item->liv.seq); + + if (liv_seq > pg->max_liv_seq) + pg->max_liv_seq = liv_seq; +} + /* * Allocate space for a new item from the free offset at the end of a * cached page. This isn't a blocking allocation, and it's likely that @@ -416,6 +425,8 @@ static struct cached_item *alloc_item(struct cached_page *pg, if (val_len) memcpy(item->val, val, val_len); + update_pg_max_liv_seq(pg, item); + return item; } @@ -622,6 +633,8 @@ static void mark_item_dirty(struct super_block *sb, list_add_tail(&item->dirty_head, &pg->dirty_list); item->dirty = 1; } + + update_pg_max_liv_seq(pg, item); } static void clear_item_dirty(struct super_block *sb, @@ -1260,46 +1273,76 @@ static int cache_empty_page(struct super_block *sb, return 0; } +/* + * Readers operate independently from dirty items and transactions. + * They read a set of persistent items and insert them into the cache + * when there aren't already pages whose key range contains the items. + * This naturally prefers cached dirty items over stale read items. + * + * We have to deal with the case where dirty items are written and + * invalidated while a read is in flight. The reader won't have seen + * the items that were dirty in their persistent roots as they started + * reading. 
By the time they insert their read pages the previously + * dirty items have been reclaimed and are not in the cache. The old + * stale items will be inserted in their place, effectively corrupting + * the cache by making the dirty items disappear. + * + * We fix this by tracking the max seq of items in pages. As readers + * start they record the current transaction seq. Invalidation skips + * pages with a max seq greater than the first reader seq because the + * items in the page have to stick around to prevent the readers' stale + * items from being inserted. + * + * This naturally only affects a small set of pages with items that were + * written relatively recently. If we're under memory pressure then we + * probably have a lot of pages and they'll naturally have items that + * were visible to any readers. We don't bother with the complicated and + * expensive further refinement of tracking the ranges that are being + * read and comparing those with pages to invalidate. + */ struct active_reader { - struct rb_node node; - struct scoutfs_key start; - struct scoutfs_key end; + struct list_head head; + u64 seq; }; -static struct active_reader *active_rbtree_walk(struct rb_root *root, - struct scoutfs_key *start, - struct scoutfs_key *end, - struct rb_node **par, - struct rb_node ***pnode) +#define INIT_ACTIVE_READER(rdr) \ + struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) } + +static void add_active_reader(struct super_block *sb, struct active_reader *active) +{ + DECLARE_ITEM_CACHE_INFO(sb, cinf); + + BUG_ON(!list_empty(&active->head)); + + active->seq = scoutfs_trans_sample_seq(sb); + + spin_lock(&cinf->active_lock); + list_add_tail(&active->head, &cinf->active_list); + spin_unlock(&cinf->active_lock); +} + +static u64 first_active_reader_seq(struct item_cache_info *cinf) { - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - struct active_reader *ret = NULL; struct active_reader *active; - int cmp; + u64 first; - while (*node) { - parent = *node; - active = container_of(*node, struct active_reader, node); + /* the oldest active reader sits at the head of the list */ + spin_lock(&cinf->active_lock); + active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head); + first = active ? active->seq : U64_MAX; + spin_unlock(&cinf->active_lock); - cmp = scoutfs_key_compare_ranges(start, end, &active->start, - &active->end); - if (cmp < 0) { - node = &(*node)->rb_left; - } else if (cmp > 0) { - node = &(*node)->rb_right; - } else { - ret = active; - node = &(*node)->rb_left; - } + return first; +} + +static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active) +{ + /* only the calling task adds or deletes this active */ + if (!list_empty(&active->head)) { + spin_lock(&cinf->active_lock); + list_del_init(&active->head); + spin_unlock(&cinf->active_lock); } - - if (par) - *par = parent; - if (pnode) - *pnode = node; - - return ret; } /* @@ -1399,22 +1442,15 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, * locks held, but without locking the cache. The regions we read can * be stale with respect to the current cache, which can be read and * dirtied by other cluster lock holders on our node, but the cluster - * locks protect the stable items we read. - * - * There's also the exciting case where a reader can populate the cache - * with stale old persistent data which was read before another local - * cluster lock holder was able to read, dirty, write, and then shrink - * the cache.
In this case the cache couldn't be cleared by lock - * invalidation because the caller is actively holding the lock. But - * shrinking could evict the cache within the held lock. So we record - * that we're an active reader in the range covered by the lock and - * shrink will refuse to reclaim any pages that intersect with our read. + * locks protect the stable items we read. Invalidation is careful not + * to drop pages that have items that we couldn't see because they were + * dirty when we started reading. */ static int read_pages(struct super_block *sb, struct item_cache_info *cinf, struct scoutfs_key *key, struct scoutfs_lock *lock) { struct rb_root root = RB_ROOT; - struct active_reader active; + INIT_ACTIVE_READER(active); struct cached_page *right = NULL; struct cached_page *pg; struct cached_page *rd; @@ -1430,15 +1466,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf, int pgi; int ret; - /* stop shrink from freeing new clean data, would let us cache stale */ - active.start = lock->start; - active.end = lock->end; - spin_lock(&cinf->active_lock); - active_rbtree_walk(&cinf->active_root, &active.start, &active.end, - &par, &pnode); - rbtree_insert(&active.node, par, pnode, &cinf->active_root); - spin_unlock(&cinf->active_lock); - /* start with an empty page that covers the whole lock */ pg = alloc_pg(sb, 0); if (!pg) { @@ -1449,6 +1476,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf, pg->end = lock->end; rbtree_insert(&pg->node, NULL, &root.rb_node, &root); + /* set active reader seq before reading persistent roots */ + add_active_reader(sb, &active); + ret = scoutfs_forest_read_items(sb, lock, key, &start, &end, read_page_item, &root); if (ret < 0) @@ -1526,9 +1556,7 @@ retry: ret = 0; out: - spin_lock(&cinf->active_lock); - rbtree_erase(&active.node, &cinf->active_root); - spin_unlock(&cinf->active_lock); + del_active_reader(cinf, &active); /* free any pages we left dangling on error */ for_each_page_safe(&root, rd, pg_tmp) { @@ -1830,8 +1858,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, if (!item || item->deletion) { ret = -ENOENT; } else { - mark_item_dirty(sb, cinf, pg, NULL, item); item->liv.seq = item_seq(sb, lock); + mark_item_dirty(sb, cinf, pg, NULL, item); ret = 0; } @@ -2406,9 +2434,9 @@ retry: /* * Shrink the size of the item cache. We're operating against the fast - * path lock ordering and we skip pages if we can't acquire locks. - * Similarly, we can run into dirty pages or pages which intersect with - * active readers that we can't shrink and also choose to skip. + * path lock ordering and we skip pages if we can't acquire locks. We + * can run into dirty pages or pages with items that weren't visible to + * the earliest active reader, which must be skipped.
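+ *
+ * Concretely, a page is skipped when
+ *
+ *	first_active_reader_seq(cinf) <= pg->max_liv_seq
+ *
+ * i.e. some item in the page was dirtied at or after the moment the
+ * oldest in-flight read sampled the transaction seq, so dropping the
+ * page could let that reader insert stale items over it.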
*/ static int item_lru_shrink(struct shrinker *shrink, struct shrink_control *sc) @@ -2417,26 +2445,24 @@ static int item_lru_shrink(struct shrinker *shrink, struct item_cache_info, shrinker); struct super_block *sb = cinf->sb; - struct active_reader *active; struct cached_page *tmp; struct cached_page *pg; + u64 first_reader_seq; int nr; if (sc->nr_to_scan == 0) goto out; nr = sc->nr_to_scan; + /* can't invalidate pages with items that weren't visible to first reader */ + first_reader_seq = first_active_reader_seq(cinf); + write_lock(&cinf->rwlock); spin_lock(&cinf->lru_lock); list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) { - /* can't invalidate ranges being read, reader might be stale */ - spin_lock(&cinf->active_lock); - active = active_rbtree_walk(&cinf->active_root, &pg->start, - &pg->end, NULL, NULL); - spin_unlock(&cinf->active_lock); - if (active) { + if (first_reader_seq <= pg->max_liv_seq) { scoutfs_inc_counter(sb, item_shrink_page_reader); continue; } @@ -2505,7 +2531,7 @@ int scoutfs_item_setup(struct super_block *sb) spin_lock_init(&cinf->lru_lock); INIT_LIST_HEAD(&cinf->lru_list); spin_lock_init(&cinf->active_lock); - cinf->active_root = RB_ROOT; + INIT_LIST_HEAD(&cinf->active_list); cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages); if (!cinf->pcpu_pages) @@ -2536,7 +2562,7 @@ void scoutfs_item_destroy(struct super_block *sb) int cpu; if (cinf) { - BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root)); + BUG_ON(!list_empty(&cinf->active_list)); unregister_hotcpu_notifier(&cinf->notifier); unregister_shrinker(&cinf->shrinker); diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 36227eae..2278ee71 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -1347,29 +1347,28 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode } /* - * The rid lock protects a mount's private persistent items in the rid - * zone. It's held for the duration of the mount. It lets the mount - * modify the rid items at will and signals to other mounts that we're - * still alive and our rid items shouldn't be reclaimed. + * Orphan items are stored in their own zone, which is modified with + * shared write_only locks and read inconsistently, without locks, by + * background scanning work. * - * Being held for the entire mount prevents other nodes from reclaiming - * our items, like free blocks, when it would make sense for them to be - * able to. Maybe we have a bunch free and they're trying to allocate - * and are getting ENOSPC. + * Since we only use write_only locks we just lock the entire zone, but + * the API takes the ino in case we ever change the locking scheme.
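+ *
+ * Typical usage, as in the unlink path of this patch:
+ *
+ *	ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
+ *	...
+ *	ret = scoutfs_inode_orphan_create(sb, ino, orph_lock);
+ *	...
+ *	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);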
*/ -int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, - u64 rid, struct scoutfs_lock **lock) +int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino, + struct scoutfs_lock **lock) { struct scoutfs_key start; struct scoutfs_key end; scoutfs_key_set_zeros(&start); - start.sk_zone = SCOUTFS_RID_ZONE; - start.sko_rid = cpu_to_le64(rid); + start.sk_zone = SCOUTFS_ORPHAN_ZONE; + start.sko_ino = 0; + start.sk_type = SCOUTFS_ORPHAN_TYPE; - scoutfs_key_set_ones(&end); - end.sk_zone = SCOUTFS_RID_ZONE; - end.sko_rid = cpu_to_le64(rid); + scoutfs_key_set_zeros(&end); + end.sk_zone = SCOUTFS_ORPHAN_ZONE; + end.sko_ino = cpu_to_le64(U64_MAX); + end.sk_type = SCOUTFS_ORPHAN_TYPE; return lock_key_range(sb, mode, flags, &start, &end, lock); } diff --git a/kmod/src/lock.h b/kmod/src/lock.h index d043f9fc..8c3277ab 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int struct inode *d, struct scoutfs_lock **D_lock); int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, struct scoutfs_lock **lock); -int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, - u64 rid, struct scoutfs_lock **lock); +int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, + u64 ino, struct scoutfs_lock **lock); void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode); diff --git a/kmod/src/omap.c b/kmod/src/omap.c index bbe80976..529cfda4 100644 --- a/kmod/src/omap.c +++ b/kmod/src/omap.c @@ -595,10 +595,6 @@ out: free_req(req); } - /* it's fine if we couldn't send to a client that left */ - if (ret == -ENOTCONN) - ret = 0; - return ret; } @@ -908,9 +904,9 @@ out: } /* - * Return 1 and give the caller a write inode lock if it is safe to be - * deleted. It's safe to be deleted when it is no longer reachable and - * nothing is referencing it. + * Return 1 and give the caller their locks when they should delete the + * inode items. It's safe to delete the inode items when the inode is + * no longer reachable and nothing is referencing it. * * The inode is unreachable when nlink hits zero. Cluster locks protect * modification and testing of nlink. We use the ino_lock_cov coverage @@ -925,15 +921,17 @@ out: * increase nlink from zero and let people get a reference to the inode.
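 *
 * The evict path in this patch consumes the returned locks like so:
 *
 *	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
 *	if (ret > 0) {
 *		ret = delete_inode_items(sb, scoutfs_ino(inode), lock, orph_lock);
 *		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
 *		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
 *	}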
*/ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret) + struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct scoutfs_lock *orph_lock = NULL; struct scoutfs_lock *lock = NULL; const u64 ino = scoutfs_ino(inode); struct scoutfs_omap_lock_data *ldata; u64 group_nr; int bit_nr; int ret; + int err; /* lock group and omap constants are defined independently */ BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR); @@ -964,12 +962,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, out: trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret); + if (ret > 0) { + err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock); + if (err < 0) + ret = err; + } + if (ret <= 0) { scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); lock = NULL; } *lock_ret = lock; + *orph_lock_ret = orph_lock; return ret; } diff --git a/kmod/src/omap.h b/kmod/src/omap.h index 0e5fa29d..9a2d1e0b 100644 --- a/kmod/src/omap.h +++ b/kmod/src/omap.h @@ -4,7 +4,7 @@ int scoutfs_omap_inc(struct super_block *sb, u64 ino); void scoutfs_omap_dec(struct super_block *sb, u64 ino); int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret); + struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret); void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata); int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id, struct scoutfs_open_ino_map_args *args); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index fb5ea548..b92471fd 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func, ); DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), - TP_ARGS(sb, journal_info, holders), + TP_ARGS(sb, journal_info, holders, ret), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(unsigned long, journal_info) __field(int, holders) + __field(int, ret) ), TP_fast_assign( @@ -440,17 +441,18 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class, __entry->holders = holders; + __entry->ret = ret; ), - TP_printk(SCSBF" journal_info 0x%0lx holders %d", - SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders) + TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d", + SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret) ); -DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), - TP_ARGS(sb, journal_info, holders) +DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans, + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), + TP_ARGS(sb, journal_info, holders, ret) ); DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans, - TP_PROTO(struct super_block *sb, void *journal_info, int holders), - TP_ARGS(sb, journal_info, holders) + TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret), + TP_ARGS(sb, journal_info, holders, ret) ); TRACE_EVENT(scoutfs_ioc_release, @@ -985,22 +986,6 @@ TRACE_EVENT(scoutfs_delete_inode, __entry->mode, __entry->size) ); -TRACE_EVENT(scoutfs_scan_orphans, - TP_PROTO(struct super_block *sb), - - TP_ARGS(sb), - - TP_STRUCT__entry( - __field(dev_t, dev) - ), -
TP_fast_assign( - __entry->dev = sb->s_dev; - ), - - TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) -); - DECLARE_EVENT_CLASS(scoutfs_key_class, TP_PROTO(struct super_block *sb, struct scoutfs_key *key), TP_ARGS(sb, key), diff --git a/kmod/src/server.c b/kmod/src/server.c index 9e8307b8..c0be71fd 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work) struct commit_waiter *cw; struct commit_waiter *pos; struct llist_node *node; + u64 reserved; int ret; trace_scoutfs_server_commit_work_enter(sb, 0, 0); @@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work) server->other_avail = &super->server_meta_avail[server->other_ind]; server->other_freed = &super->server_meta_freed[server->other_ind]; - /* swap avail/free if avail gets low and freed is high */ - if (le64_to_cpu(server->meta_avail->total_len) <= - SCOUTFS_SERVER_META_ALLOC_MIN && - le64_to_cpu(server->meta_freed->total_len) > - SCOUTFS_SERVER_META_ALLOC_MIN) + /* + * The reserved metadata block count includes the max size of + * the outstanding allocators, and a server transaction could be + * asked to refill all those allocators from meta_avail. If our + * meta_avail falls below the reserved count, and freed is still + * above it, then swap so that we don't start returning ENOSPC + * until we're truly low. + */ + reserved = scoutfs_server_reserved_meta_blocks(sb); + if (le64_to_cpu(server->meta_avail->total_len) <= reserved && + le64_to_cpu(server->meta_freed->total_len) > reserved) swap(server->meta_avail, server->meta_freed); ret = 0; @@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb, dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0); } +/* + * Copy-on-write transactions need to allocate new dirty blocks as they + * make modifications to delete items and eventually free more blocks. + * The reserved blocks are meant to keep enough available blocks in + * flight to allow servers and clients to perform transactions that + * don't consume additional space. We have quite a few allocators in + * flight across the server and various client mechanisms (posix items, + * srch compaction, and log merging). We also want to include + * sufficient blocks for client log btrees to grow tall enough to be + * finalized and merged. + * + * The reserved blocks calculation is a policy of the server but it's + * exposed to the statfs_more interface so that df isn't misleading. + * Requiring this synchronization without explicit protocol + * communication isn't great. + */ +u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb) +{ + DECLARE_SERVER_INFO(sb, server); + u64 server_blocks; + u64 client_blocks; + u64 log_blocks; + u64 nr_clients; + + /* server has two meta_avail lists it swaps between */ + server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2; + + /* + * Log trees will be compacted once they hit a height of 3. + * That'll be the grandparent, two parents resulting from a + * split, and all their child blocks (roughly calculated, + * overestimating). + */ + log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE / + (sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref))); + + /* + * Each client can have a meta_avail list, srch compaction + * request, log merge request, and a log btree it's building.
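+ *
+ * Putting the pieces together, the value computed below works out to
+ *
+ *	reserved = 2 * SCOUTFS_SERVER_META_FILL_TARGET +
+ *		   max(1, nr_clients) * (2 * SCOUTFS_SERVER_META_FILL_TARGET +
+ *					 SCOUTFS_SERVER_MERGE_FILL_TARGET +
+ *					 log_blocks)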
+
 /*
  * Set all the bits in the destination which overlap with the extent.
  */
@@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb,
	struct scoutfs_log_trees lt;
	struct scoutfs_key key;
	bool have_fin = false;
+	bool unlock_alloc = false;
	u64 data_zone_blocks;
	u64 nr;
	int ret;
@@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb,
		lt.nr = cpu_to_le64(nr);
	}

-	/* finalize an existing root when large enough and don't have one */
-	if (lt.item_root.height > 2 && !have_fin) {
+	/*
+	 * Finalize the client log btree when it has enough leaf blocks
+	 * to allow some degree of merging concurrency.  Smaller btrees
+	 * are also finalized when their meta allocator was low so that
+	 * deleted items are merged promptly and freed blocks can bring
+	 * the client out of enospc.
+	 */
+	if (!have_fin && ((lt.item_root.height > 2) ||
+			  (le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) {
		fin = lt;
		memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
		memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
@@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb,
		data_zone_blocks = 0;
	}

-	/* return freed to server for emptying, refill avail */
+	/*
+	 * Reclaim the freed meta and data allocators and refill the
+	 * avail allocators, setting low flags if they drop too low.
+	 */
	mutex_lock(&server->alloc_mutex);
-	ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
-					server->other_freed,
+	unlock_alloc = true;
+
+	ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
					&lt.meta_freed) ?:
-	      alloc_move_empty(sb, &super->data_alloc, &lt.data_freed) ?:
-	      scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
-				      &lt.meta_avail, server->meta_avail,
-				      SCOUTFS_SERVER_META_FILL_LO,
-				      SCOUTFS_SERVER_META_FILL_TARGET) ?:
-	      alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
-				      SCOUTFS_SERVER_DATA_FILL_LO,
-				      SCOUTFS_SERVER_DATA_FILL_TARGET,
-				      exclusive, vacant, data_zone_blocks);
-	mutex_unlock(&server->alloc_mutex);
+	      alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
	if (ret < 0)
		goto unlock;

+	ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
+				      &lt.meta_avail, server->meta_avail,
+				      SCOUTFS_SERVER_META_FILL_LO,
+				      SCOUTFS_SERVER_META_FILL_TARGET);
+	if (ret < 0)
+		goto unlock;
+
+	if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
+		lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+	else
+		lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+
+	ret = alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
+				      SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET,
+				      exclusive, vacant, data_zone_blocks);
+	if (ret < 0)
+		goto unlock;
+
+	if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
+		lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+	else
+		lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
+
+	mutex_unlock(&server->alloc_mutex);
+	unlock_alloc = false;
+
	/* record data alloc zone bits */
	zero_data_alloc_zone_bits(&lt);
	if (data_zone_blocks != 0) {
@@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb,
	ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
				  &super->logs_root, &key, &lt, sizeof(lt));
 unlock:
+	if (unlock_alloc)
+		mutex_unlock(&server->alloc_mutex);
	mutex_unlock(&server->logs_mutex);

	ret = scoutfs_server_apply_commit(sb, ret);
@@ -2277,15 +2366,27 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
				     open_ino_map_response, NULL, NULL);
 }

-/* The server is sending an omap response to the client */
+/*
+ * The server is sending an omap response to the client that originated
+ * the request.  These responses are sent long after the incoming
+ * request has pinned the client connection and guaranteed that we'll be
+ * able to queue a response.  This can race with the client connection
+ * being torn down and it's OK if we drop the response.  Either the
+ * client is being evicted and we don't care about it anymore or we're
+ * tearing down in unmount and the client will resend to the next
+ * server.
+ */
 int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
				      struct scoutfs_open_ino_map *map, int err)
 {
	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	int ret;

-	return scoutfs_net_response_node(sb, server->conn, rid,
-					 SCOUTFS_NET_CMD_OPEN_INO_MAP, id, err,
-					 map, sizeof(*map));
+	ret = scoutfs_net_response_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
+					id, err, map, sizeof(*map));
+	if (ret == -ENOTCONN)
+		ret = 0;
+	return ret;
 }

 /* The server is receiving an omap request from the client */
diff --git a/kmod/src/server.h b/kmod/src/server.h
index 79fcb443..d5829abe 100644
--- a/kmod/src/server.h
+++ b/kmod/src/server.h
@@ -56,6 +56,8 @@ do {									\
		__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags,	\
		__entry->name##_error

+u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);
+
 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
				struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
@@ -75,8 +77,6 @@ u64 scoutfs_server_seq(struct super_block *sb);
 u64 scoutfs_server_next_seq(struct super_block *sb);
 void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);

-struct sockaddr_in;
-struct scoutfs_quorum_elected_info;
 int scoutfs_server_start(struct super_block *sb, u64 term);
 void scoutfs_server_abort(struct super_block *sb);
 void scoutfs_server_stop(struct super_block *sb);
diff --git a/kmod/src/super.c b/kmod/src/super.c
index 19e1503f..65224dff 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -247,11 +247,10 @@ static void scoutfs_put_super(struct super_block *sb)

	trace_scoutfs_put_super(sb);

+	scoutfs_inode_stop(sb);
+	scoutfs_forest_stop(sb);
	scoutfs_srch_destroy(sb);

-	scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
-	sbi->rid_lock = NULL;
-
	scoutfs_lock_shutdown(sb);
	scoutfs_shutdown_trans(sb);
@@ -623,10 +622,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
	      scoutfs_quorum_setup(sb) ?:
	      scoutfs_client_setup(sb) ?:
	      scoutfs_volopt_setup(sb) ?:
-	      scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
-			       &sbi->rid_lock) ?:
	      scoutfs_trans_get_log_trees(sb) ?:
-	      scoutfs_srch_setup(sb);
+	      scoutfs_srch_setup(sb) ?:
+	      scoutfs_inode_start(sb);
	if (ret)
		goto out;
@@ -647,7 +645,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
		goto out;

	scoutfs_trans_restart_sync_deadline(sb);
-//	scoutfs_scan_orphans(sb);
	ret = 0;
 out:
	/* on error, generic_shutdown_super calls put_super if s_root */
diff --git a/kmod/src/super.h b/kmod/src/super.h
index e44d6575..e7733856 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -36,7 +36,6 @@ struct scoutfs_sb_info {

	/* assigned once at the start of each mount, read-only */
	u64 rid;
-	struct scoutfs_lock *rid_lock;

	struct scoutfs_super_block super;
diff --git a/kmod/src/trans.c b/kmod/src/trans.c
index 07eea0fa..9417e39a 100644
--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -436,8 +436,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
		return true;
	}

-	/* Try to refill data allocator before premature enospc */
-	if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
+	/* if we're low and can't refill then alloc could empty and return enospc */
+	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
		scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
		return true;
	}
@@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
	return false;
 }

-static bool acquired_hold(struct super_block *sb)
+/*
+ * Called as a wait_event condition, this must be careful not to change
+ * the task state and is racing with waking paths that sub_return, test,
+ * and wake.
+ */
+static bool holders_no_writer(struct trans_info *tri)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	DECLARE_TRANS_INFO(sb, tri);
-	bool acquired;
-
-	/* if a caller already has a hold we acquire unconditionally */
-	if (inc_journal_info_holders()) {
-		atomic_inc(&tri->holders);
-		acquired = true;
-		goto out;
-	}
-
-	/* wait if the writer is blocking holds */
-	if (!inc_holders_unless_writer(tri)) {
-		dec_journal_info_holders();
-		acquired = false;
-		goto out;
-	}
-
-	/* wait if we're triggering another commit */
-	if (commit_before_hold(sb, tri)) {
-		release_holders(sb);
-		queue_trans_work(sbi);
-		acquired = false;
-		goto out;
-	}
-
-	trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
-	acquired = true;
-out:
-	return acquired;
+	smp_mb(); /* make sure the task is in the wait_event queue before the atomic read */
+	return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
 }

 /*
@@ -492,15 +469,64 @@ out:
  * The writing thread marks itself as a global trans_task which
  * short-circuits all the hold machinery so it can call code that would
  * otherwise try to hold transactions while it is writing.
+ *
+ * If the caller is adding metadata items that will eventually consume
+ * free space -- not dirtying existing items or adding deletion items --
+ * then we can return enospc if our metadata allocator indicates that
+ * we're low on space.
 */
-int scoutfs_hold_trans(struct super_block *sb)
+int scoutfs_hold_trans(struct super_block *sb, bool allocing)
 {
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	DECLARE_TRANS_INFO(sb, tri);
+	u64 seq;
+	int ret;

	if (current == sbi->trans_task)
		return 0;

-	return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
+	for (;;) {
+		/* if a caller already has a hold we acquire unconditionally */
+		if (inc_journal_info_holders()) {
+			atomic_inc(&tri->holders);
+			ret = 0;
+			break;
+		}
+
+		/* wait until the writer work is finished */
+		if (!inc_holders_unless_writer(tri)) {
+			dec_journal_info_holders();
+			ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
+			if (ret < 0)
+				break;
+			continue;
+		}
+
+		/* return enospc if server is into reserved blocks and we're allocating */
+		if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
+			release_holders(sb);
+			ret = -ENOSPC;
+			break;
+		}
+
+		/* see if we need to trigger and wait for a commit before holding */
+		if (commit_before_hold(sb, tri)) {
+			seq = scoutfs_trans_sample_seq(sb);
+			release_holders(sb);
+			queue_trans_work(sbi);
+			ret = wait_event_interruptible(sbi->trans_hold_wq,
						       scoutfs_trans_sample_seq(sb) != seq);
+			if (ret < 0)
+				break;
+			continue;
+		}
+
+		ret = 0;
+		break;
+	}
+
+	trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
+	return ret;
 }

 /*
@@ -525,7 +551,7 @@ void scoutfs_release_trans(struct super_block *sb)

	release_holders(sb);

-	trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
+	trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
 }
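/*
 * A sketch of the caller pattern that the new argument implies; the
 * create_items() helper here is hypothetical.  Paths that add new
 * metadata items pass allocing = true and see a hard -ENOSPC once the
 * server has set the low flag, while paths that only dirty or delete
 * existing items pass false and are allowed to proceed.
 */
static int create_sketch(struct super_block *sb)
{
	int ret;

	ret = scoutfs_hold_trans(sb, true);
	if (ret < 0)
		return ret;	/* -ENOSPC when the server's low flag is set */

	ret = create_items(sb);	/* hypothetical item creation */
	scoutfs_release_trans(sb);
	return ret;
}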

 /*
diff --git a/kmod/src/trans.h b/kmod/src/trans.h
index ab42a4cf..51ae1232 100644
--- a/kmod/src/trans.h
+++ b/kmod/src/trans.h
@@ -1,18 +1,13 @@
 #ifndef _SCOUTFS_TRANS_H_
 #define _SCOUTFS_TRANS_H_

-/* the server will attempt to fill data allocs for each trans */
-#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
-/* the client will force commits if data allocators get too low */
-#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)
-
 void scoutfs_trans_write_func(struct work_struct *work);
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);
-int scoutfs_hold_trans(struct super_block *sb);
+int scoutfs_hold_trans(struct super_block *sb, bool allocing);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
 u64 scoutfs_trans_sample_seq(struct super_block *sb);
diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c
index 6c00c0c5..fd8acd8e 100644
--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
	if (ret > 0)
		goto retry;
	if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
			      &tgs) != 0)
			memset(&tgs, 0, sizeof(tgs));

-		ret = scoutfs_hold_trans(sb);
+		ret = scoutfs_hold_trans(sb, false);
		if (ret < 0)
			break;
		release = true;
diff --git a/tests/.gitignore b/tests/.gitignore
index d8268d17..f9edc55f 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -5,3 +5,4 @@ src/handle_cat
 src/bulk_create_paths
 src/find_xattrs
 src/stage_tmpfile
+src/create_xattr_loop
diff --git a/tests/Makefile b/tests/Makefile
index 9a640bc4..81e358a5 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -7,7 +7,8 @@ BIN := src/createmany \
       src/handle_cat \
       src/bulk_create_paths \
       src/stage_tmpfile \
-      src/find_xattrs
+      src/find_xattrs \
+      src/create_xattr_loop

 DEPS := $(wildcard src/*.d)
diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh
index 8f146e34..9ca9f304 100644
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -71,6 +71,7 @@ t_filter_dmesg()
	re="$re|scoutfs .* quorum .* error"
	re="$re|scoutfs .* error reading quorum block"
	re="$re|scoutfs .* error .* writing quorum block"
+	re="$re|scoutfs .* error .* while checking to delete inode"

	egrep -v "($re)"
 }
diff --git a/tests/golden/enospc b/tests/golden/enospc
new file mode 100644
index 00000000..150e5cf9
--- /dev/null
+++ b/tests/golden/enospc
@@ -0,0 +1,8 @@
+== prepare directories and files
+== fallocate until enospc
+== remove all the files and verify free data blocks
+== make small meta fs
+== create large xattrs until we fill up metadata
+== remove files with xattrs after enospc
+== make sure we can create again
+== cleanup small meta fs
diff --git a/tests/golden/orphan-inodes b/tests/golden/orphan-inodes
new file mode 100644
index 00000000..cb79e12d
--- /dev/null
+++ b/tests/golden/orphan-inodes
@@ -0,0 +1,4 @@
+== test our inode existence function
+== unlinked and opened inodes still exist
+== orphan from failed evict deletion is picked up
+== orphaned inos in all mounts all deleted
diff --git a/tests/sequence b/tests/sequence
index 0d709c1d..b97e4847 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -7,6 +7,7 @@ simple-release-extents.sh
 setattr_more.sh
 offline-extent-waiting.sh
 move-blocks.sh
+enospc.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 lock-refleak.sh
@@ -29,6 +30,7 @@ cross-mount-data-free.sh
 persistent-item-vers.sh
 setup-error-teardown.sh
 fence-and-reclaim.sh
+orphan-inodes.sh
 mount-unmount-race.sh
 createmany-parallel-mounts.sh
 archive-light-cycle.sh
diff --git a/tests/src/create_xattr_loop.c b/tests/src/create_xattr_loop.c
new file mode 100644
index 00000000..8123437f
--- /dev/null
+++ b/tests/src/create_xattr_loop.c
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+
+static void exit_usage(void)
+{
+	printf(" -h/-?  output this usage message and exit\n"
+	       " -c     number of xattrs to create\n"
+	       " -n     xattr name prefix, -NR is appended\n"
+	       " -p     string with path to file with xattrs\n"
+	       " -s     xattr value size\n");
+	exit(1);
+}
+
+int main(int argc, char **argv)
+{
+	char *pref = NULL;
+	char *path = NULL;
+	char *val;
+	char *name;
+	unsigned long long count = 0;
+	unsigned long long size = 0;
+	unsigned long long i;
+	int ret;
+	int c;
+
+	while ((c = getopt(argc, argv, "+c:n:p:s:")) != -1) {
+
+		switch (c) {
+		case 'c':
+			count = strtoull(optarg, NULL, 0);
+			break;
+		case 'n':
+			pref = strdup(optarg);
+			break;
+		case 'p':
+			path = strdup(optarg);
+			break;
+		case 's':
+			size = strtoull(optarg, NULL, 0);
+			break;
+		case '?':
+			printf("unknown argument: %c\n", optopt);
+			/* fall through to print usage and exit */
+		case 'h':
+			exit_usage();
+		}
+	}
+
+	if (count == 0) {
+		printf("specify count of xattrs to create with -c\n");
+		exit(1);
+	}
+
+	if (count == ULLONG_MAX) {
+		printf("invalid -c count\n");
+		exit(1);
+	}
+
+	if (size == 0) {
+		printf("specify xattr value size with -s\n");
+		exit(1);
+	}
+
+	if (size == ULLONG_MAX || size < 2) {
+		printf("invalid -s size\n");
+		exit(1);
+	}
+
+	if (path == NULL) {
+		printf("specify path to file with -p\n");
+		exit(1);
+	}
+
+	if (pref == NULL) {
+		printf("specify xattr name prefix string with -n\n");
+		exit(1);
+	}
+
+	ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1;
+	name = malloc(ret);
+	if (!name) {
+		printf("couldn't allocate xattr name buffer\n");
+		exit(1);
+	}
+
+	val = malloc(size);
+	if (!val) {
+		printf("couldn't allocate xattr value buffer\n");
+		exit(1);
+	}
+
+	memset(val, 'a', size - 1);
+	val[size - 1] = '\0';
+
+	for (i = 0; i < count; i++) {
+		sprintf(name, "%s-%llu", pref, i);
+
+		ret = setxattr(path, name, val, size, 0);
+		if (ret) {
+			printf("returned %d errno %d (%s)\n",
+			       ret, errno, strerror(errno));
+			return 1;
+		}
+	}
+
+	return 0;
+}
diff --git a/tests/tests/enospc.sh b/tests/tests/enospc.sh
new file mode 100644
index 00000000..ab042479
--- /dev/null
+++ b/tests/tests/enospc.sh
@@ -0,0 +1,100 @@
+#
+# test hitting enospc by filling with data or metadata and
+# then recovering by removing what we filled.
+#
+
+# Type    Size    Total  Used     Free Use%
+#MetaData 64KB  1048576 32782  1015794    3
+# Data     4KB 16777152     0 16777152    0
+free_blocks() {
+	local md="$1"
+	local mnt="$2"
+
+	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
+}
+
+t_require_commands scoutfs stat fallocate createmany create_xattr_loop setfattr
+
+echo "== prepare directories and files"
+for n in $(t_fs_nrs); do
+	eval path="\$T_D${n}/dir-$n/file-$n"
+	mkdir -p $(dirname $path)
+	touch $path
+done
+sync
+
+echo "== fallocate until enospc"
+before=$(free_blocks Data "$T_M0")
+finished=0
+while [ $finished != 1 ]; do
+	for n in $(t_fs_nrs); do
+		eval path="\$T_D${n}/dir-$n/file-$n"
+		off=$(stat -c "%s" "$path")
+
+		LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
+		err="$?"
+
+		if grep -qi "no space" $T_TMP.fallocate; then
+			finished=1
+			break
+		fi
+		if [ "$err" != "0" ]; then
+			t_fail "fallocate failed with $err"
+		fi
+	done
+done
+
+echo "== remove all the files and verify free data blocks"
+for n in $(t_fs_nrs); do
+	eval dir="\$T_D${n}/dir-$n"
+	rm -rf "$dir"
+done
+sync
+after=$(free_blocks Data "$T_M0")
+# nothing else should be modifying data blocks
+test "$before" == "$after" || \
+	t_fail "$after free data blocks after rm, expected $before"
+
+# XXX this is all pretty manual, would be nice to have helpers
+echo "== make small meta fs"
+# meta device just big enough for reserves and the metadata we'll fill
+scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
+	t_fail "mkfs failed"
+SCR="/mnt/scoutfs.enospc"
+mkdir -p "$SCR"
+mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
+	"$T_EX_DATA_DEV" "$SCR"
+
+echo "== create large xattrs until we fill up metadata"
+mkdir -p "$SCR/xattrs"
+
+for f in $(seq 1 100000); do
+	file="$SCR/xattrs/file-$f"
+	touch "$file"
+
+	LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
+	err="$?"
+
+	if grep -qi "no space" $T_TMP.cxl; then
+		echo "enospc at f $f" >> $T_TMP.cxl
+		break
+	fi
+	if [ "$err" != "0" ]; then
+		t_fail "create_xattr_loop failed with $err"
+	fi
+done
+
+echo "== remove files with xattrs after enospc"
+rm -rf "$SCR/xattrs"
+
+echo "== make sure we can create again"
+file="$SCR/file-after"
+touch $file
+setfattr -n user.scoutfs-enospc -v 1 "$file"
+sync
+rm -f "$file"
+
+echo "== cleanup small meta fs"
+umount "$SCR"
+rmdir "$SCR"
+
+t_pass
diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh
new file mode 100644
index 00000000..bc55d35b
--- /dev/null
+++ b/tests/tests/orphan-inodes.sh
@@ -0,0 +1,77 @@
+#
+# make sure we clean up orphaned inodes
+#
+
+t_require_commands sleep touch sync stat handle_cat kill rm
+t_require_mounts 2
+
+#
+# Usually bash prints an annoying output message when jobs are killed.
+# We can avoid that by redirecting stderr for the bash process when it
+# reaps the jobs that are killed.
+#
+silent_kill() {
+	exec {ERR}>&2 2>/dev/null
+	kill "$@"
+	wait "$@"
+	exec 2>&$ERR {ERR}>&-
+}
+
+#
+# We don't have a great way to test that inode items still exist.  We
+# don't prevent opening handles with nlink 0 today, so we'll use that.
+# If that ever changes this will need some other method.
+#
+inode_exists()
+{
+	local ino="$1"
+
+	handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
+}
+
+echo "== test our inode existence function"
+path="$T_D0/file"
+touch "$path"
+ino=$(stat -c "%i" "$path")
+inode_exists $ino || echo "$ino didn't exist"
+
+echo "== unlinked and opened inodes still exist"
+sleep 1000000 < "$path" &
+pid="$!"
+rm -f "$path"
+inode_exists $ino || echo "$ino didn't exist"
+
+echo "== orphan from failed evict deletion is picked up"
+# pending kill signal stops evict from getting locks and deleting
+silent_kill $pid
+sleep 55
+inode_exists $ino && echo "$ino still exists"
+
+echo "== orphaned inos in all mounts all deleted"
+pids=""
+inos=""
+for nr in $(t_fs_nrs); do
+	eval path="\$T_D${nr}/file-$nr"
+	touch "$path"
+	inos="$inos $(stat -c %i $path)"
+	sleep 1000000 < "$path" &
+	pids="$pids $!"
+ rm -f "$path" +done +sync +silent_kill $pids +for nr in $(t_fs_nrs); do + t_force_umount $nr +done +t_mount_all +# wait for all fence requests to complete +while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do + sleep .5 +done +# wait for orphan scans to run +sleep 55 +for ino in $inos; do + inode_exists $ino && echo "$ino still exists" +done + +t_pass diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index 09062fb0..d7723302 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -32,10 +32,18 @@ A path within a ScoutFS filesystem. .PD .TP -.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]" +.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]" .sp Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses separate block devices for its metadata and data storage, two are required. +The internal structures and nature of metadata and data transactions +lead to minimum viable device sizes. +.B mkfs +will check both devices and fail with an error if either are under the +minimum size. If +.B --allow-small-size +is given then sizes under the minimum size will be +allowed after printing an informational warning. .sp If .B --force @@ -81,6 +89,10 @@ kibibytes, mebibytes, etc. .B "-d, --max-data-size SIZE" Same as previous, but for limiting the size of the data device. .TP +.B "-A, --allow-small-size" +Allows use of specified device sizes less than the minimum. This can +result in bad behaviour and is only intended for testing. +.TP .B "-z, --data-alloc-zone-blocks BLOCKS" Set the data_alloc_zone_blocks volume option, as described in .BR scoutfs (5). diff --git a/utils/src/dev.c b/utils/src/dev.c index 303e6438..af1e91f6 100644 --- a/utils/src/dev.c +++ b/utils/src/dev.c @@ -6,12 +6,13 @@ #include #include #include +#include #include "sparse.h" #include "dev.h" int device_size(char *path, int fd, - u64 min_size, u64 max_size, + u64 min_size, u64 max_size, bool allow_small_size, char *use_type, u64 *size_ret) { struct stat st; @@ -63,10 +64,13 @@ int device_size(char *path, int fd, if (size < min_size) { fprintf(stderr, BASE_SIZE_FMT" %s too small for min " - BASE_SIZE_FMT" %s device\n", + BASE_SIZE_FMT" %s device%s\n", BASE_SIZE_ARGS(size), target_type, - BASE_SIZE_ARGS(min_size), use_type); - return -EINVAL; + BASE_SIZE_ARGS(min_size), use_type, + allow_small_size ? 
", allowing with -A" : ""); + + if (!allow_small_size) + return -EINVAL; } *size_ret = size; diff --git a/utils/src/dev.h b/utils/src/dev.h index f7017da4..df79fe4c 100644 --- a/utils/src/dev.h +++ b/utils/src/dev.h @@ -1,6 +1,8 @@ #ifndef _DEV_H_ #define _DEV_H_ +#include + #define BASE_SIZE_FMT "%.2f%s" #define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1) @@ -8,7 +10,7 @@ #define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz) int device_size(char *path, int fd, - u64 min_size, u64 max_size, + u64 min_size, u64 max_size, bool allow_small_size, char *use_type, u64 *size_ret); float size_flt(u64 nr, unsigned size); char *size_str(u64 nr, unsigned size); diff --git a/utils/src/df.c b/utils/src/df.c index 21ea9f04..585d658c 100644 --- a/utils/src/df.c +++ b/utils/src/df.c @@ -86,6 +86,11 @@ static int do_df(struct df_args *args) data_free += ade[i].blocks; } + if (meta_free >= sfm.reserved_meta_blocks) + meta_free -= sfm.reserved_meta_blocks; + else + meta_free = 0; + snprintf(cells[0][0], CHARS, "Type"); snprintf(cells[0][1], CHARS, "Size"); snprintf(cells[0][2], CHARS, "Total"); diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index bcf07357..e18d5f9b 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -135,6 +135,7 @@ struct mkfs_args { unsigned long long max_data_size; u64 data_alloc_zone_blocks; bool force; + bool allow_small_size; int nr_slots; struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS]; }; @@ -215,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args) goto out; } - ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024), - args->max_meta_size, "meta", &meta_size); + /* minumum meta device size to make reserved blocks reasonably large */ + ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024), + args->max_meta_size, args->allow_small_size, "meta", &meta_size); if (ret) goto out; - ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024), - args->max_data_size, "data", &data_size); + /* .. 
diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c
index bcf07357..e18d5f9b 100644
--- a/utils/src/mkfs.c
+++ b/utils/src/mkfs.c
@@ -135,6 +135,7 @@ struct mkfs_args {
	unsigned long long max_data_size;
	u64 data_alloc_zone_blocks;
	bool force;
+	bool allow_small_size;
	int nr_slots;
	struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
 };
@@ -215,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args)
		goto out;
	}

-	ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
-			  args->max_meta_size, "meta", &meta_size);
+	/* minimum meta device size to make reserved blocks reasonably large */
+	ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
+			  args->max_meta_size, args->allow_small_size, "meta", &meta_size);
	if (ret)
		goto out;

-	ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
-			  args->max_data_size, "data", &data_size);
+	/* ... then arbitrarily the same minimum data device size */
+	ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
+			  args->max_data_size, args->allow_small_size, "data", &data_size);
	if (ret)
		goto out;
@@ -520,6 +523,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
				   prev_val, args->max_data_size);
		break;
	}
+	case 'A':
+		args->allow_small_size = true;
+		break;
	case 'z': /* data-alloc-zone-blocks */
	{
		ret = parse_u64(arg, &args->data_alloc_zone_blocks);
@@ -559,6 +565,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
 static struct argp_option options[] = {
	{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
	{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
+	{ "allow-small-size", 'A', NULL, 0, "Allow meta/data devices smaller than the minimum, still warns"},
	{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
	{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
	{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mount writes to a zone (4KB blocks)"},
diff --git a/utils/src/print.c b/utils/src/print.c
index c6ea1fe0..4c79a5fb 100644
--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -158,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
	    type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
		return print_inode_index;

-	if (zone == SCOUTFS_RID_ZONE) {
+	if (zone == SCOUTFS_ORPHAN_ZONE) {
		if (type == SCOUTFS_ORPHAN_TYPE)
			return print_orphan;
	}
@@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
	le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)

 #define AL_HEAD_F \
-	AL_REF_F" total_nr %llu first_nr %u"
+	AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
 #define AL_HEAD_A(p) \
	AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),	\
-	le32_to_cpu((p)->first_nr)
+	le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)

 #define ALCROOT_F \
-	BTROOT_F" total_len %llu"
+	BTROOT_F" total_len %llu flags 0x%x"
 #define ALCROOT_A(ar) \
-	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
+	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)

 #define SRE_FMT "%016llx.%llu.%llu"
 #define SRE_A(sre)						\