mirror of https://github.com/versity/scoutfs.git (synced 2026-05-02 02:45:43 +00:00)
Merge pull request #46 from versity/zab/orphan_deletion_and_enospc
Zab/orphan deletion and enospc
@@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
  *
  * Unlike meta allocations, the caller is expected to serialize
  * allocations from the root.
+ *
+ * ENOBUFS is returned if the data allocator ran out of space and we can
+ * probably refill it from the server.  The caller is expected to back
+ * out, commit the transaction, and try again.
+ *
+ * ENOSPC is returned if the data allocator ran out of space but we have
+ * a flag from the server telling us that there's no more space
+ * available.  This is a hard error and should be returned.
  */
 int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
@@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	ret = 0;
 out:
 	if (ret < 0) {
 		/*
 		 * Special retval meaning there wasn't space to alloc from
 		 * this txn.  Doesn't mean filesystem is completely full.
 		 * Maybe upper layers want to try again.
 		 */
-		if (ret == -ENOENT)
-			ret = -ENOBUFS;
+		if (ret == -ENOENT) {
+			if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
+				ret = -ENOSPC;
+			else
+				ret = -ENOBUFS;
+		}

 		*blkno_ret = 0;
 		*count_ret = 0;
 	} else {
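The two error codes above imply a retry loop in the data paths. A minimal sketch of the expected calling pattern, with hypothetical helper names (do_data_alloc() standing in for a full scoutfs_alloc_data() call, commit_trans_for_refill() for the back-out-and-commit step):

static int fill_extent_with_refill(struct super_block *sb)
{
	int ret;

	for (;;) {
		ret = do_data_alloc(sb);		/* hypothetical wrapper */
		if (ret != -ENOBUFS)
			break;	/* 0 on success; -ENOSPC and others are hard errors */
		/* allocator empty but refillable: back out, commit, retry */
		ret = commit_trans_for_refill(sb);	/* hypothetical helper */
		if (ret < 0)
			break;
	}
	return ret;
}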
@@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
 	return lo;
 }

+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			     struct scoutfs_alloc *alloc, u32 flag)
+{
+	unsigned int seq;
+	bool set;
+
+	do {
+		seq = read_seqbegin(&alloc->seqlock);
+		set = !!(le32_to_cpu(alloc->avail.flags) & flag);
+	} while (read_seqretry(&alloc->seqlock, seq));
+
+	return set;
+}
+
 /*
  * Call the caller's callback for every persistent allocator structure
  * we can find.
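scoutfs_alloc_test_flag() gives a torn-read-safe view of the avail root's flags: the seqlock loop retries the read if a writer swapped or refilled the roots mid-read. A one-line usage sketch (the surrounding policy is illustrative, not from this patch):

	/* illustrative: refuse to start new work once the server says space is low */
	if (scoutfs_alloc_test_flag(sb, alloc, SCOUTFS_ALLOC_FLAG_LOW))
		return -ENOSPC;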
@@ -38,6 +38,10 @@
 #define SCOUTFS_ALLOC_DATA_LG_THRESH \
 	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)

+/* the client will force commits if data allocators get too low */
+#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
+	((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+
 /*
  * Fill client alloc roots to the target when they fall below the lo
  * threshold.
@@ -55,6 +59,7 @@
 #define SCOUTFS_SERVER_DATA_FILL_LO \
 	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+

 /*
  * Log merge meta allocations are only used for one request and will
  * never use more than the dirty limit.
@@ -65,16 +70,6 @@
 	((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
 #define SCOUTFS_SERVER_MERGE_FILL_LO	SCOUTFS_SERVER_MERGE_FILL_TARGET

-/*
- * Each of the server meta_alloc roots will try to keep a minimum amount
- * of free blocks.  The server will swap roots when its current avail
- * falls below the threshold while the freed root is still above it.  It
- * must have room for all the largest allocation attempted in a
- * transaction on the server.
- */
-#define SCOUTFS_SERVER_META_ALLOC_MIN \
-	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
-
 /*
  * A run-time use of a pair of persistent avail/freed roots as a
  * metadata allocator.  It has the machinery needed to lock and avoid
@@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,

 bool scoutfs_alloc_meta_low(struct super_block *sb,
 			    struct scoutfs_alloc *alloc, u32 nr);
+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			     struct scoutfs_alloc *alloc, u32 flag);

 typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
 					  int owner, u64 id,
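The thresholds are byte counts shifted down into small-block units. Assuming 4KiB small blocks, i.e. SCOUTFS_BLOCK_SM_SHIFT == 12 (an assumption; the value isn't in this hunk), the arithmetic works out as in this self-contained check:

#include <stdio.h>

#define BLOCK_SM_SHIFT 12	/* assumed: 4KiB small blocks */

int main(void)
{
	unsigned long long refill = (256ULL * 1024 * 1024) >> BLOCK_SM_SHIFT;
	unsigned long long fill_lo = (1ULL * 1024 * 1024 * 1024) >> BLOCK_SM_SHIFT;

	printf("client refill threshold: %llu blocks\n", refill);	/* 65536 */
	printf("server data fill lo:     %llu blocks\n", fill_lo);	/* 262144 */
	return 0;
}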
@@ -88,6 +88,7 @@
 	EXPAND_COUNTER(forest_read_items) \
 	EXPAND_COUNTER(forest_roots_next_hint) \
 	EXPAND_COUNTER(forest_set_bloom_bits) \
+	EXPAND_COUNTER(inode_evict_intr) \
 	EXPAND_COUNTER(item_clear_dirty) \
 	EXPAND_COUNTER(item_create) \
 	EXPAND_COUNTER(item_delete) \
@@ -151,6 +152,12 @@
 	EXPAND_COUNTER(net_recv_invalid_message) \
 	EXPAND_COUNTER(net_recv_messages) \
 	EXPAND_COUNTER(net_unknown_request) \
+	EXPAND_COUNTER(orphan_scan) \
+	EXPAND_COUNTER(orphan_scan_cached) \
+	EXPAND_COUNTER(orphan_scan_error) \
+	EXPAND_COUNTER(orphan_scan_item) \
+	EXPAND_COUNTER(orphan_scan_omap_set) \
+	EXPAND_COUNTER(orphan_scan_read) \
 	EXPAND_COUNTER(quorum_elected) \
 	EXPAND_COUNTER(quorum_fence_error) \
 	EXPAND_COUNTER(quorum_fence_leader) \
@@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,

 	while (iblock <= last) {
 		if (inode)
-			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
-							    true);
+			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 		else
-			ret = scoutfs_hold_trans(sb);
+			ret = scoutfs_hold_trans(sb, false);
 		if (ret)
 			break;

@@ -756,8 +755,7 @@ retry:
 		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
 						  true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
-							ind_seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
 	} while (ret > 0);
 	if (ret < 0)
 		goto out;
@@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

 	while(iblock <= last) {

-		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 		if (ret)
 			goto out;

@@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	}

 	/* we're updating meta_seq with offline block count */
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 	if (ret < 0)
 		goto out;

@@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
 		if (ret > 0)
 			continue;
 		if (ret < 0)
@@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
 	return ret;
 }

-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
+/*
+ * Return true if the data allocator is lower than the caller's
+ * requirement and we haven't been told by the server that we're out of
+ * free extents.
+ */
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
 {
 	DECLARE_DATA_INFO(sb, datinf);

-	return scoutfs_dalloc_total_len(&datinf->dalloc) <<
-		SCOUTFS_BLOCK_SM_SHIFT;
+	return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
+	       !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
 }

 int scoutfs_data_setup(struct super_block *sb)
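A sketch of how a client path might consult the new predicate; where exactly the check sits and the queue_commit() helper are assumptions, but the shape follows the REFILL_THRESH comment in alloc.h:

	/* hypothetical trigger: force a commit so the allocator can be refilled */
	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH))
		queue_commit(sb);	/* hypothetical helper name */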
@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
 void scoutfs_data_get_btrees(struct super_block *sb,
 			     struct scoutfs_log_trees *lt);
 int scoutfs_data_prepare_commit(struct super_block *sb);
-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);

 int scoutfs_data_setup(struct super_block *sb);
 void scoutfs_data_destroy(struct super_block *sb);
kmod/src/dir.c: 109 lines changed
@@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      umode_t mode, dev_t rdev,
 				      struct scoutfs_lock **dir_lock,
 				      struct scoutfs_lock **inode_lock,
+				      struct scoutfs_lock **orph_lock,
 				      struct list_head *ind_locks)
 {
 	struct super_block *sb = dir->i_sb;
@@ -701,11 +702,17 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 	if (ret)
 		goto out_unlock;

+	if (orph_lock) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -725,9 +732,13 @@ out_unlock:
 	if (ret) {
 		scoutfs_inode_index_unlock(sb, ind_locks);
 		scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
-		scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
 		*dir_lock = NULL;
+		scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
 		*inode_lock = NULL;
+		if (orph_lock) {
+			scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
+			*orph_lock = NULL;
+		}

 		inode = ERR_PTR(ret);
 	}
@@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
 	inode = lock_hold_create(dir, dentry, mode, rdev,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry,
 	struct super_block *sb = dir->i_sb;
 	struct scoutfs_lock *dir_lock;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	LIST_HEAD(ind_locks);
-	bool del_orphan;
+	bool del_orphan = false;
 	u64 dir_size;
 	u64 ind_seq;
 	u64 hash;
 	u64 pos;
 	int ret;
+	int err;

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);

@@ -843,13 +856,20 @@ static int scoutfs_link(struct dentry *old_dentry,
 		goto out_unlock;

 	dir_size = i_size_read(dir) + dentry->d_name.len;
-	del_orphan = (inode->i_nlink == 0);
+
+	if (inode->i_nlink == 0) {
+		del_orphan = true;
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}

 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -860,7 +880,7 @@ retry:
 		goto out;

 	if (del_orphan) {
-		ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
+		ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
 		if (ret)
 			goto out;
 	}
@@ -871,8 +891,11 @@ retry:
 			       dentry->d_name.name, dentry->d_name.len,
 			       scoutfs_ino(inode), inode->i_mode, dir_lock,
 			       inode_lock);
-	if (ret)
+	if (ret) {
+		err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+		WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
 		goto out;
+	}
 	update_dentry_info(sb, dentry, hash, pos, dir_lock);

 	i_size_write(dir, dir_size);
@@ -880,11 +903,6 @@ retry:
 	inode->i_ctime = dir->i_mtime;
 	inc_nlink(inode);

-	if (del_orphan) {
-		ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
-		WARN_ON_ONCE(ret);
-	}
-
 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);

@@ -896,6 +914,8 @@ out_unlock:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }

@@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct timespec ts = current_kernel_time();
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	LIST_HEAD(ind_locks);
 	u64 ind_seq;
@@ -937,32 +958,36 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
 		goto unlock;
 	}

+	if (should_orphan(inode)) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
 		goto unlock;

+	if (should_orphan(inode)) {
+		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+		if (ret < 0)
+			goto out;
+	}
+
 	ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
 			      dentry_info_pos(dentry), scoutfs_ino(inode),
 			      dir_lock, inode_lock);
-	if (ret)
+	if (ret) {
+		ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
+		WARN_ON_ONCE(ret); /* should have been dirty */
 		goto out;
+	}

-	if (should_orphan(inode)) {
-		/*
-		 * Insert the orphan item before we modify any inode
-		 * metadata so we can gracefully exit should it
-		 * fail.
-		 */
-		ret = scoutfs_orphan_inode(inode);
-		WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
-		if (ret)
-			goto out;
-	}

 	dir->i_ctime = ts;
@@ -984,6 +1009,7 @@ unlock:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }
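The unlink hunk above pins down an ordering worth restating: the orphan zone lock is acquired before the transaction is held, the orphan item is created inside the transaction before the entry items are deleted, and a failed entry deletion backs the orphan item out again. Condensed (delete_entries() is a hypothetical stand-in for the full del_entry_items() call):

	ret = scoutfs_inode_orphan_create(sb, ino, orph_lock);
	if (ret == 0) {
		ret = delete_entries(sb);	/* hypothetical stand-in */
		if (ret)			/* entry survived; remove the orphan */
			WARN_ON_ONCE(scoutfs_inode_orphan_delete(sb, ino, orph_lock));
	}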
@@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 		return ret;

 	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

@@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct scoutfs_lock *new_dir_lock = NULL;
 	struct scoutfs_lock *old_inode_lock = NULL;
 	struct scoutfs_lock *new_inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	struct timespec now;
 	bool ins_new = false;
 	bool del_new = false;
@@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_unlock;

+	if (should_orphan(new_inode)) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1607,7 +1641,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1658,7 +1692,7 @@ retry:
 	ins_old = true;

 	if (should_orphan(new_inode)) {
-		ret = scoutfs_orphan_inode(new_inode);
+		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
 		if (ret)
 			goto out;
 	}
@@ -1762,6 +1796,7 @@ out_unlock:
 	scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }

@@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	struct inode *inode = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	LIST_HEAD(ind_locks);
 	int ret;

@@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 		return -ENAMETOOLONG;

 	inode = lock_hold_create(dir, dentry, mode, 0,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, &orph_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

+	ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+	if (ret < 0) {
+		iput(inode);
+		goto out; /* XXX returning error but items created */
+	}
+
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	insert_inode_hash(inode);
 	ihold(inode); /* need to update inode modifications in d_tmpfile */
 	d_tmpfile(dentry, inode);

 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
-	scoutfs_inode_index_unlock(sb, &ind_locks);
 	iput(inode);

-	ret = scoutfs_orphan_inode(inode);
-	WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
-
+out:
 	scoutfs_release_trans(sb);
+	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }
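The tmpfile path is the one caller that passes a real &orph_lock to lock_hold_create(): an O_TMPFILE inode starts life orphaned. A sketch of the lifecycle these hunks wire up:

/*
 * O_TMPFILE lifecycle (sketch):
 *
 *   scoutfs_tmpfile()
 *     lock_hold_create(..., &orph_lock, ...)  takes the orphan lock up front
 *     scoutfs_inode_orphan_create()           the inode is born orphaned
 *
 * then either linkat() reaches scoutfs_link(), which deletes the orphan
 * item once the inode is reachable again, or the final iput() reaches
 * scoutfs_evict_inode() and delete_inode_items() destroys everything,
 * orphan item included.
 */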
@@ -758,6 +758,16 @@ out:
 	return 0;
 }

+void scoutfs_forest_stop(struct super_block *sb)
+{
+	DECLARE_FOREST_INFO(sb, finf);
+
+	if (finf && finf->workq) {
+		cancel_delayed_work_sync(&finf->log_merge_dwork);
+		destroy_workqueue(finf->workq);
+	}
+}
+
 void scoutfs_forest_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -766,11 +776,6 @@ void scoutfs_forest_destroy(struct super_block *sb)
 	if (finf) {
 		scoutfs_block_put(sb, finf->srch_bl);

-		if (finf->workq) {
-			cancel_delayed_work_sync(&finf->log_merge_dwork);
-			destroy_workqueue(finf->workq);
-		}
-
 		kfree(finf);
 		sbi->forest_info = NULL;
 	}
@@ -39,6 +39,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 			       struct scoutfs_log_trees *lt);

 int scoutfs_forest_setup(struct super_block *sb);
+void scoutfs_forest_stop(struct super_block *sb);
 void scoutfs_forest_destroy(struct super_block *sb);

 #endif
@@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head {
 	struct scoutfs_block_ref ref;
 	__le64 total_nr;
 	__le32 first_nr;
-	__u8 __pad[4];
+	__le32 flags;
 };

 /*
  * While the main allocator uses extent items in btree blocks, metadata
  * allocations for a single transaction are recorded in arrays in
@@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block {
  */
 struct scoutfs_alloc_root {
 	__le64 total_len;
+	__le32 flags;
+	__le32 _pad;
 	struct scoutfs_btree_root root;
 };

+/* Shared by _alloc_list_head and _alloc_root */
+#define SCOUTFS_ALLOC_FLAG_LOW		(1U << 0)
+
 /* types of allocators, exposed to alloc_detail ioctl */
 #define SCOUTFS_ALLOC_OWNER_NONE	0
 #define SCOUTFS_ALLOC_OWNER_SERVER	1
@@ -570,7 +576,7 @@ struct scoutfs_log_merge_freeing {
  * Keys are first sorted by major key zones.
  */
 #define SCOUTFS_INODE_INDEX_ZONE	1
-#define SCOUTFS_RID_ZONE		2
+#define SCOUTFS_ORPHAN_ZONE		2
 #define SCOUTFS_FS_ZONE			3
 #define SCOUTFS_LOCK_ZONE		4
 /* Items only stored in server btrees */
@@ -592,7 +598,7 @@ struct scoutfs_log_merge_freeing {
 #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
 #define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */

-/* rid zone (also used in server alloc btree) */
+/* orphan zone, redundant type used for clarity */
 #define SCOUTFS_ORPHAN_TYPE 1

 /* fs zone */
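Both flags fields land in space that keeps the on-disk structs naturally aligned: the list head converts its four pad bytes, and the alloc root pairs flags with an explicit _pad so the btree root that follows stays 8-byte aligned. A self-contained illustration of that layout reasoning (mirrored shapes for illustration, not the kernel definitions):

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

struct alloc_root_like {		/* mirrors the fields in the hunk */
	uint64_t total_len;
	uint32_t flags;
	uint32_t _pad;
	uint64_t btree_root_stand_in;	/* stands in for struct scoutfs_btree_root */
};

int main(void)
{
	/* flags + _pad together are 8 bytes, keeping the root 8-byte aligned */
	static_assert(offsetof(struct alloc_root_like, btree_root_stand_in) == 16,
		      "btree root must stay 8-byte aligned");
	return 0;
}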
kmod/src/inode.c: 331 lines changed
@@ -34,6 +34,7 @@
 #include "client.h"
 #include "cmp.h"
 #include "omap.h"
+#include "forest.h"

 /*
  * XXX
@@ -54,10 +55,19 @@ struct inode_allocator {
 };

 struct inode_sb_info {
+	struct super_block *sb;
+	bool stopped;
+
 	spinlock_t writeback_lock;
 	struct rb_root writeback_inodes;
 	struct inode_allocator dir_ino_alloc;
 	struct inode_allocator ino_alloc;
+
+	struct delayed_work orphan_scan_dwork;
+
+	/* serialize multiple inode ->evict trying to delete same ino's items */
+	spinlock_t deleting_items_lock;
+	struct list_head deleting_items_list;
 };

 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -352,7 +362,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 	if (ret)
 		return ret;

@@ -379,7 +389,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 	LIST_HEAD(ind_locks);
 	int ret;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		return ret;

@@ -494,7 +504,7 @@ retry:
 		}
 	}

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		goto out;

@@ -1207,7 +1217,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
  * Returns > 0 if the seq changed and the locks should be retried.
  */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq)
+				      struct list_head *list, u64 seq, bool allocing)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1223,7 +1233,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 		goto out;
 	}

-	ret = scoutfs_hold_trans(sb);
+	ret = scoutfs_hold_trans(sb, allocing);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1237,7 +1247,7 @@ out:
 }

 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq)
+				  bool set_data_seq, bool allocing)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1247,7 +1257,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode,
 						  set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
 	} while (ret > 0);

 	return ret;
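Every index-lock caller now threads an allocing flag down to scoutfs_hold_trans(). trans.c isn't part of this diff, so the precise semantics are an inference from the call sites: paths that may consume data allocator space pass true, paths that only free or touch metadata pass false. Summarized from the hunks above:

/*
 * Inferred caller -> allocing mapping (trans.c not shown in this diff):
 *
 *   true:  fallocate, writepage paths, init_offline_extent,
 *          mknod/link/symlink/rename entry creation
 *   false: truncate, unlink, evict/delete_inode_items, setattr_more,
 *          move_blocks (frees about as much as it allocates)
 */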
@@ -1437,41 +1447,74 @@ out:
 	return inode;
 }

-static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
+static void init_orphan_key(struct scoutfs_key *key, u64 ino)
 {
 	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_RID_ZONE,
-		.sko_rid = cpu_to_le64(rid),
-		.sk_type = SCOUTFS_ORPHAN_TYPE,
+		.sk_zone = SCOUTFS_ORPHAN_ZONE,
+		.sko_ino = cpu_to_le64(ino),
+		.sk_type = SCOUTFS_ORPHAN_TYPE,
 	};
 }

-int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
+/*
+ * Create an orphan item.  The orphan items are maintained in their own
+ * zone under a write only lock while the caller has the inode protected
+ * by a write lock.
+ */
+int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
 	struct scoutfs_key key;

-	init_orphan_key(&key, sbi->rid, ino);
+	init_orphan_key(&key, ino);

-	return scoutfs_item_dirty(sb, &key, lock);
+	return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
 }

-int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
+int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
 	struct scoutfs_key key;
-	int ret;

-	init_orphan_key(&key, sbi->rid, ino);
+	init_orphan_key(&key, ino);

-	ret = scoutfs_item_delete(sb, &key, lock);
-	if (ret == -ENOENT)
-		ret = 0;
-
-	return ret;
+	return scoutfs_item_delete_force(sb, &key, lock);
 }

+struct deleting_ino_entry {
+	struct list_head head;
+	u64 ino;
+};
+
+static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
+{
+	struct deleting_ino_entry *tmp;
+	bool added = true;
+
+	spin_lock(&inf->deleting_items_lock);
+
+	list_for_each_entry(tmp, &inf->deleting_items_list, head) {
+		if (tmp->ino == ino) {
+			added = false;
+			break;
+		}
+	}
+
+	if (added) {
+		del->ino = ino;
+		list_add_tail(&del->head, &inf->deleting_items_list);
+	}
+
+	spin_unlock(&inf->deleting_items_lock);
+
+	return added;
+}
+
+static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
+{
+	if (del->ino) {
+		spin_lock(&inf->deleting_items_lock);
+		list_del_init(&del->head);
+		spin_unlock(&inf->deleting_items_lock);
+	}
+}
@@ -1482,9 +1525,21 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
  * orphan item will continue triggering attempts to finish previous
  * partial deletion until all deletion is complete and the orphan item
  * is removed.
+ *
+ * Currently this can be called multiple times for multiple cached
+ * inodes for a given ino number (ilookup avoids freeing inodes to avoid
+ * cluster lock<->inode flag waiting inversions).  Some items are not
+ * safe to delete concurrently, for example concurrent data truncation
+ * could free extents multiple times.  We use a very silly list of inos
+ * being deleted.  Duplicates just return success.  If the first
+ * deletion ends up failing orphan deletion will come back around later
+ * and retry.
  */
-static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
+static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
+			      struct scoutfs_lock *orph_lock)
 {
+	DECLARE_INODE_SB_INFO(sb, inf);
+	struct deleting_ino_entry del = {{NULL, }};
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
@@ -1494,6 +1549,11 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 	u64 size;
 	int ret;

+	if (!added_deleting_ino(inf, &del, ino)) {
+		ret = 0;
+		goto out;
+	}
+
 	init_inode_key(&key, ino);

 	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1531,7 +1591,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1553,8 +1613,9 @@ retry:
 	if (ret)
 		goto out;

-	ret = scoutfs_orphan_delete(sb, ino);
+	ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
 out:
+	del_deleting_ino(inf, &del);
 	if (release)
 		scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
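delete_inode_items() brackets its body with the deleting-list helpers so concurrent evictions of the same ino collapse into one worker. The shape of the guard, condensed from the hunks above:

	struct deleting_ino_entry del = {{NULL, }};

	if (!added_deleting_ino(inf, &del, ino))
		return 0;	/* another task is already deleting this ino */
	/* ... look up the inode item, truncate, delete items, drop orphan ... */
	del_deleting_ino(inf, &del);	/* paired even on error so retries can proceed */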
@@ -1568,11 +1629,17 @@ out:
  * tear down.  We use locking and open inode number bitmaps to decide if
  * we should finally destroy an inode that is no longer open nor
  * reachable through directory entries.
+ *
+ * Because lookup ignores freeing inodes we can get here from multiple
+ * instances of an inode that is being deleted.  Orphan scanning in
+ * particular can race with deletion.  delete_inode_items() resolves
+ * concurrent attempts.
  */
 void scoutfs_evict_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
+	struct scoutfs_lock *orph_lock;
 	struct scoutfs_lock *lock;
 	int ret;

@@ -1584,14 +1651,21 @@ void scoutfs_evict_inode(struct inode *inode)

 	truncate_inode_pages_final(&inode->i_data);

-	ret = scoutfs_omap_should_delete(sb, inode, &lock);
+	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
 	if (ret > 0) {
-		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
+		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
 		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	}
-	if (ret < 0)
+	if (ret == -ERESTARTSYS) {
+		/* can be in task with pending, could be found as orphan */
+		scoutfs_inc_counter(sb, inode_evict_intr);
+		ret = 0;
+	}
+	if (ret < 0) {
 		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
 			    ret, ino);
+	}

 	scoutfs_omap_dec(sb, ino);
@@ -1626,75 +1700,141 @@ int scoutfs_drop_inode(struct inode *inode)
 }

 /*
- * Find orphan items and process each one.
- *
- * Runtime of this will be bounded by the number of orphans, which could
- * theoretically be very large.  If that becomes a problem we might want
- * to push this work off to a thread.
- *
- * This only scans orphans for this node.  This will need to be covered
- * by the rest of node zone cleanup.
+ * All mounts are performing this work concurrently.  We introduce
+ * significant jitter between them to try and keep them from all
+ * bunching up and working on the same inodes.
  */
-int scoutfs_scan_orphans(struct super_block *sb)
+static void schedule_orphan_dwork(struct inode_sb_info *inf)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
-	struct scoutfs_lock *inode_lock = NULL;
-	struct scoutfs_key key;
+#define ORPHAN_SCAN_MIN_MS	(10 * MSEC_PER_SEC)
+#define ORPHAN_SCAN_JITTER_MS	(40 * MSEC_PER_SEC)
+	unsigned long delay;
+
+	if (!inf->stopped) {
+		delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
+					 prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
+		schedule_delayed_work(&inf->orphan_scan_dwork, delay);
+	}
+}
+
+/*
+ * Find and delete inodes whose only remaining reference is the
+ * persistent orphan item that was created as they were unlinked.
+ *
+ * Orphan items are created as the final directory entry referring to an
+ * inode is deleted.  They're deleted as the final cached inode is
+ * evicted and the inode items are destroyed.  They can linger if all
+ * the cached inodes pinning the inode fail to delete as they are
+ * evicted from the cache -- either through crashing or errors.
+ *
+ * This work runs in all mounts in the background looking for orphaned
+ * inodes that should be deleted.
+ *
+ * We use the forest hint call to read the persistent forest trees
+ * looking for orphan items without creating lock contention.  Orphan
+ * items exist for O_TMPFILE users and we don't want to force them to
+ * commit by trying to acquire a conflicting read lock on the orphan
+ * zone.  There's no rush to reclaim deleted items, eventually they will
+ * be found in the persistent item btrees.
+ *
+ * Once we find candidate orphan items we can first check our local
+ * inode cache for inodes that are already on their way to eviction and
+ * can be skipped.  Then we ask the server for the open map containing
+ * the inode.  Only if we don't have it cached, and no one else does, do
+ * we try and read it into our cache and evict it to trigger the final
+ * inode deletion process.
+ *
+ * Orphaned items that make it that far should be very rare.  They can
+ * only exist if all the mounts that were using an inode after it had
+ * been unlinked (or created with O_TMPFILE) didn't unmount cleanly.
+ */
+static void inode_orphan_scan_worker(struct work_struct *work)
 {
+	struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
+						 orphan_scan_dwork.work);
+	struct super_block *sb = inf->sb;
+	struct scoutfs_open_ino_map omap;
 	struct scoutfs_key last;
+	struct scoutfs_key next;
+	struct scoutfs_key key;
+	struct inode *inode;
+	u64 group_nr;
+	int bit_nr;
 	u64 ino;
-	int err = 0;
 	int ret;

 	trace_scoutfs_scan_orphans(sb);
+	scoutfs_inc_counter(sb, orphan_scan);

-	init_orphan_key(&key, sbi->rid, 0);
-	init_orphan_key(&last, sbi->rid, ~0ULL);
+	init_orphan_key(&last, U64_MAX);
+	omap.args.group_nr = cpu_to_le64(U64_MAX);

-	while (1) {
-		ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
-		if (ret == -ENOENT) /* No more orphan items */
-			break;
-		if (ret < 0)
-			goto out;
-
-		ino = le64_to_cpu(key.sko_ino);
-
-		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
-		if (ret == 0) {
-			ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
-			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
-		}
-		if (ret && ret != -ENOENT && !err)
-			err = ret;
-
-		if (le64_to_cpu(key.sko_ino) == U64_MAX) {
-			ret = -ENOENT;
-			break;
-		}
-		le64_add_cpu(&key.sko_ino, 1);
-	}
-
-	ret = 0;
-out:
-	return err ? err : ret;
-}
+	for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
+		if (inf->stopped) {
+			ret = 0;
+			goto out;
+		}
+
+		/* find the next orphan item */
+		init_orphan_key(&key, ino);
+		ret = scoutfs_forest_next_hint(sb, &key, &next);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				break;
+			goto out;
+		}
+
+		if (scoutfs_key_compare(&next, &last) > 0)
+			break;
+
+		scoutfs_inc_counter(sb, orphan_scan_item);
+		ino = le64_to_cpu(next.sko_ino);
+
+		/* locally cached inodes will already be deleted */
+		inode = scoutfs_ilookup(sb, ino);
+		if (inode) {
+			scoutfs_inc_counter(sb, orphan_scan_cached);
+			iput(inode);
+			continue;
+		}
+
+		/* get an omap that covers the orphaned ino */
+		group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
+		bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
+		if (le64_to_cpu(omap.args.group_nr) != group_nr) {
+			ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
+			if (ret < 0)
+				goto out;
+		}
+
+		/* don't need to evict if someone else has it open (cached) */
+		if (test_bit_le(bit_nr, omap.bits)) {
+			scoutfs_inc_counter(sb, orphan_scan_omap_set);
+			continue;
+		}
+
+		/* try to cache and evict an unused inode to delete, can be racing */
+		inode = scoutfs_iget(sb, ino);
+		if (IS_ERR(inode)) {
+			ret = PTR_ERR(inode);
+			if (ret == -ENOENT)
+				continue;
+			else
+				goto out;
+		}
+
+		scoutfs_inc_counter(sb, orphan_scan_read);
+		SCOUTFS_I(inode)->drop_invalidated = true;
+		iput(inode);
+	}
+
+	ret = 0;
+out:
+	if (ret < 0)
+		scoutfs_inc_counter(sb, orphan_scan_error);

-int scoutfs_orphan_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
-	struct scoutfs_key key;
-	int ret;
-
-	trace_scoutfs_orphan_inode(sb, inode);
-
-	init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));
-
-	ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
-
-	return ret;
+	schedule_orphan_dwork(inf);
 }
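Each pass reschedules itself with a uniform random delay in [10s, 50s). A userspace sketch of the same computation (msecs_to_jiffies() and prandom_u32_max() are kernel APIs; rand() is a rough stand-in):

#include <stdio.h>
#include <stdlib.h>

#define ORPHAN_SCAN_MIN_MS	(10 * 1000)
#define ORPHAN_SCAN_JITTER_MS	(40 * 1000)

int main(void)
{
	unsigned int delay_ms = ORPHAN_SCAN_MIN_MS +
				(unsigned int)(rand() % ORPHAN_SCAN_JITTER_MS);

	printf("next orphan scan in %u ms (range [10000, 50000))\n", delay_ms);
	return 0;
}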
@@ -1803,16 +1943,43 @@ int scoutfs_inode_setup(struct super_block *sb)
 	if (!inf)
 		return -ENOMEM;

+	inf->sb = sb;
 	spin_lock_init(&inf->writeback_lock);
 	inf->writeback_inodes = RB_ROOT;
 	spin_lock_init(&inf->dir_ino_alloc.lock);
 	spin_lock_init(&inf->ino_alloc.lock);
+	INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
+	spin_lock_init(&inf->deleting_items_lock);
+	INIT_LIST_HEAD(&inf->deleting_items_list);

 	sbi->inode_sb_info = inf;

 	return 0;
 }

+/*
+ * Our inode subsystem is setup pretty early but orphan scanning uses
+ * many other subsystems like networking and the server.  We only kick
+ * it off once everything is ready.
+ */
+int scoutfs_inode_start(struct super_block *sb)
+{
+	DECLARE_INODE_SB_INFO(sb, inf);
+
+	schedule_orphan_dwork(inf);
+	return 0;
+}
+
+void scoutfs_inode_stop(struct super_block *sb)
+{
+	DECLARE_INODE_SB_INFO(sb, inf);
+
+	if (inf) {
+		inf->stopped = true;
+		cancel_delayed_work_sync(&inf->orphan_scan_dwork);
+	}
+}
+
 void scoutfs_inode_destroy(struct super_block *sb)
 {
 	struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
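The new start/stop pair brackets the existing setup/destroy pair, and forest.c gains the same _stop hook. The super.c call sites aren't in this diff, so the sequence below is an assumption about the intended ordering:

/*
 * Assumed mount/unmount ordering (call sites not shown in this diff):
 *
 *   mount:   scoutfs_inode_setup()    allocate state, init work
 *            ... networking, client, server come up ...
 *            scoutfs_inode_start()    first schedule_orphan_dwork()
 *
 *   unmount: scoutfs_inode_stop()     stopped = true, cancel work sync
 *            scoutfs_forest_stop()    cancel log merge work
 *            ... remaining subsystems stop ...
 *            scoutfs_forest_destroy(), scoutfs_inode_destroy()
 */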
@@ -75,7 +75,6 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
 void scoutfs_destroy_inode(struct inode *inode);
 int scoutfs_drop_inode(struct inode *inode);
 void scoutfs_evict_inode(struct inode *inode);
-int scoutfs_orphan_inode(struct inode *inode);

 struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
@@ -89,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq);
+				      struct list_head *list, u64 seq, bool allocing);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq);
+				  bool set_data_seq, bool allocing);
 void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -120,9 +119,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		    struct kstat *stat);
 int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);

-int scoutfs_scan_orphans(struct super_block *sb);
-int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
-int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
+int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
+int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);

 void scoutfs_inode_queue_writeback(struct inode *inode);
 int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -133,6 +131,8 @@ void scoutfs_inode_exit(void);
 int scoutfs_inode_init(void);

 int scoutfs_inode_setup(struct super_block *sb);
+int scoutfs_inode_start(struct super_block *sb);
+void scoutfs_inode_stop(struct super_block *sb);
 void scoutfs_inode_destroy(struct super_block *sb);

 #endif
@@ -38,6 +38,7 @@
 #include "hash.h"
 #include "srch.h"
 #include "alloc.h"
+#include "server.h"
 #include "scoutfs_trace.h"

 /*
@@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
 	if (ret)
 		goto unlock;

@@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	sfm.rid = sbi->rid;
 	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
 	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
+	sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);

 	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
 	if (ret)
@@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more {
 	__u64 committed_seq;
 	__u64 total_meta_blocks;
 	__u64 total_data_blocks;
+	__u64 reserved_meta_blocks;
 };

 #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
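Userspace picks the new field up through the existing ioctl. A hedged sketch, assuming the usual scoutfs_ioctl.h userspace header and its leading valid_bytes size handshake (neither is shown in this hunk):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "scoutfs_ioctl.h"	/* assumed userspace copy of the ioctl ABI */

int main(int argc, char **argv)
{
	struct scoutfs_ioctl_statfs_more sfm = { .valid_bytes = sizeof(sfm) };
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, SCOUTFS_IOC_STATFS_MORE, &sfm) < 0) {
		perror("SCOUTFS_IOC_STATFS_MORE");
		return 1;
	}
	printf("reserved meta blocks: %llu\n",
	       (unsigned long long)sfm.reserved_meta_blocks);
	close(fd);
	return 0;
}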
kmod/src/item.c: 164 lines changed
@@ -95,7 +95,7 @@ struct item_cache_info {

 	/* written by page readers, read by shrink */
 	spinlock_t active_lock;
-	struct rb_root active_root;
+	struct list_head active_list;
 };

 #define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -127,6 +127,7 @@ struct cached_page {
 	unsigned long lru_time;
 	struct list_head dirty_list;
 	struct list_head dirty_head;
+	u64 max_liv_seq;
 	struct page *page;
 	unsigned int page_off;
 	unsigned int erased_bytes;
@@ -385,6 +386,14 @@ static void put_pg(struct super_block *sb, struct cached_page *pg)
 	}
 }

+static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item)
+{
+	u64 liv_seq = le64_to_cpu(item->liv.seq);
+
+	if (liv_seq > pg->max_liv_seq)
+		pg->max_liv_seq = liv_seq;
+}
+
 /*
  * Allocate space for a new item from the free offset at the end of a
  * cached page.  This isn't a blocking allocation, and it's likely that
@@ -416,6 +425,8 @@ static struct cached_item *alloc_item(struct cached_page *pg,
 	if (val_len)
 		memcpy(item->val, val, val_len);

+	update_pg_max_liv_seq(pg, item);
+
 	return item;
 }

@@ -622,6 +633,8 @@ static void mark_item_dirty(struct super_block *sb,
 		list_add_tail(&item->dirty_head, &pg->dirty_list);
 		item->dirty = 1;
 	}

+	update_pg_max_liv_seq(pg, item);
 }

 static void clear_item_dirty(struct super_block *sb,
@@ -1260,46 +1273,76 @@ static int cache_empty_page(struct super_block *sb,
 	return 0;
 }

+/*
+ * Readers operate independently from dirty items and transactions.
+ * They read a set of persistent items and insert them into the cache
+ * when there aren't already pages whose key range contains the items.
+ * This naturally prefers cached dirty items over stale read items.
+ *
+ * We have to deal with the case where dirty items are written and
+ * invalidated while a read is in flight.  The reader won't have seen
+ * the items that were dirty in their persistent roots as they started
+ * reading.  By the time they insert their read pages the previously
+ * dirty items have been reclaimed and are not in the cache.  The old
+ * stale items will be inserted in their place, effectively corrupting
+ * by having the dirty items disappear.
+ *
+ * We fix this by tracking the max seq of items in pages.  As readers
+ * start they record the current transaction seq.  Invalidation skips
+ * pages with a max seq greater than the first reader seq because the
+ * items in the page have to stick around to prevent the readers' stale
+ * items from being inserted.
+ *
+ * This naturally only affects a small set of pages with items that were
+ * written relatively recently.  If we're in memory pressure then we
+ * probably have a lot of pages and they'll naturally have items that
+ * were visible to any readers.  We don't bother with the complicated
+ * and expensive further refinement of tracking the ranges that are
+ * being read and comparing those with pages to invalidate.
+ */
 struct active_reader {
-	struct rb_node node;
-	struct scoutfs_key start;
-	struct scoutfs_key end;
+	struct list_head head;
+	u64 seq;
 };

-static struct active_reader *active_rbtree_walk(struct rb_root *root,
-						struct scoutfs_key *start,
-						struct scoutfs_key *end,
-						struct rb_node **par,
-						struct rb_node ***pnode)
+#define INIT_ACTIVE_READER(rdr) \
+	struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
+
+static void add_active_reader(struct super_block *sb, struct active_reader *active)
 {
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct active_reader *ret = NULL;
-	struct active_reader *active;
-	int cmp;
+	DECLARE_ITEM_CACHE_INFO(sb, cinf);
+
+	BUG_ON(!list_empty(&active->head));
+
+	active->seq = scoutfs_trans_sample_seq(sb);
+
+	spin_lock(&cinf->active_lock);
+	list_add_tail(&active->head, &cinf->active_list);
+	spin_unlock(&cinf->active_lock);
+}
+
+static u64 first_active_reader_seq(struct item_cache_info *cinf)
+{
+	struct active_reader *active;
+	u64 first;

-	while (*node) {
-		parent = *node;
-		active = container_of(*node, struct active_reader, node);
+	spin_lock(&cinf->active_lock);
+	active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
+	first = active ? active->seq : U64_MAX;
+	spin_unlock(&cinf->active_lock);

-		cmp = scoutfs_key_compare_ranges(start, end, &active->start,
-						 &active->end);
-		if (cmp < 0) {
-			node = &(*node)->rb_left;
-		} else if (cmp > 0) {
-			node = &(*node)->rb_right;
-		} else {
-			ret = active;
-			node = &(*node)->rb_left;
-		}
-	}
+	return first;
+}

-	if (par)
-		*par = parent;
-	if (pnode)
-		*pnode = node;
-
-	return ret;
-}
+static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
+{
+	/* only the calling task adds or deletes this active */
+	if (!list_empty(&active->head)) {
+		spin_lock(&cinf->active_lock);
+		list_del_init(&active->head);
+		spin_unlock(&cinf->active_lock);
+	}
+}
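The predicate the shrinker applies is "keep the page if first_reader_seq <= pg->max_liv_seq". A self-contained check of the edge cases, including the U64_MAX "no readers" sentinel returned above:

#include <stdint.h>
#include <stdio.h>

static int must_keep(uint64_t first_reader_seq, uint64_t pg_max_liv_seq)
{
	return first_reader_seq <= pg_max_liv_seq;
}

int main(void)
{
	printf("%d\n", must_keep(UINT64_MAX, 100)); /* 0: no readers, reclaimable */
	printf("%d\n", must_keep(50, 70));          /* 1: items newer than reader */
	printf("%d\n", must_keep(50, 10));          /* 0: reader already saw these */
	return 0;
}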
@@ -1399,22 +1442,15 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
  * locks held, but without locking the cache.  The regions we read can
  * be stale with respect to the current cache, which can be read and
  * dirtied by other cluster lock holders on our node, but the cluster
- * locks protect the stable items we read.
- *
- * There's also the exciting case where a reader can populate the cache
- * with stale old persistent data which was read before another local
- * cluster lock holder was able to read, dirty, write, and then shrink
- * the cache.  In this case the cache couldn't be cleared by lock
- * invalidation because the caller is actively holding the lock.  But
- * shrinking could evict the cache within the held lock.  So we record
- * that we're an active reader in the range covered by the lock and
- * shrink will refuse to reclaim any pages that intersect with our read.
+ * locks protect the stable items we read.  Invalidation is careful not
+ * to drop pages that have items that we couldn't see because they were
+ * dirty when we started reading.
  */
 static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 		      struct scoutfs_key *key, struct scoutfs_lock *lock)
 {
 	struct rb_root root = RB_ROOT;
-	struct active_reader active;
+	INIT_ACTIVE_READER(active);
 	struct cached_page *right = NULL;
 	struct cached_page *pg;
 	struct cached_page *rd;
@@ -1430,15 +1466,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	int pgi;
 	int ret;

-	/* stop shrink from freeing new clean data, would let us cache stale */
-	active.start = lock->start;
-	active.end = lock->end;
-	spin_lock(&cinf->active_lock);
-	active_rbtree_walk(&cinf->active_root, &active.start, &active.end,
-			   &par, &pnode);
-	rbtree_insert(&active.node, par, pnode, &cinf->active_root);
-	spin_unlock(&cinf->active_lock);
-
 	/* start with an empty page that covers the whole lock */
 	pg = alloc_pg(sb, 0);
 	if (!pg) {
@@ -1449,6 +1476,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	pg->end = lock->end;
 	rbtree_insert(&pg->node, NULL, &root.rb_node, &root);

+	/* set active reader seq before reading persistent roots */
+	add_active_reader(sb, &active);
+
 	ret = scoutfs_forest_read_items(sb, lock, key, &start, &end,
 					read_page_item, &root);
 	if (ret < 0)
@@ -1526,9 +1556,7 @@ retry:

 	ret = 0;
 out:
-	spin_lock(&cinf->active_lock);
-	rbtree_erase(&active.node, &cinf->active_root);
-	spin_unlock(&cinf->active_lock);
+	del_active_reader(cinf, &active);

 	/* free any pages we left dangling on error */
 	for_each_page_safe(&root, rd, pg_tmp) {
@@ -1830,8 +1858,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
 	if (!item || item->deletion) {
 		ret = -ENOENT;
 	} else {
-		mark_item_dirty(sb, cinf, pg, NULL, item);
 		item->liv.seq = item_seq(sb, lock);
+		mark_item_dirty(sb, cinf, pg, NULL, item);
 		ret = 0;
 	}
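The one-line swap in scoutfs_item_dirty() above is load-bearing: mark_item_dirty() now ends by calling update_pg_max_liv_seq(), which samples item->liv.seq, so the new seq has to be assigned first or the page's max_liv_seq could miss it and the shrinker could reclaim a page an in-flight reader still depends on. The required order:

	item->liv.seq = item_seq(sb, lock);		/* set the seq first... */
	mark_item_dirty(sb, cinf, pg, NULL, item);	/* ...so max_liv_seq sees it */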
@@ -2406,9 +2434,9 @@ retry:

 /*
  * Shrink the size of the item cache.  We're operating against the fast
- * path lock ordering and we skip pages if we can't acquire locks.
- * Similarly, we can run into dirty pages or pages which intersect with
- * active readers that we can't shrink and also choose to skip.
+ * path lock ordering and we skip pages if we can't acquire locks.  We
+ * can run into dirty pages or pages with items that weren't visible to
+ * the earliest active reader which must be skipped.
  */
 static int item_lru_shrink(struct shrinker *shrink,
 			   struct shrink_control *sc)
@@ -2417,26 +2445,24 @@ static int item_lru_shrink(struct shrinker *shrink,
 					struct item_cache_info,
 					shrinker);
 	struct super_block *sb = cinf->sb;
-	struct active_reader *active;
 	struct cached_page *tmp;
 	struct cached_page *pg;
+	u64 first_reader_seq;
 	int nr;

 	if (sc->nr_to_scan == 0)
 		goto out;
 	nr = sc->nr_to_scan;

+	/* can't invalidate pages with items that weren't visible to first reader */
+	first_reader_seq = first_active_reader_seq(cinf);
+
 	write_lock(&cinf->rwlock);
 	spin_lock(&cinf->lru_lock);

 	list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {

-		/* can't invalidate ranges being read, reader might be stale */
-		spin_lock(&cinf->active_lock);
-		active = active_rbtree_walk(&cinf->active_root, &pg->start,
-					    &pg->end, NULL, NULL);
-		spin_unlock(&cinf->active_lock);
-		if (active) {
+		if (first_reader_seq <= pg->max_liv_seq) {
 			scoutfs_inc_counter(sb, item_shrink_page_reader);
 			continue;
 		}
@@ -2505,7 +2531,7 @@ int scoutfs_item_setup(struct super_block *sb)
 	spin_lock_init(&cinf->lru_lock);
 	INIT_LIST_HEAD(&cinf->lru_list);
 	spin_lock_init(&cinf->active_lock);
-	cinf->active_root = RB_ROOT;
+	INIT_LIST_HEAD(&cinf->active_list);

 	cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
 	if (!cinf->pcpu_pages)
@@ -2536,7 +2562,7 @@ void scoutfs_item_destroy(struct super_block *sb)
 	int cpu;

 	if (cinf) {
-		BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root));
+		BUG_ON(!list_empty(&cinf->active_list));

 		unregister_hotcpu_notifier(&cinf->notifier);
 		unregister_shrinker(&cinf->shrinker);
|
||||
}
|
||||
|
||||
/*
|
||||
* The rid lock protects a mount's private persistent items in the rid
|
||||
* zone. It's held for the duration of the mount. It lets the mount
|
||||
* modify the rid items at will and signals to other mounts that we're
|
||||
* still alive and our rid items shouldn't be reclaimed.
|
||||
* Orphan items are stored in their own zone which are modified with
|
||||
* shared write_only locks and are read inconsistently without locks by
|
||||
* background scanning work.
|
||||
*
|
||||
* Being held for the entire mount prevents other nodes from reclaiming
|
||||
* our items, like free blocks, when it would make sense for them to be
|
||||
* able to. Maybe we have a bunch free and they're trying to allocate
|
||||
* and are getting ENOSPC.
|
||||
* Since we only use write_only locks we just lock the entire zone, but
|
||||
* the api provides the inode in case we ever change the locking scheme.
|
||||
*/
|
||||
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
|
||||
u64 rid, struct scoutfs_lock **lock)
|
||||
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
|
||||
struct scoutfs_lock **lock)
|
||||
{
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
|
||||
scoutfs_key_set_zeros(&start);
|
||||
start.sk_zone = SCOUTFS_RID_ZONE;
|
||||
start.sko_rid = cpu_to_le64(rid);
|
||||
start.sk_zone = SCOUTFS_ORPHAN_ZONE;
|
||||
start.sko_ino = 0;
|
||||
start.sk_type = SCOUTFS_ORPHAN_TYPE;
|
||||
|
||||
scoutfs_key_set_ones(&end);
|
||||
end.sk_zone = SCOUTFS_RID_ZONE;
|
||||
end.sko_rid = cpu_to_le64(rid);
|
||||
scoutfs_key_set_zeros(&end);
|
||||
end.sk_zone = SCOUTFS_ORPHAN_ZONE;
|
||||
end.sko_ino = cpu_to_le64(U64_MAX);
|
||||
end.sk_type = SCOUTFS_ORPHAN_TYPE;
|
||||
|
||||
return lock_key_range(sb, mode, flags, &start, &end, lock);
|
||||
}
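
For orientation only, a minimal sketch of how a caller might wrap an
orphan item update in this whole-zone lock; insert_orphan_item() is a
hypothetical stand-in, not a function from this change:

/* sketch only: insert_orphan_item() is a hypothetical stand-in */
static int orphan_update_sketch(struct super_block *sb, u64 ino)
{
	struct scoutfs_lock *lock = NULL;
	int ret;

	/* write_only locks are shared, mounts can update orphan items concurrently */
	ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &lock);
	if (ret < 0)
		return ret;

	ret = insert_orphan_item(sb, ino, lock);

	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE_ONLY);
	return ret;
}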

@@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
struct inode *d, struct scoutfs_lock **D_lock);
int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock);
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 ino, struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);


@@ -595,10 +595,6 @@ out:
free_req(req);
}

/* it's fine if we couldn't send to a client that left */
if (ret == -ENOTCONN)
ret = 0;

return ret;
}

@@ -908,9 +904,9 @@ out:
}

/*
 * Return 1 and give the caller a write inode lock if it is safe to be
 * deleted. It's safe to be deleted when it is no longer reachable and
 * nothing is referencing it.
 * Return 1 and give the caller their locks when they should delete the
 * inode items. It's safe to delete the inode items when it is no
 * longer reachable and nothing is referencing it.
 *
 * The inode is unreachable when nlink hits zero. Cluster locks protect
 * modification and testing of nlink. We use the ino_lock_cov coverage
@@ -925,15 +921,17 @@ out:
 * increase nlink from zero and let people get a reference to the inode.
 */
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret)
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_omap_lock_data *ldata;
u64 group_nr;
int bit_nr;
int ret;
int err;

/* lock group and omap constants are defined independently */
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -964,12 +962,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
out:
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);

if (ret > 0) {
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
if (err < 0)
ret = err;
}

if (ret <= 0) {
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
lock = NULL;
}

*lock_ret = lock;
*orph_lock_ret = orph_lock;
return ret;
}
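
To make the contract concrete, a hedged sketch of an eviction-path
caller: on ret > 0 both locks are held and the inode items should be
deleted; delete_inode_items() is a hypothetical stand-in name:

/* sketch only: delete_inode_items() is a hypothetical stand-in */
static void evict_sketch(struct super_block *sb, struct inode *inode)
{
	struct scoutfs_lock *lock;
	struct scoutfs_lock *orph_lock;
	int ret;

	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
	if (ret > 0) {
		/* both locks are held, delete the items and drop them */
		delete_inode_items(sb, scoutfs_ino(inode), lock, orph_lock);
		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
	}
	/* on ret <= 0 no locks are returned, so there's nothing to unlock */
}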


@@ -4,7 +4,7 @@
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret);
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map_args *args);

@@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func,
);

DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),

TP_ARGS(sb, journal_info, holders),
TP_ARGS(sb, journal_info, holders, ret),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(unsigned long, journal_info)
__field(int, holders)
__field(int, ret)
),

TP_fast_assign(
@@ -440,17 +441,17 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
__entry->holders = holders;
),

TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
);

DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);

TRACE_EVENT(scoutfs_ioc_release,
@@ -985,22 +986,6 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);

TRACE_EVENT(scoutfs_scan_orphans,
TP_PROTO(struct super_block *sb),

TP_ARGS(sb),

TP_STRUCT__entry(
__field(dev_t, dev)
),

TP_fast_assign(
__entry->dev = sb->s_dev;
),

TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);

DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),

@@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
struct commit_waiter *cw;
struct commit_waiter *pos;
struct llist_node *node;
u64 reserved;
int ret;

trace_scoutfs_server_commit_work_enter(sb, 0, 0);
@@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work)
server->other_avail = &super->server_meta_avail[server->other_ind];
server->other_freed = &super->server_meta_freed[server->other_ind];

/* swap avail/free if avail gets low and freed is high */
if (le64_to_cpu(server->meta_avail->total_len) <=
SCOUTFS_SERVER_META_ALLOC_MIN &&
le64_to_cpu(server->meta_freed->total_len) >
SCOUTFS_SERVER_META_ALLOC_MIN)
/*
 * The reserved metadata blocks include the max size of
 * outstanding allocators and a server transaction could be
 * asked to refill all those allocators from meta_avail. If our
 * meta_avail falls below the reserved count, and freed is still
 * above it, then swap so that we don't start returning enospc
 * until we're truly low.
 */
reserved = scoutfs_server_reserved_meta_blocks(sb);
if (le64_to_cpu(server->meta_avail->total_len) <= reserved &&
le64_to_cpu(server->meta_freed->total_len) > reserved)
swap(server->meta_avail, server->meta_freed);

ret = 0;
@@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb,
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
}

/*
 * Copy on write transactions need to allocate new dirty blocks as they
 * make modifications to delete items and eventually free more blocks.
 * The reserved blocks are meant to keep enough available blocks in
 * flight to allow servers and clients to perform transactions that
 * don't consume additional space. We have quite a few allocators in
 * flight across the server and various client mechanisms (posix items,
 * srch compaction, and log merging). We also want to include
 * sufficient blocks for client log btrees to grow tall enough to be
 * finalized and merged.
 *
 * The reserved blocks calculation is a policy of the server but it's
 * exposed to the statfs_more interface so that df isn't misleading.
 * Requiring this synchronization without explicit protocol
 * communication isn't great.
 */
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;

/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;

/*
 * Log trees will be compacted once they hit a height of 3.
 * That'll be the grandparent, two parents resulting from a
 * split, and all their child blocks (roughly calculated,
 * overestimating).
 */
log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE /
(sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref)));

/*
 * Each client can have a meta_avail list, srch compaction
 * request, log merge request, and a log btree it's building.
 */
client_blocks = SCOUTFS_SERVER_META_FILL_TARGET + SCOUTFS_SERVER_META_FILL_TARGET +
SCOUTFS_SERVER_MERGE_FILL_TARGET + log_blocks;

/* we should reserve for voting majority, too */
spin_lock(&server->lock);
nr_clients = server->nr_clients;
spin_unlock(&server->lock);

return server_blocks + (max(1ULL, nr_clients) * client_blocks);
}
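
Reading the function as one expression, the reservation reduces to the
closed form below (a restatement of the code above, not new policy):

/*
 * reserved = 2 * SCOUTFS_SERVER_META_FILL_TARGET +
 *            max(1, nr_clients) * (2 * SCOUTFS_SERVER_META_FILL_TARGET +
 *                                  SCOUTFS_SERVER_MERGE_FILL_TARGET +
 *                                  log_blocks)
 *
 * It grows linearly with the number of connected clients, with a floor
 * of one client's worth of allocators.
 */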

/*
 * Set all the bits in the destination which overlap with the extent.
 */
@@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb,
struct scoutfs_log_trees lt;
struct scoutfs_key key;
bool have_fin = false;
bool unlock_alloc = false;
u64 data_zone_blocks;
u64 nr;
int ret;
@@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb,
lt.nr = cpu_to_le64(nr);
}

/* finalize an existing root when large enough and don't have one */
if (lt.item_root.height > 2 && !have_fin) {
/*
 * Finalize the client log btree when it has enough leaf blocks
 * to allow some degree of merging concurrency. Smaller btrees
 * are also finalized when the meta allocator is low so that
 * deleted items are merged promptly and freed blocks can bring
 * the client out of enospc.
 */
if (!have_fin && ((lt.item_root.height > 2) ||
(le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) {
fin = lt;
memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
@@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb,
data_zone_blocks = 0;
}

/* return freed to server for emptying, refill avail */
/*
 * Reclaim the freed meta and data allocators and refill the
 * avail allocators, setting low flags if they drop too low.
 */
mutex_lock(&server->alloc_mutex);
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
server->other_freed,
unlock_alloc = true;

ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
&lt.meta_freed) ?:
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed) ?:
scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET) ?:
alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO,
SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
mutex_unlock(&server->alloc_mutex);
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
if (ret < 0)
goto unlock;

ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET);
if (ret < 0)
goto unlock;

if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);

ret = alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
if (ret < 0)
goto unlock;

if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);

mutex_unlock(&server->alloc_mutex);
unlock_alloc = false;

/* record data alloc zone bits */
zero_data_alloc_zone_bits(&lt);
if (data_zone_blocks != 0) {
@@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb,
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
unlock:
if (unlock_alloc)
mutex_unlock(&server->alloc_mutex);
mutex_unlock(&server->logs_mutex);

ret = scoutfs_server_apply_commit(sb, ret);
@@ -2277,15 +2366,27 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
open_ino_map_response, NULL, NULL);
}

/* The server is sending an omap response to the client */
/*
 * The server is sending an omap response to the client that originated
 * the request. These responses are sent long after the incoming
 * request has pinned the client connection and guaranteed that we'll be
 * able to queue a response. This can race with the client connection
 * being torn down and it's OK if we drop the response. Either the
 * client is being evicted and we don't care about them anymore or we're
 * tearing down in unmount and the client will resend to the next
 * server.
 */
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_open_ino_map *map, int err)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
int ret;

return scoutfs_net_response_node(sb, server->conn, rid,
SCOUTFS_NET_CMD_OPEN_INO_MAP, id, err,
map, sizeof(*map));
ret = scoutfs_net_response_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
id, err, map, sizeof(*map));
if (ret == -ENOTCONN)
ret = 0;
return ret;
}

/* The server is receiving an omap request from the client */

@@ -56,6 +56,8 @@ do { \
__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
__entry->name##_error

u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);

int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
@@ -75,8 +77,6 @@ u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);

struct sockaddr_in;
struct scoutfs_quorum_elected_info;
int scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_stop(struct super_block *sb);

@@ -247,11 +247,10 @@ static void scoutfs_put_super(struct super_block *sb)

trace_scoutfs_put_super(sb);

scoutfs_inode_stop(sb);
scoutfs_forest_stop(sb);
scoutfs_srch_destroy(sb);

scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
sbi->rid_lock = NULL;

scoutfs_lock_shutdown(sb);

scoutfs_shutdown_trans(sb);
@@ -623,10 +622,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
&sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:
scoutfs_srch_setup(sb);
scoutfs_srch_setup(sb) ?:
scoutfs_inode_start(sb);
if (ret)
goto out;

@@ -647,7 +645,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;

scoutfs_trans_restart_sync_deadline(sb);
// scoutfs_scan_orphans(sb);
ret = 0;
out:
/* on error, generic_shutdown_super calls put_super if s_root */

@@ -36,7 +36,6 @@ struct scoutfs_sb_info {

/* assigned once at the start of each mount, read-only */
u64 rid;
struct scoutfs_lock *rid_lock;

struct scoutfs_super_block super;


@@ -436,8 +436,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return true;
}

/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
/* if we're low and can't refill then alloc could empty and return enospc */
if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
return true;
}
@@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return false;
}

static bool acquired_hold(struct super_block *sb)
/*
 * Called as a wait_event condition; it must be careful not to change
 * task state and is racing with waking paths that sub_return, test, and
 * wake.
 */
static bool holders_no_writer(struct trans_info *tri)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;

/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}

/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}

/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}

trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
return acquired;
smp_mb(); /* make sure task in wait_event queue before atomic read */
return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
}

/*
@@ -492,15 +469,64 @@ out:
 * The writing thread marks itself as a global trans_task which
 * short-circuits all the hold machinery so it can call code that would
 * otherwise try to hold transactions while it is writing.
 *
 * If the caller is adding metadata items that will eventually consume
 * free space -- not dirtying existing items or adding deletion items --
 * then we can return enospc if our metadata allocator indicates that
 * we're low on space.
 */
int scoutfs_hold_trans(struct super_block *sb)
int scoutfs_hold_trans(struct super_block *sb, bool allocing)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
u64 seq;
int ret;

if (current == sbi->trans_task)
return 0;

return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
for (;;) {
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
ret = 0;
break;
}

/* wait until the writer work is finished */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
if (ret < 0)
break;
continue;
}

/* return enospc if server is into reserved blocks and we're allocating */
if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
release_holders(sb);
ret = -ENOSPC;
break;
}

/* see if we need to trigger and wait for a commit before holding */
if (commit_before_hold(sb, tri)) {
seq = scoutfs_trans_sample_seq(sb);
release_holders(sb);
queue_trans_work(sbi);
ret = wait_event_interruptible(sbi->trans_hold_wq,
scoutfs_trans_sample_seq(sb) != seq);
if (ret < 0)
break;
continue;
}

ret = 0;
break;
}

trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
return ret;
}
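
For illustration, a hedged sketch of a caller that creates new items
and so passes allocing = true, seeing -ENOSPC once the server has set
the low flag; create_new_items() is a hypothetical stand-in:

/* sketch only: create_new_items() is a hypothetical stand-in */
static int create_sketch(struct super_block *sb)
{
	int ret;

	/* can fail with -ENOSPC while low, or -ERESTARTSYS if interrupted */
	ret = scoutfs_hold_trans(sb, true);
	if (ret < 0)
		return ret;

	ret = create_new_items(sb);

	scoutfs_release_trans(sb);
	return ret;
}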

/*
@@ -525,7 +551,7 @@ void scoutfs_release_trans(struct super_block *sb)

release_holders(sb);

trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
}

/*

@@ -1,18 +1,13 @@
#ifndef _SCOUTFS_TRANS_H_
#define _SCOUTFS_TRANS_H_

/* the server will attempt to fill data allocs for each trans */
#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)

void scoutfs_trans_write_func(struct work_struct *work);
int scoutfs_trans_sync(struct super_block *sb, int wait);
int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

int scoutfs_hold_trans(struct super_block *sb);
int scoutfs_hold_trans(struct super_block *sb, bool allocing);
bool scoutfs_trans_held(void);
void scoutfs_release_trans(struct super_block *sb);
u64 scoutfs_trans_sample_seq(struct super_block *sb);

@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
&tgs) != 0)
memset(&tgs, 0, sizeof(tgs));

ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, false);
if (ret < 0)
break;
release = true;

tests/.gitignore (vendored)
@@ -5,3 +5,4 @@ src/handle_cat
src/bulk_create_paths
src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop

@@ -7,7 +7,8 @@ BIN := src/createmany \
src/handle_cat \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs
src/find_xattrs \
src/create_xattr_loop

DEPS := $(wildcard src/*.d)


@@ -71,6 +71,7 @@ t_filter_dmesg()
re="$re|scoutfs .* quorum .* error"
re="$re|scoutfs .* error reading quorum block"
re="$re|scoutfs .* error .* writing quorum block"
re="$re|scoutfs .* error .* while checking to delete inode"

egrep -v "($re)"
}

tests/golden/enospc (new file)
@@ -0,0 +1,8 @@
== prepare directories and files
== fallocate until enospc
== remove all the files and verify free data blocks
== make small meta fs
== create large xattrs until we fill up metadata
== remove files with xattrs after enospc
== make sure we can create again
== cleanup small meta fs
tests/golden/orphan-inodes (new file)
@@ -0,0 +1,4 @@
== test our inode existence function
== unlinked and opened inodes still exist
== orphan from failed evict deletion is picked up
== orphaned inos in all mounts all deleted
@@ -7,6 +7,7 @@ simple-release-extents.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
lock-refleak.sh
@@ -29,6 +30,7 @@ cross-mount-data-free.sh
persistent-item-vers.sh
setup-error-teardown.sh
fence-and-reclaim.sh
orphan-inodes.sh
mount-unmount-race.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh

tests/src/create_xattr_loop.c (new file)
@@ -0,0 +1,113 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>

static void exit_usage(void)
{
	printf(" -h/-? output this usage message and exit\n"
	       " -c <count> number of xattrs to create\n"
	       " -n <string> xattr name prefix, -NR is appended\n"
	       " -p <path> string with path to file with xattrs\n"
	       " -s <size> xattr value size\n");
	exit(1);
}

int main(int argc, char **argv)
{
	char *pref = NULL;
	char *path = NULL;
	char *val;
	char *name;
	unsigned long long count = 0;
	unsigned long long size = 0;
	unsigned long long i;
	int ret;
	int c;

	while ((c = getopt(argc, argv, "+c:n:p:s:")) != -1) {

		switch (c) {
		case 'c':
			count = strtoull(optarg, NULL, 0);
			break;
		case 'n':
			pref = strdup(optarg);
			break;
		case 'p':
			path = strdup(optarg);
			break;
		case 's':
			size = strtoull(optarg, NULL, 0);
			break;
		case '?':
			printf("unknown argument: %c\n", optopt);
			/* fall through */
		case 'h':
			exit_usage();
		}
	}

	if (count == 0) {
		printf("specify count of xattrs to create with -c\n");
		exit(1);
	}

	if (count == ULLONG_MAX) {
		printf("invalid -c count\n");
		exit(1);
	}

	if (size == 0) {
		printf("specify xattr value size with -s\n");
		exit(1);
	}

	if (size == ULLONG_MAX || size < 2) {
		printf("invalid -s size\n");
		exit(1);
	}

	if (path == NULL) {
		printf("specify path to file with -p\n");
		exit(1);
	}

	if (pref == NULL) {
		printf("specify xattr name prefix string with -n\n");
		exit(1);
	}

	ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1;
	name = malloc(ret);
	if (!name) {
		printf("couldn't allocate xattr name buffer\n");
		exit(1);
	}

	val = malloc(size);
	if (!val) {
		printf("couldn't allocate xattr value buffer\n");
		exit(1);
	}

	memset(val, 'a', size - 1);
	val[size - 1] = '\0';

	for (i = 0; i < count; i++) {
		sprintf(name, "%s-%llu", pref, i);

		ret = setxattr(path, name, val, size, 0);
		if (ret) {
			printf("returned %d errno %d (%s)\n",
			       ret, errno, strerror(errno));
			return 1;
		}
	}

	return 0;
}
tests/tests/enospc.sh (new file)
@@ -0,0 +1,100 @@
#
# test hitting enospc by filling with data or metadata and
# then recovering by removing what we filled.
#

# Type Size Total Used Free Use%
#MetaData 64KB 1048576 32782 1015794 3
# Data 4KB 16777152 0 16777152 0
free_blocks() {
	local md="$1"
	local mnt="$2"
	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
}

t_require_commands scoutfs stat fallocate createmany

echo "== prepare directories and files"
for n in $(t_fs_nrs); do
	eval path="\$T_D${n}/dir-$n/file-$n"
	mkdir -p $(dirname $path)
	touch $path
done
sync

echo "== fallocate until enospc"
before=$(free_blocks Data "$T_M0")
finished=0
while [ $finished != 1 ]; do
	for n in $(t_fs_nrs); do
		eval path="\$T_D${n}/dir-$n/file-$n"
		off=$(stat -c "%s" "$path")

		LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
		err="$?"

		if grep -qi "no space" $T_TMP.fallocate; then
			finished=1
			break
		fi
		if [ "$err" != "0" ]; then
			t_fail "fallocate failed with $err"
		fi
	done
done

echo "== remove all the files and verify free data blocks"
for n in $(t_fs_nrs); do
	eval dir="\$T_D${n}/dir-$n"
	rm -rf "$dir"
done
sync
after=$(free_blocks Data "$T_M0")
# nothing else should be modifying data blocks
test "$before" == "$after" || \
	t_fail "$after free data blocks after rm, expected $before"

# XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
	t_fail "mkfs failed"
SCR="/mnt/scoutfs.enospc"
mkdir -p "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
	"$T_EX_DATA_DEV" "$SCR"

echo "== create large xattrs until we fill up metadata"
mkdir -p "$SCR/xattrs"

for f in $(seq 1 100000); do
	file="$SCR/xattrs/file-$f"
	touch "$file"

	LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
	err="$?"

	if grep -qi "no space" $T_TMP.cxl; then
		echo "enospc at f $f" >> $T_TMP.cxl
		break
	fi
	if [ "$err" != "0" ]; then
		t_fail "create_xattr_loop failed with $err"
	fi
done

echo "== remove files with xattrs after enospc"
rm -rf "$SCR/xattrs"

echo "== make sure we can create again"
file="$SCR/file-after"
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync
rm -f "$file"

echo "== cleanup small meta fs"
umount "$SCR"
rmdir "$SCR"

t_pass
tests/tests/orphan-inodes.sh (new file)
@@ -0,0 +1,77 @@
#
# make sure we clean up orphaned inodes
#

t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2

#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
	exec {ERR}>&2 2>/dev/null
	kill "$@"
	wait "$@"
	exec 2>&$ERR {ERR}>&-
}

#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
# This would have to change to some other method.
#
inode_exists()
{
	local ino="$1"

	handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
}

echo "== test our inode existence function"
path="$T_D0/file"
touch "$path"
ino=$(stat -c "%i" "$path")
inode_exists $ino || echo "$ino didn't exist"

echo "== unlinked and opened inodes still exist"
sleep 1000000 < "$path" &
pid="$!"
rm -f "$path"
inode_exists $ino || echo "$ino didn't exist"

echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
silent_kill $pid
sleep 55
inode_exists $ino && echo "$ino still exists"

echo "== orphaned inos in all mounts all deleted"
pids=""
inos=""
for nr in $(t_fs_nrs); do
	eval path="\$T_D${nr}/file-$nr"
	touch "$path"
	inos="$inos $(stat -c %i $path)"
	sleep 1000000 < "$path" &
	pids="$pids $!"
	rm -f "$path"
done
sync
silent_kill $pids
for nr in $(t_fs_nrs); do
	t_force_umount $nr
done
t_mount_all
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
	sleep .5
done
# wait for orphan scans to run
sleep 55
for ino in $inos; do
	inode_exists $ino && echo "$ino still exists"
done

t_pass
@@ -32,10 +32,18 @@ A path within a ScoutFS filesystem.
.PD

.TP
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]"
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]"
.sp
Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
separate block devices for its metadata and data storage, two are required.
The internal structures and nature of metadata and data transactions
lead to minimum viable device sizes.
.B mkfs
will check both devices and fail with an error if either is under the
minimum size. If
.B --allow-small-size
is given then sizes under the minimum size will be
allowed after printing an informational warning.
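.sp
For example, the following invocation (device paths are placeholders;
the option values mirror this change's test script) creates a
deliberately small test filesystem:
.sp
.nf
# scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G /dev/meta_dev /dev/data_dev
.fi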
.sp
If
.B --force
@@ -81,6 +89,10 @@ kibibytes, mebibytes, etc.
.B "-d, --max-data-size SIZE"
Same as previous, but for limiting the size of the data device.
.TP
.B "-A, --allow-small-size"
Allows use of specified device sizes less than the minimum. This can
result in bad behaviour and is only intended for testing.
.TP
.B "-z, --data-alloc-zone-blocks BLOCKS"
Set the data_alloc_zone_blocks volume option, as described in
.BR scoutfs (5).

@@ -6,12 +6,13 @@
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <errno.h>
#include <stdbool.h>

#include "sparse.h"
#include "dev.h"

int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret)
{
struct stat st;
@@ -63,10 +64,13 @@ int device_size(char *path, int fd,
if (size < min_size) {
fprintf(stderr,
BASE_SIZE_FMT" %s too small for min "
BASE_SIZE_FMT" %s device\n",
BASE_SIZE_FMT" %s device%s\n",
BASE_SIZE_ARGS(size), target_type,
BASE_SIZE_ARGS(min_size), use_type);
return -EINVAL;
BASE_SIZE_ARGS(min_size), use_type,
allow_small_size ? ", allowing with -A" : "");

if (!allow_small_size)
return -EINVAL;
}

*size_ret = size;

@@ -1,6 +1,8 @@
#ifndef _DEV_H_
#define _DEV_H_

#include <stdbool.h>

#define BASE_SIZE_FMT "%.2f%s"
#define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)

@@ -8,7 +10,7 @@
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)

int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret);
float size_flt(u64 nr, unsigned size);
char *size_str(u64 nr, unsigned size);

@@ -86,6 +86,11 @@ static int do_df(struct df_args *args)
data_free += ade[i].blocks;
}

if (meta_free >= sfm.reserved_meta_blocks)
meta_free -= sfm.reserved_meta_blocks;
else
meta_free = 0;

snprintf(cells[0][0], CHARS, "Type");
snprintf(cells[0][1], CHARS, "Size");
snprintf(cells[0][2], CHARS, "Total");

@@ -135,6 +135,7 @@ struct mkfs_args {
unsigned long long max_data_size;
u64 data_alloc_zone_blocks;
bool force;
bool allow_small_size;
int nr_slots;
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
@@ -215,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args)
goto out;
}

ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
args->max_meta_size, "meta", &meta_size);
/* minimum meta device size to make reserved blocks reasonably large */
ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
args->max_meta_size, args->allow_small_size, "meta", &meta_size);
if (ret)
goto out;

ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
args->max_data_size, "data", &data_size);
/* .. then arbitrarily the same minimum data device size */
ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
args->max_data_size, args->allow_small_size, "data", &data_size);
if (ret)
goto out;

@@ -520,6 +523,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
prev_val, args->max_data_size);
break;
}
case 'A':
args->allow_small_size = true;
break;
case 'z': /* data-alloc-zone-blocks */
{
ret = parse_u64(arg, &args->data_alloc_zone_blocks);
@@ -559,6 +565,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
{ "allow-small-size", 'A', NULL, 0, "Allow specified meta/data devices less than minimum, still warns"},
{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mount writes to a zone (4KB blocks)"},

@@ -158,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
return print_inode_index;

if (zone == SCOUTFS_RID_ZONE) {
if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE)
return print_orphan;
}
@@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)

#define AL_HEAD_F \
AL_REF_F" total_nr %llu first_nr %u"
AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
#define AL_HEAD_A(p) \
AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
le32_to_cpu((p)->first_nr)
le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)

#define ALCROOT_F \
BTROOT_F" total_len %llu"
BTROOT_F" total_len %llu flags 0x%x"
#define ALCROOT_A(ar) \
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)

#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_A(sre) \
