Merge pull request #46 from versity/zab/orphan_deletion_and_enospc

Zab/orphan deletion and enospc
Zach Brown
2021-07-08 10:52:53 -07:00
committed by GitHub
41 changed files with 1097 additions and 374 deletions

View File

@@ -676,6 +676,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
*
* Unlike meta allocations, the caller is expected to serialize
* allocations from the root.
*
* ENOBUFS is returned if the data allocator ran out of space and we can
* probably refill it from the server. The caller is expected to back
* out, commit the transaction, and try again.
*
* ENOSPC is returned if the data allocator ran out of space but we have
* a flag from the server telling us that there's no more space
* available. This is a hard error and should be returned.
*/
int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
@@ -724,13 +732,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
ret = 0;
out:
if (ret < 0) {
/*
* Special retval meaning there wasn't space to alloc from
* this txn. Doesn't mean filesystem is completely full.
* Maybe upper layers want to try again.
*/
if (ret == -ENOENT)
ret = -ENOBUFS;
if (ret == -ENOENT) {
if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
ret = -ENOSPC;
else
ret = -ENOBUFS;
}
*blkno_ret = 0;
*count_ret = 0;
} else {
@@ -1261,6 +1269,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
return lo;
}
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag)
{
unsigned int seq;
bool set;
do {
seq = read_seqbegin(&alloc->seqlock);
set = !!(le32_to_cpu(alloc->avail.flags) & flag);
} while (read_seqretry(&alloc->seqlock, seq));
return set;
}
/*
* Call the caller's callback for every persistent allocator structure
* we can find.
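A minimal caller-side sketch of the ENOBUFS/ENOSPC contract described above; this is not actual scoutfs code, the scoutfs_alloc_data() argument list is abbreviated from the hunks shown here, the surrounding declarations are assumed, and commit_and_retry_trans() is a hypothetical stand-in for backing out and committing the transaction:

	for (;;) {
		ret = scoutfs_alloc_data(sb, alloc, wri, dalloc, count,
					 &blkno, &count_ret);
		if (ret != -ENOBUFS)
			break;		/* 0 on success, -ENOSPC is a hard error */

		/* allocator empty but refillable: back out, commit, retry */
		ret = commit_and_retry_trans(sb);	/* hypothetical helper */
		if (ret < 0)
			break;
	}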

View File

@@ -38,6 +38,10 @@
#define SCOUTFS_ALLOC_DATA_LG_THRESH \
(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
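/* with 4 KiB small blocks (assumed here) this is 65,536 small-block units, i.e. 256 MiB of data extents */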
/*
* Fill client alloc roots to the target when they fall below the lo
* threshold.
@@ -55,6 +59,7 @@
#define SCOUTFS_SERVER_DATA_FILL_LO \
(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
/*
* Log merge meta allocations are only used for one request and will
* never use more than the dirty limit.
@@ -65,16 +70,6 @@
((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
#define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET
/*
* Each of the server meta_alloc roots will try to keep a minimum amount
* of free blocks. The server will swap roots when its current avail
* falls below the threshold while the freed root is still above it. It
* must have room for all the largest allocation attempted in a
* transaction on the server.
*/
#define SCOUTFS_SERVER_META_ALLOC_MIN \
(SCOUTFS_SERVER_META_FILL_TARGET * 2)
/*
* A run-time use of a pair of persistent avail/freed roots as a
* metadata allocator. It has the machinery needed to lock and avoid
@@ -157,6 +152,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr);
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag);
typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
int owner, u64 id,

View File

@@ -88,6 +88,7 @@
EXPAND_COUNTER(forest_read_items) \
EXPAND_COUNTER(forest_roots_next_hint) \
EXPAND_COUNTER(forest_set_bloom_bits) \
EXPAND_COUNTER(inode_evict_intr) \
EXPAND_COUNTER(item_clear_dirty) \
EXPAND_COUNTER(item_create) \
EXPAND_COUNTER(item_delete) \
@@ -151,6 +152,12 @@
EXPAND_COUNTER(net_recv_invalid_message) \
EXPAND_COUNTER(net_recv_messages) \
EXPAND_COUNTER(net_unknown_request) \
EXPAND_COUNTER(orphan_scan) \
EXPAND_COUNTER(orphan_scan_cached) \
EXPAND_COUNTER(orphan_scan_error) \
EXPAND_COUNTER(orphan_scan_item) \
EXPAND_COUNTER(orphan_scan_omap_set) \
EXPAND_COUNTER(orphan_scan_read) \
EXPAND_COUNTER(quorum_elected) \
EXPAND_COUNTER(quorum_fence_error) \
EXPAND_COUNTER(quorum_fence_leader) \

View File

@@ -312,10 +312,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
true);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, false);
if (ret)
break;
@@ -756,8 +755,7 @@ retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
true) ?:
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
} while (ret > 0);
if (ret < 0)
goto out;
@@ -1010,7 +1008,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
while(iblock <= last) {
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret)
goto out;
@@ -1086,7 +1084,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
}
/* we're updating meta_seq with offline block count */
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret < 0)
goto out;
@@ -1238,7 +1236,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
if (ret > 0)
continue;
if (ret < 0)
@@ -1844,13 +1842,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
return ret;
}
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
/*
* Return true if the data allocator is lower than the caller's
* requirement and we haven't been told by the server that we're out of
* free extents.
*/
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
{
DECLARE_DATA_INFO(sb, datinf);
return scoutfs_dalloc_total_len(&datinf->dalloc) <<
SCOUTFS_BLOCK_SM_SHIFT;
return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
!(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
}
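A hedged sketch of how the commit path might use this together with the SCOUTFS_ALLOC_DATA_REFILL_THRESH constant added above; the surrounding transaction code and the helper name are hypothetical:

	/* force a commit so the next get_log_trees request can refill the data allocator */
	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH))
		queue_trans_commit_work(sb);	/* hypothetical helper */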
int scoutfs_data_setup(struct super_block *sb)

View File

@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
void scoutfs_data_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt);
int scoutfs_data_prepare_commit(struct super_block *sb);
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);
int scoutfs_data_setup(struct super_block *sb);
void scoutfs_data_destroy(struct super_block *sb);

View File

@@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev,
struct scoutfs_lock **dir_lock,
struct scoutfs_lock **inode_lock,
struct scoutfs_lock **orph_lock,
struct list_head *ind_locks)
{
struct super_block *sb = dir->i_sb;
@@ -701,11 +702,17 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
if (ret)
goto out_unlock;
if (orph_lock) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -725,9 +732,13 @@ out_unlock:
if (ret) {
scoutfs_inode_index_unlock(sb, ind_locks);
scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*dir_lock = NULL;
scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*inode_lock = NULL;
if (orph_lock) {
scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
*orph_lock = NULL;
}
inode = ERR_PTR(ret);
}
@@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
inode = lock_hold_create(dir, dentry, mode, rdev,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry,
struct super_block *sb = dir->i_sb;
struct scoutfs_lock *dir_lock;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
bool del_orphan;
bool del_orphan = false;
u64 dir_size;
u64 ind_seq;
u64 hash;
u64 pos;
int ret;
int err;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
@@ -843,13 +856,20 @@ static int scoutfs_link(struct dentry *old_dentry,
goto out_unlock;
dir_size = i_size_read(dir) + dentry->d_name.len;
del_orphan = (inode->i_nlink == 0);
if (inode->i_nlink == 0) {
del_orphan = true;
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -860,7 +880,7 @@ retry:
goto out;
if (del_orphan) {
ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
if (ret)
goto out;
}
@@ -871,8 +891,11 @@ retry:
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
if (ret)
if (ret) {
err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
goto out;
}
update_dentry_info(sb, dentry, hash, pos, dir_lock);
i_size_write(dir, dir_size);
@@ -880,11 +903,6 @@ retry:
inode->i_ctime = dir->i_mtime;
inc_nlink(inode);
if (del_orphan) {
ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
WARN_ON_ONCE(ret);
}
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
@@ -896,6 +914,8 @@ out_unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec ts = current_kernel_time();
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *dir_lock = NULL;
LIST_HEAD(ind_locks);
u64 ind_seq;
@@ -937,32 +958,36 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
if (should_orphan(inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
if (ret > 0)
goto retry;
if (ret)
goto unlock;
if (should_orphan(inode)) {
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
if (ret < 0)
goto out;
}
ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
dentry_info_pos(dentry), scoutfs_ino(inode),
dir_lock, inode_lock);
if (ret)
if (ret) {
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
WARN_ON_ONCE(ret); /* should have been dirty */
goto out;
if (should_orphan(inode)) {
/*
* Insert the orphan item before we modify any inode
* metadata so we can gracefully exit should it
* fail.
*/
ret = scoutfs_orphan_inode(inode);
WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
if (ret)
goto out;
}
dir->i_ctime = ts;
@@ -984,6 +1009,7 @@ unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
return ret;
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct scoutfs_lock *new_dir_lock = NULL;
struct scoutfs_lock *old_inode_lock = NULL;
struct scoutfs_lock *new_inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct timespec now;
bool ins_new = false;
bool del_new = false;
@@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
if (should_orphan(new_inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1607,7 +1641,7 @@ retry:
scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
(new_inode == NULL ? 0 :
scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -1658,7 +1692,7 @@ retry:
ins_old = true;
if (should_orphan(new_inode)) {
ret = scoutfs_orphan_inode(new_inode);
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
if (ret)
goto out;
}
@@ -1762,6 +1796,7 @@ out_unlock:
scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
int ret;
@@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, mode, 0,
&dir_lock, &inode_lock, &ind_locks);
&dir_lock, &inode_lock, &orph_lock, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
if (ret < 0) {
iput(inode);
goto out; /* XXX returning error but items created */
}
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
insert_inode_hash(inode);
ihold(inode); /* need to update inode modifications in d_tmpfile */
d_tmpfile(dentry, inode);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
scoutfs_inode_index_unlock(sb, &ind_locks);
iput(inode);
ret = scoutfs_orphan_inode(inode);
WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
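A condensed, hypothetical sketch of the ordering these link/unlink/rename/tmpfile paths share; error unwinding is elided, and enter_transaction(), exit_transaction(), and remove_entry_items() stand in for the real index lock/hold and dirent helpers:

	static int orphan_order_sketch(struct super_block *sb, struct inode *dir,
				       struct inode *inode,
				       struct scoutfs_lock *dir_lock,
				       struct scoutfs_lock *inode_lock)
	{
		struct scoutfs_lock *orph_lock = NULL;
		int ret;

		/* 1: the write-only orphan lock is taken outside the transaction */
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
					  scoutfs_ino(inode), &orph_lock);
		if (ret < 0)
			return ret;

		/* 2: enter the transaction (index_try_lock_hold in the real paths) */
		ret = enter_transaction(sb);				/* hypothetical */
		if (ret == 0) {
			/* 3: create the orphan item before the entry items go away */
			ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock) ?:
			      remove_entry_items(sb, dir, inode, dir_lock, inode_lock); /* hypothetical */
			exit_transaction(sb);				/* hypothetical */
		}

		/* 4: unlock; orphan locks always use WRITE_ONLY */
		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
		return ret;
	}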

View File

@@ -758,6 +758,16 @@ out:
return 0;
}
void scoutfs_forest_stop(struct super_block *sb)
{
DECLARE_FOREST_INFO(sb, finf);
if (finf && finf->workq) {
cancel_delayed_work_sync(&finf->log_merge_dwork);
destroy_workqueue(finf->workq);
}
}
void scoutfs_forest_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -766,11 +776,6 @@ void scoutfs_forest_destroy(struct super_block *sb)
if (finf) {
scoutfs_block_put(sb, finf->srch_bl);
if (finf->workq) {
cancel_delayed_work_sync(&finf->log_merge_dwork);
destroy_workqueue(finf->workq);
}
kfree(finf);
sbi->forest_info = NULL;
}

View File

@@ -39,6 +39,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
struct scoutfs_log_trees *lt);
int scoutfs_forest_setup(struct super_block *sb);
void scoutfs_forest_stop(struct super_block *sb);
void scoutfs_forest_destroy(struct super_block *sb);
#endif

View File

@@ -286,9 +286,10 @@ struct scoutfs_alloc_list_head {
struct scoutfs_block_ref ref;
__le64 total_nr;
__le32 first_nr;
__u8 __pad[4];
__le32 flags;
};
/*
* While the main allocator uses extent items in btree blocks, metadata
* allocations for a single transaction are recorded in arrays in
@@ -317,9 +318,14 @@ struct scoutfs_alloc_list_block {
*/
struct scoutfs_alloc_root {
__le64 total_len;
__le32 flags;
__le32 _pad;
struct scoutfs_btree_root root;
};
/* Shared by _alloc_list_head and _alloc_root */
#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0)
/* types of allocators, exposed to alloc_detail ioctl */
#define SCOUTFS_ALLOC_OWNER_NONE 0
#define SCOUTFS_ALLOC_OWNER_SERVER 1
@@ -570,7 +576,7 @@ struct scoutfs_log_merge_freeing {
* Keys are first sorted by major key zones.
*/
#define SCOUTFS_INODE_INDEX_ZONE 1
#define SCOUTFS_RID_ZONE 2
#define SCOUTFS_ORPHAN_ZONE 2
#define SCOUTFS_FS_ZONE 3
#define SCOUTFS_LOCK_ZONE 4
/* Items only stored in server btrees */
@@ -592,7 +598,7 @@ struct scoutfs_log_merge_freeing {
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
#define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */
/* rid zone (also used in server alloc btree) */
/* orphan zone, redundant type used for clarity */
#define SCOUTFS_ORPHAN_TYPE 1
/* fs zone */

View File

@@ -34,6 +34,7 @@
#include "client.h"
#include "cmp.h"
#include "omap.h"
#include "forest.h"
/*
* XXX
@@ -54,10 +55,19 @@ struct inode_allocator {
};
struct inode_sb_info {
struct super_block *sb;
bool stopped;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
struct inode_allocator dir_ino_alloc;
struct inode_allocator ino_alloc;
struct delayed_work orphan_scan_dwork;
/* serialize multiple inode ->evict trying to delete same ino's items */
spinlock_t deleting_items_lock;
struct list_head deleting_items_list;
};
#define DECLARE_INODE_SB_INFO(sb, name) \
@@ -352,7 +362,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
if (!S_ISREG(inode->i_mode))
return 0;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
if (ret)
return ret;
@@ -379,7 +389,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
LIST_HEAD(ind_locks);
int ret;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
if (ret)
return ret;
@@ -494,7 +504,7 @@ retry:
}
}
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
if (ret)
goto out;
@@ -1207,7 +1217,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
* Returns > 0 if the seq changed and the locks should be retried.
*/
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
struct list_head *list, u64 seq)
struct list_head *list, u64 seq, bool allocing)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct index_lock *ind_lock;
@@ -1223,7 +1233,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
goto out;
}
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, allocing);
if (ret == 0 && seq != sbi->trans_seq) {
scoutfs_release_trans(sb);
ret = 1;
@@ -1237,7 +1247,7 @@ out:
}
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
bool set_data_seq)
bool set_data_seq, bool allocing)
{
struct super_block *sb = inode->i_sb;
int ret;
@@ -1247,7 +1257,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
ret = scoutfs_inode_index_start(sb, &seq) ?:
scoutfs_inode_index_prepare(sb, list, inode,
set_data_seq) ?:
scoutfs_inode_index_try_lock_hold(sb, list, seq);
scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
} while (ret > 0);
return ret;
@@ -1437,41 +1447,74 @@ out:
return inode;
}
static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
static void init_orphan_key(struct scoutfs_key *key, u64 ino)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_RID_ZONE,
.sko_rid = cpu_to_le64(rid),
.sk_type = SCOUTFS_ORPHAN_TYPE,
.sk_zone = SCOUTFS_ORPHAN_ZONE,
.sko_ino = cpu_to_le64(ino),
.sk_type = SCOUTFS_ORPHAN_TYPE,
};
}
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
/*
* Create an orphan item. The orphan items are maintained in their own
* zone under a write only lock while the caller has the inode protected
* by a write lock.
*/
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
init_orphan_key(&key, sbi->rid, ino);
init_orphan_key(&key, ino);
return scoutfs_item_dirty(sb, &key, lock);
return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
}
int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
int ret;
init_orphan_key(&key, sbi->rid, ino);
init_orphan_key(&key, ino);
ret = scoutfs_item_delete(sb, &key, lock);
if (ret == -ENOENT)
ret = 0;
return scoutfs_item_delete_force(sb, &key, lock);
}
return ret;
struct deleting_ino_entry {
struct list_head head;
u64 ino;
};
static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
{
struct deleting_ino_entry *tmp;
bool added = true;
spin_lock(&inf->deleting_items_lock);
list_for_each_entry(tmp, &inf->deleting_items_list, head) {
if (tmp->ino == ino) {
added = false;
break;
}
}
if (added) {
del->ino = ino;
list_add_tail(&del->head, &inf->deleting_items_list);
}
spin_unlock(&inf->deleting_items_lock);
return added;
}
static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
{
if (del->ino) {
spin_lock(&inf->deleting_items_lock);
list_del_init(&del->head);
spin_unlock(&inf->deleting_items_lock);
}
}
/*
@@ -1482,9 +1525,21 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
* orphan item will continue triggering attempts to finish previous
* partial deletion until all deletion is complete and the orphan item
* is removed.
*
* Currently this can be called multiple times for multiple cached
* inodes for a given ino number (ilookup ignores inodes being freed, to
* avoid cluster lock<->inode flag waiting inversions). Some items are not
* safe to delete concurrently, for example concurrent data truncation
* could free extents multiple times. We use a very silly list of inos
* being deleted. Duplicates just return success. If the first
* deletion ends up failing orphan deletion will come back around later
* and retry.
*/
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
struct scoutfs_lock *orph_lock)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct deleting_ino_entry del = {{NULL, }};
struct scoutfs_inode sinode;
struct scoutfs_key key;
LIST_HEAD(ind_locks);
@@ -1494,6 +1549,11 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
u64 size;
int ret;
if (!added_deleting_ino(inf, &del, ino)) {
ret = 0;
goto out;
}
init_inode_key(&key, ino);
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1531,7 +1591,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
if (ret > 0)
goto retry;
if (ret)
@@ -1553,8 +1613,9 @@ retry:
if (ret)
goto out;
ret = scoutfs_orphan_delete(sb, ino);
ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
out:
del_deleting_ino(inf, &del);
if (release)
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -1568,11 +1629,17 @@ out:
* tear down. We use locking and open inode number bitmaps to decide if
* we should finally destroy an inode that is no longer open nor
* reachable through directory entries.
*
* Because lookup ignores freeing inodes we can get here from multiple
* instances of an inode that is being deleted. Orphan scanning in
* particular can race with deletion. delete_inode_items() resolves
* concurrent attempts.
*/
void scoutfs_evict_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
int ret;
@@ -1584,14 +1651,21 @@ void scoutfs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
ret = scoutfs_omap_should_delete(sb, inode, &lock);
ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
if (ret > 0) {
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
}
if (ret < 0)
if (ret == -ERESTARTSYS) {
/* can be in a task with a signal pending; the orphan scan can find the inode later */
scoutfs_inc_counter(sb, inode_evict_intr);
ret = 0;
}
if (ret < 0) {
scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
ret, ino);
}
scoutfs_omap_dec(sb, ino);
@@ -1626,75 +1700,141 @@ int scoutfs_drop_inode(struct inode *inode)
}
/*
* Find orphan items and process each one.
*
* Runtime of this will be bounded by the number of orphans, which could
* theoretically be very large. If that becomes a problem we might want to push
* this work off to a thread.
*
* This only scans orphans for this node. This will need to be covered by
* the rest of node zone cleanup.
* All mounts are performing this work concurrently. We introduce
* significant jitter between them to try and keep them from all
* bunching up and working on the same inodes.
*/
int scoutfs_scan_orphans(struct super_block *sb)
static void schedule_orphan_dwork(struct inode_sb_info *inf)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_key key;
#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
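/* net effect: each mount rescans at a random delay between 10 and 50 seconds */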
unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
if (!inf->stopped) {
delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
schedule_delayed_work(&inf->orphan_scan_dwork, delay);
}
}
/*
* Find and delete inodes whose only remaining reference is the
* persistent orphan item that was created as they were unlinked.
*
* Orphan items are created as the final directory entry referring to an
* inode is deleted. They're deleted as the final cached inode is
* evicted and the inode items are destroyed. They can linger if all
* the cached inodes pinning the inode fail to delete as they are
* evicted from the cache -- either through crashing or errors.
*
* This work runs in all mounts in the background looking for orphaned
* inodes that should be deleted.
*
* We use the forest hint call to read the persistent forest trees
* looking for orphan items without creating lock contention. Orphan
* items exist for O_TMPFILE users and we don't want to force them to
* commit by trying to acquire a conflicting read lock on the orphan zone.
* There's no rush to reclaim deleted items; eventually they will be
* found in the persistent item btrees.
*
* Once we find candidate orphan items we can first check our local
* inode cache for inodes that are already on their way to eviction and
* can be skipped. Then we ask the server for the open map containing
* the inode. Only if we don't have it cached, and no one else does, do
* we try and read it into our cache and evict it to trigger the final
* inode deletion process.
*
* Orphaned items that make it that far should be very rare. They can
* only exist if all the mounts that were using an inode after it had
* been unlinked (or created with o_tmpfile) didn't unmount cleanly.
*/
static void inode_orphan_scan_worker(struct work_struct *work)
{
struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
orphan_scan_dwork.work);
struct super_block *sb = inf->sb;
struct scoutfs_open_ino_map omap;
struct scoutfs_key last;
struct scoutfs_key next;
struct scoutfs_key key;
struct inode *inode;
u64 group_nr;
int bit_nr;
u64 ino;
int err = 0;
int ret;
trace_scoutfs_scan_orphans(sb);
scoutfs_inc_counter(sb, orphan_scan);
init_orphan_key(&key, sbi->rid, 0);
init_orphan_key(&last, sbi->rid, ~0ULL);
init_orphan_key(&last, U64_MAX);
omap.args.group_nr = cpu_to_le64(U64_MAX);
while (1) {
ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
if (ret == -ENOENT) /* No more orphan items */
break;
if (ret < 0)
for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
if (inf->stopped) {
ret = 0;
goto out;
ino = le64_to_cpu(key.sko_ino);
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
if (ret == 0) {
ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
}
if (ret && ret != -ENOENT && !err)
err = ret;
if (le64_to_cpu(key.sko_ino) == U64_MAX) {
ret = -ENOENT;
/* find the next orphan item */
init_orphan_key(&key, ino);
ret = scoutfs_forest_next_hint(sb, &key, &next);
if (ret < 0) {
if (ret == -ENOENT)
break;
goto out;
}
if (scoutfs_key_compare(&next, &last) > 0)
break;
scoutfs_inc_counter(sb, orphan_scan_item);
ino = le64_to_cpu(next.sko_ino);
/* locally cached inodes will already be deleted */
inode = scoutfs_ilookup(sb, ino);
if (inode) {
scoutfs_inc_counter(sb, orphan_scan_cached);
iput(inode);
continue;
}
le64_add_cpu(&key.sko_ino, 1);
/* get an omap that covers the orphaned ino */
group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
if (le64_to_cpu(omap.args.group_nr) != group_nr) {
ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
if (ret < 0)
goto out;
}
/* don't need to evict if someone else has it open (cached) */
if (test_bit_le(bit_nr, omap.bits)) {
scoutfs_inc_counter(sb, orphan_scan_omap_set);
continue;
}
/* try to cache and evict unused inode to delete, can be racing */
inode = scoutfs_iget(sb, ino);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT)
continue;
else
goto out;
}
scoutfs_inc_counter(sb, orphan_scan_read);
SCOUTFS_I(inode)->drop_invalidated = true;
iput(inode);
}
ret = 0;
out:
return err ? err : ret;
}
if (ret < 0)
scoutfs_inc_counter(sb, orphan_scan_error);
int scoutfs_orphan_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_lock *lock = sbi->rid_lock;
struct scoutfs_key key;
int ret;
trace_scoutfs_orphan_inode(sb, inode);
init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));
ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
return ret;
schedule_orphan_dwork(inf);
}
/*
@@ -1803,16 +1943,43 @@ int scoutfs_inode_setup(struct super_block *sb)
if (!inf)
return -ENOMEM;
inf->sb = sb;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
spin_lock_init(&inf->dir_ino_alloc.lock);
spin_lock_init(&inf->ino_alloc.lock);
INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
spin_lock_init(&inf->deleting_items_lock);
INIT_LIST_HEAD(&inf->deleting_items_list);
sbi->inode_sb_info = inf;
return 0;
}
/*
* Our inode subsystem is set up pretty early but orphan scanning uses
* many other subsystems like networking and the server. We only kick
* it off once everything is ready.
*/
int scoutfs_inode_start(struct super_block *sb)
{
DECLARE_INODE_SB_INFO(sb, inf);
schedule_orphan_dwork(inf);
return 0;
}
void scoutfs_inode_stop(struct super_block *sb)
{
DECLARE_INODE_SB_INFO(sb, inf);
if (inf) {
inf->stopped = true;
cancel_delayed_work_sync(&inf->orphan_scan_dwork);
}
}
void scoutfs_inode_destroy(struct super_block *sb)
{
struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;

View File

@@ -75,7 +75,6 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
int scoutfs_drop_inode(struct inode *inode);
void scoutfs_evict_inode(struct inode *inode);
int scoutfs_orphan_inode(struct inode *inode);
struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
@@ -89,9 +88,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
struct list_head *list, u64 ino,
umode_t mode);
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
struct list_head *list, u64 seq);
struct list_head *list, u64 seq, bool allocing);
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
bool set_data_seq);
bool set_data_seq, bool allocing);
void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);
int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -120,9 +119,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
int scoutfs_scan_orphans(struct super_block *sb);
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -133,6 +131,8 @@ void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);
int scoutfs_inode_setup(struct super_block *sb);
int scoutfs_inode_start(struct super_block *sb);
void scoutfs_inode_stop(struct super_block *sb);
void scoutfs_inode_destroy(struct super_block *sb);
#endif

View File

@@ -38,6 +38,7 @@
#include "hash.h"
#include "srch.h"
#include "alloc.h"
#include "server.h"
#include "scoutfs_trace.h"
/*
@@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
/* setting only so we don't see 0 data seq with nonzero data_version */
set_data_seq = sm.data_version != 0 ? true : false;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
if (ret)
goto unlock;
@@ -879,6 +880,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
sfm.rid = sbi->rid;
sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);
ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
if (ret)

View File

@@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more {
__u64 committed_seq;
__u64 total_meta_blocks;
__u64 total_data_blocks;
__u64 reserved_meta_blocks;
};
#define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \

View File

@@ -95,7 +95,7 @@ struct item_cache_info {
/* written by page readers, read by shrink */
spinlock_t active_lock;
struct rb_root active_root;
struct list_head active_list;
};
#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -127,6 +127,7 @@ struct cached_page {
unsigned long lru_time;
struct list_head dirty_list;
struct list_head dirty_head;
u64 max_liv_seq;
struct page *page;
unsigned int page_off;
unsigned int erased_bytes;
@@ -385,6 +386,14 @@ static void put_pg(struct super_block *sb, struct cached_page *pg)
}
}
static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item)
{
u64 liv_seq = le64_to_cpu(item->liv.seq);
if (liv_seq > pg->max_liv_seq)
pg->max_liv_seq = liv_seq;
}
/*
* Allocate space for a new item from the free offset at the end of a
* cached page. This isn't a blocking allocation, and it's likely that
@@ -416,6 +425,8 @@ static struct cached_item *alloc_item(struct cached_page *pg,
if (val_len)
memcpy(item->val, val, val_len);
update_pg_max_liv_seq(pg, item);
return item;
}
@@ -622,6 +633,8 @@ static void mark_item_dirty(struct super_block *sb,
list_add_tail(&item->dirty_head, &pg->dirty_list);
item->dirty = 1;
}
update_pg_max_liv_seq(pg, item);
}
static void clear_item_dirty(struct super_block *sb,
@@ -1260,46 +1273,76 @@ static int cache_empty_page(struct super_block *sb,
return 0;
}
/*
* Readers operate independently from dirty items and transactions.
* They read a set of persistent items and insert them into the cache
* when there aren't already pages whose key range contains the items.
* This naturally prefers cached dirty items over stale read items.
*
* We have to deal with the case where dirty items are written and
* invalidated while a read is in flight. The reader won't have seen
* the items that were dirty in their persistent roots as they started
* reading. By the time they insert their read pages the previously
* dirty items have been reclaimed and are not in the cache. The old
* stale items will be inserted in their place, effectively corrupting
* the cache by having the dirty items disappear.
*
* We fix this by tracking the max seq of items in pages. As readers
* start they record the current transaction seq. Invalidation skips
* pages with a max seq at or above the first reader seq because the
* items in the page have to stick around to prevent the reader's stale
* items from being inserted.
*
* This naturally only affects a small set of pages with items that were
* written relatively recently. If we're under memory pressure then we
* probably have a lot of pages and they'll naturally have items that
* were visible to any readers. We don't bother with the complicated and
* expensive further refinement of tracking the ranges that are being
* read and comparing those with pages to invalidate.
*/
struct active_reader {
struct rb_node node;
struct scoutfs_key start;
struct scoutfs_key end;
struct list_head head;
u64 seq;
};
static struct active_reader *active_rbtree_walk(struct rb_root *root,
struct scoutfs_key *start,
struct scoutfs_key *end,
struct rb_node **par,
struct rb_node ***pnode)
#define INIT_ACTIVE_READER(rdr) \
struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
BUG_ON(!list_empty(&active->head));
active->seq = scoutfs_trans_sample_seq(sb);
spin_lock(&cinf->active_lock);
list_add_tail(&active->head, &cinf->active_list);
spin_unlock(&cinf->active_lock);
}
static u64 first_active_reader_seq(struct item_cache_info *cinf)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct active_reader *ret = NULL;
struct active_reader *active;
int cmp;
u64 first;
while (*node) {
parent = *node;
active = container_of(*node, struct active_reader, node);
/* only the calling task adds or deletes this active */
spin_lock(&cinf->active_lock);
active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
first = active ? active->seq : U64_MAX;
spin_unlock(&cinf->active_lock);
cmp = scoutfs_key_compare_ranges(start, end, &active->start,
&active->end);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
ret = active;
node = &(*node)->rb_left;
}
return first;
}
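As a concrete illustration with hypothetical numbers: if the oldest active reader sampled transaction seq 100 before walking the persistent roots, a page whose max_liv_seq is 120 may hold items that reader couldn't have seen, so shrink must keep it; a page whose newest item has seq 99 was fully visible to that reader and can be reclaimed.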
static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
/* only the calling task adds or deletes this active */
if (!list_empty(&active->head)) {
spin_lock(&cinf->active_lock);
list_del_init(&active->head);
spin_unlock(&cinf->active_lock);
}
if (par)
*par = parent;
if (pnode)
*pnode = node;
return ret;
}
/*
@@ -1399,22 +1442,15 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
* locks held, but without locking the cache. The regions we read can
* be stale with respect to the current cache, which can be read and
* dirtied by other cluster lock holders on our node, but the cluster
* locks protect the stable items we read.
*
* There's also the exciting case where a reader can populate the cache
* with stale old persistent data which was read before another local
* cluster lock holder was able to read, dirty, write, and then shrink
* the cache. In this case the cache couldn't be cleared by lock
* invalidation because the caller is actively holding the lock. But
* shrinking could evict the cache within the held lock. So we record
* that we're an active reader in the range covered by the lock and
* shrink will refuse to reclaim any pages that intersect with our read.
* locks protect the stable items we read. Invalidation is careful not
* to drop pages that have items that we couldn't see because they were
* dirty when we started reading.
*/
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
struct scoutfs_key *key, struct scoutfs_lock *lock)
{
struct rb_root root = RB_ROOT;
struct active_reader active;
INIT_ACTIVE_READER(active);
struct cached_page *right = NULL;
struct cached_page *pg;
struct cached_page *rd;
@@ -1430,15 +1466,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
int pgi;
int ret;
/* stop shrink from freeing new clean data, would let us cache stale */
active.start = lock->start;
active.end = lock->end;
spin_lock(&cinf->active_lock);
active_rbtree_walk(&cinf->active_root, &active.start, &active.end,
&par, &pnode);
rbtree_insert(&active.node, par, pnode, &cinf->active_root);
spin_unlock(&cinf->active_lock);
/* start with an empty page that covers the whole lock */
pg = alloc_pg(sb, 0);
if (!pg) {
@@ -1449,6 +1476,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
pg->end = lock->end;
rbtree_insert(&pg->node, NULL, &root.rb_node, &root);
/* set active reader seq before reading persistent roots */
add_active_reader(sb, &active);
ret = scoutfs_forest_read_items(sb, lock, key, &start, &end,
read_page_item, &root);
if (ret < 0)
@@ -1526,9 +1556,7 @@ retry:
ret = 0;
out:
spin_lock(&cinf->active_lock);
rbtree_erase(&active.node, &cinf->active_root);
spin_unlock(&cinf->active_lock);
del_active_reader(cinf, &active);
/* free any pages we left dangling on error */
for_each_page_safe(&root, rd, pg_tmp) {
@@ -1830,8 +1858,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
if (!item || item->deletion) {
ret = -ENOENT;
} else {
mark_item_dirty(sb, cinf, pg, NULL, item);
item->liv.seq = item_seq(sb, lock);
mark_item_dirty(sb, cinf, pg, NULL, item);
ret = 0;
}
@@ -2406,9 +2434,9 @@ retry:
/*
* Shrink the size the item cache. We're operating against the fast
* path lock ordering and we skip pages if we can't acquire locks.
* Similarly, we can run into dirty pages or pages which intersect with
* active readers that we can't shrink and also choose to skip.
* path lock ordering and we skip pages if we can't acquire locks. We
* can run into dirty pages or pages with items that weren't visible to
* the earliest active reader; those must be skipped as well.
*/
static int item_lru_shrink(struct shrinker *shrink,
struct shrink_control *sc)
@@ -2417,26 +2445,24 @@ static int item_lru_shrink(struct shrinker *shrink,
struct item_cache_info,
shrinker);
struct super_block *sb = cinf->sb;
struct active_reader *active;
struct cached_page *tmp;
struct cached_page *pg;
u64 first_reader_seq;
int nr;
if (sc->nr_to_scan == 0)
goto out;
nr = sc->nr_to_scan;
/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);
write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);
list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {
/* can't invalidate ranges being read, reader might be stale */
spin_lock(&cinf->active_lock);
active = active_rbtree_walk(&cinf->active_root, &pg->start,
&pg->end, NULL, NULL);
spin_unlock(&cinf->active_lock);
if (active) {
if (first_reader_seq <= pg->max_liv_seq) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
@@ -2505,7 +2531,7 @@ int scoutfs_item_setup(struct super_block *sb)
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
cinf->active_root = RB_ROOT;
INIT_LIST_HEAD(&cinf->active_list);
cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2536,7 +2562,7 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;
if (cinf) {
BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root));
BUG_ON(!list_empty(&cinf->active_list));
unregister_hotcpu_notifier(&cinf->notifier);
unregister_shrinker(&cinf->shrinker);

View File

@@ -1347,29 +1347,28 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
}
/*
* The rid lock protects a mount's private persistent items in the rid
* zone. It's held for the duration of the mount. It lets the mount
* modify the rid items at will and signals to other mounts that we're
* still alive and our rid items shouldn't be reclaimed.
* Orphan items are stored in their own zone. They are modified with
* shared write_only locks and are read inconsistently without locks by
* background scanning work.
*
* Being held for the entire mount prevents other nodes from reclaiming
* our items, like free blocks, when it would make sense for them to be
* able to. Maybe we have a bunch free and they're trying to allocate
* and are getting ENOSPC.
* Since we only use write_only locks we just lock the entire zone, but
* the api provides the inode in case we ever change the locking scheme.
*/
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock)
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_key_set_zeros(&start);
start.sk_zone = SCOUTFS_RID_ZONE;
start.sko_rid = cpu_to_le64(rid);
start.sk_zone = SCOUTFS_ORPHAN_ZONE;
start.sko_ino = 0;
start.sk_type = SCOUTFS_ORPHAN_TYPE;
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_RID_ZONE;
end.sko_rid = cpu_to_le64(rid);
scoutfs_key_set_zeros(&end);
end.sk_zone = SCOUTFS_ORPHAN_ZONE;
end.sko_ino = cpu_to_le64(U64_MAX);
end.sk_type = SCOUTFS_ORPHAN_TYPE;
return lock_key_range(sb, mode, flags, &start, &end, lock);
}

View File

@@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
struct inode *d, struct scoutfs_lock **D_lock);
int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock);
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 ino, struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);

View File

@@ -595,10 +595,6 @@ out:
free_req(req);
}
/* it's fine if we couldn't send to a client that left */
if (ret == -ENOTCONN)
ret = 0;
return ret;
}
@@ -908,9 +904,9 @@ out:
}
/*
* Return 1 and give the caller a write inode lock if it is safe to be
* deleted. It's safe to be deleted when it is no longer reachable and
* nothing is referencing it.
* Return 1 and give the caller their locks when they should delete the
* inode items. It's safe to delete the inode items when the inode is
* no longer reachable and nothing is referencing it.
*
* The inode is unreachable when nlink hits zero. Cluster locks protect
* modification and testing of nlink. We use the ino_lock_cov coverage
@@ -925,15 +921,17 @@ out:
* increase nlink from zero and let people get a reference to the inode.
*/
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret)
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_omap_lock_data *ldata;
u64 group_nr;
int bit_nr;
int ret;
int err;
/* lock group and omap constants are defined independently */
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -964,12 +962,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
out:
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
if (ret > 0) {
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
if (err < 0)
ret = err;
}
if (ret <= 0) {
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
lock = NULL;
}
*lock_ret = lock;
*orph_lock_ret = orph_lock;
return ret;
}

View File

@@ -4,7 +4,7 @@
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret);
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map_args *args);

View File

@@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func,
);
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders),
TP_ARGS(sb, journal_info, holders, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(unsigned long, journal_info)
__field(int, holders)
__field(int, ret)
),
TP_fast_assign(
@@ -440,17 +441,17 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
__entry->holders = holders;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
);
TRACE_EVENT(scoutfs_ioc_release,
@@ -985,22 +986,6 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);
TRACE_EVENT(scoutfs_scan_orphans,
TP_PROTO(struct super_block *sb),
TP_ARGS(sb),
TP_STRUCT__entry(
__field(dev_t, dev)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
),
TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),

View File

@@ -323,6 +323,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
struct commit_waiter *cw;
struct commit_waiter *pos;
struct llist_node *node;
u64 reserved;
int ret;
trace_scoutfs_server_commit_work_enter(sb, 0, 0);
@@ -387,11 +388,17 @@ static void scoutfs_server_commit_func(struct work_struct *work)
server->other_avail = &super->server_meta_avail[server->other_ind];
server->other_freed = &super->server_meta_freed[server->other_ind];
/* swap avail/free if avail gets low and freed is high */
if (le64_to_cpu(server->meta_avail->total_len) <=
SCOUTFS_SERVER_META_ALLOC_MIN &&
le64_to_cpu(server->meta_freed->total_len) >
SCOUTFS_SERVER_META_ALLOC_MIN)
/*
* The reserved metadata block count includes the max size of
* outstanding allocators, and a server transaction could be
* asked to refill all those allocators from meta_avail. If our
* meta_avail falls below the reserved count, and freed is still
* above it, then swap so that we don't start returning enospc
* until we're truly low.
*/
reserved = scoutfs_server_reserved_meta_blocks(sb);
if (le64_to_cpu(server->meta_avail->total_len) <= reserved &&
le64_to_cpu(server->meta_freed->total_len) > reserved)
swap(server->meta_avail, server->meta_freed);
ret = 0;
@@ -479,6 +486,57 @@ static int alloc_move_empty(struct super_block *sb,
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
}
/*
* Copy on write transactions need to allocate new dirty blocks as they
* make modifications to delete items and eventually free more blocks.
* The reserved blocks are meant to keep enough available blocks in
* flight to allow servers and clients to perform transactions that
* don't consume additional space. We have quite a few allocators in
* flight across the server and various client mechanisms (posix items,
* srch compaction, and log merging). We also want to include
* sufficient blocks for client log btrees to grow tall enough to be
* finalized and merged.
*
* The reserved blocks calculation is a policy of the server but it's
* exposed to the statfs_more interface so that df isn't misleading.
* Requiring this synchronization without explicit protocol
* communication isn't great.
*/
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb)
{
DECLARE_SERVER_INFO(sb, server);
u64 server_blocks;
u64 client_blocks;
u64 log_blocks;
u64 nr_clients;
/* server has two meta_avail lists it swaps between */
server_blocks = SCOUTFS_SERVER_META_FILL_TARGET * 2;
/*
* Log trees will be compacted once they hit a height of 3.
* That'll be the grandparent, two parents resulting from a
* split, and all their child blocks (roughly calculated,
* overestimating).
*/
log_blocks = 3 + (SCOUTFS_BLOCK_LG_SIZE /
(sizeof(struct scoutfs_btree_item) + sizeof(struct scoutfs_block_ref)));
/*
* Each client can have a meta_avail list, srch compaction
* request, log merge request, and a log btree it's building.
*/
client_blocks = SCOUTFS_SERVER_META_FILL_TARGET + SCOUTFS_SERVER_META_FILL_TARGET +
SCOUTFS_SERVER_MERGE_FILL_TARGET + log_blocks;
/* we should reserve for voting majority, too */
spin_lock(&server->lock);
nr_clients = server->nr_clients;
spin_unlock(&server->lock);
return server_blocks + (max(1ULL, nr_clients) * client_blocks);
}
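As a rough worked illustration (the fill-target constants are not shown in this diff, so treat them as symbols): with S = SCOUTFS_SERVER_META_FILL_TARGET, M = SCOUTFS_SERVER_MERGE_FILL_TARGET, and L = log_blocks, the function returns 2*S + max(1, nr_clients) * (2*S + M + L); with four mounted clients that is 2*S + 4*(2*S + M + L) reserved metadata blocks.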
/*
* Set all the bits in the destination which overlap with the extent.
*/
@@ -662,6 +720,7 @@ static int server_get_log_trees(struct super_block *sb,
struct scoutfs_log_trees lt;
struct scoutfs_key key;
bool have_fin = false;
bool unlock_alloc = false;
u64 data_zone_blocks;
u64 nr;
int ret;
@@ -701,8 +760,15 @@ static int server_get_log_trees(struct super_block *sb,
lt.nr = cpu_to_le64(nr);
}
/* finalize an existing root when large enough and don't have one */
if (lt.item_root.height > 2 && !have_fin) {
/*
* Finalize the client log btree when it has enough leaf blocks
* to allow some degree of merging concurrency. Smaller btrees
* are also finalized when meta was low so that deleted items
* are merged promptly and freed blocks can bring the client out
* of enospc.
*/
if (!have_fin && ((lt.item_root.height > 2) ||
(le32_to_cpu(lt.meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW))) {
fin = lt;
memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
@@ -734,24 +800,45 @@ static int server_get_log_trees(struct super_block *sb,
data_zone_blocks = 0;
}
/* return freed to server for emptying, refill avail */
/*
* Reclaim the freed meta and data allocators and refill the
* avail allocators, setting low flags if they drop too low.
*/
mutex_lock(&server->alloc_mutex);
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri,
server->other_freed,
unlock_alloc = true;
ret = scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
&lt.meta_freed) ?:
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed) ?:
scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET) ?:
alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO,
SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
mutex_unlock(&server->alloc_mutex);
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
if (ret < 0)
goto unlock;
ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
&lt.meta_avail, server->meta_avail,
SCOUTFS_SERVER_META_FILL_LO,
SCOUTFS_SERVER_META_FILL_TARGET);
if (ret < 0)
goto unlock;
if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
lt.meta_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.meta_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
ret = alloc_move_refill_zoned(sb, &lt.data_avail, &super->data_alloc,
SCOUTFS_SERVER_DATA_FILL_LO, SCOUTFS_SERVER_DATA_FILL_TARGET,
exclusive, vacant, data_zone_blocks);
if (ret < 0)
goto unlock;
if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
lt.data_avail.flags |= cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
else
lt.data_avail.flags &= ~cpu_to_le32(SCOUTFS_ALLOC_FLAG_LOW);
mutex_unlock(&server->alloc_mutex);
unlock_alloc = false;
/* record data alloc zone bits */
zero_data_alloc_zone_bits(&lt);
if (data_zone_blocks != 0) {
@@ -772,6 +859,8 @@ static int server_get_log_trees(struct super_block *sb,
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &lt, sizeof(lt));
unlock:
if (unlock_alloc)
mutex_unlock(&server->alloc_mutex);
mutex_unlock(&server->logs_mutex);
ret = scoutfs_server_apply_commit(sb, ret);
@@ -2277,15 +2366,27 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
open_ino_map_response, NULL, NULL);
}
/* The server is sending an omap response to the client */
/*
* The server is sending an omap response to the client that originated
* the request. These responses are sent long after the incoming
* request has pinned the client connection and guaranteed that we'll be
* able to queue a response. This can race with the client connection
* being torn down and it's OK if we drop the response. Either the
 * client is being evicted and we don't care about it anymore or we're
 * tearing down in unmount and the client will resend to the next
* server.
*/
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_open_ino_map *map, int err)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
int ret;
return scoutfs_net_response_node(sb, server->conn, rid,
SCOUTFS_NET_CMD_OPEN_INO_MAP, id, err,
map, sizeof(*map));
ret = scoutfs_net_response_node(sb, server->conn, rid, SCOUTFS_NET_CMD_OPEN_INO_MAP,
id, err, map, sizeof(*map));
if (ret == -ENOTCONN)
ret = 0;
return ret;
}
/* The server is receiving an omap request from the client */

View File

@@ -56,6 +56,8 @@ do { \
__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
__entry->name##_error
u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);
int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
@@ -75,8 +77,6 @@ u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
struct sockaddr_in;
struct scoutfs_quorum_elected_info;
int scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_stop(struct super_block *sb);

View File

@@ -247,11 +247,10 @@ static void scoutfs_put_super(struct super_block *sb)
trace_scoutfs_put_super(sb);
scoutfs_inode_stop(sb);
scoutfs_forest_stop(sb);
scoutfs_srch_destroy(sb);
scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
sbi->rid_lock = NULL;
scoutfs_lock_shutdown(sb);
scoutfs_shutdown_trans(sb);
@@ -623,10 +622,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
&sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:
scoutfs_srch_setup(sb);
scoutfs_srch_setup(sb) ?:
scoutfs_inode_start(sb);
if (ret)
goto out;
@@ -647,7 +645,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
scoutfs_trans_restart_sync_deadline(sb);
// scoutfs_scan_orphans(sb);
ret = 0;
out:
/* on error, generic_shutdown_super calls put_super if s_root */

View File

@@ -36,7 +36,6 @@ struct scoutfs_sb_info {
/* assigned once at the start of each mount, read-only */
u64 rid;
struct scoutfs_lock *rid_lock;
struct scoutfs_super_block super;

View File

@@ -436,8 +436,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return true;
}
/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
/* if we're low and can't refill then alloc could empty and return enospc */
if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
return true;
}
@@ -445,38 +445,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return false;
}
static bool acquired_hold(struct super_block *sb)
/*
 * Called as a wait_event condition, so it must be careful not to change
 * task state; it races with waking paths that sub_return, test, and
 * wake.
*/
static bool holders_no_writer(struct trans_info *tri)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}
/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}
/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}
trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
return acquired;
smp_mb(); /* make sure task in wait_event queue before atomic read */
return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
}
/*
@@ -492,15 +469,64 @@ out:
* The writing thread marks itself as a global trans_task which
* short-circuits all the hold machinery so it can call code that would
* otherwise try to hold transactions while it is writing.
*
* If the caller is adding metadata items that will eventually consume
* free space -- not dirtying existing items or adding deletion items --
* then we can return enospc if our metadata allocator indicates that
* we're low on space.
*/
int scoutfs_hold_trans(struct super_block *sb)
int scoutfs_hold_trans(struct super_block *sb, bool allocing)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
u64 seq;
int ret;
if (current == sbi->trans_task)
return 0;
return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
for (;;) {
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
ret = 0;
break;
}
/* wait until the writer work is finished */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
if (ret < 0)
break;
continue;
}
/* return enospc if server is into reserved blocks and we're allocating */
if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
release_holders(sb);
ret = -ENOSPC;
break;
}
/* see if we need to trigger and wait for a commit before holding */
if (commit_before_hold(sb, tri)) {
seq = scoutfs_trans_sample_seq(sb);
release_holders(sb);
queue_trans_work(sbi);
ret = wait_event_interruptible(sbi->trans_hold_wq,
scoutfs_trans_sample_seq(sb) != seq);
if (ret < 0)
break;
continue;
}
ret = 0;
break;
}
trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
return ret;
}
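/*
 * Illustrative caller pattern only (hypothetical helper, error paths
 * trimmed): paths that create brand new items pass allocing as true
 * and can see a hard -ENOSPC once the server has flagged the metadata
 * allocator low, while deletion paths pass false so they can always
 * make progress and free space.
 */
static int example_create_items(struct super_block *sb)
{
	int ret;

	ret = scoutfs_hold_trans(sb, true);
	if (ret < 0)
		return ret;	/* can be -ENOSPC when the allocator is low */

	/* ... create and dirty new items inside the held transaction ... */

	scoutfs_release_trans(sb);
	return 0;
}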
/*
@@ -525,7 +551,7 @@ void scoutfs_release_trans(struct super_block *sb)
release_holders(sb);
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
}
/*

View File

@@ -1,18 +1,13 @@
#ifndef _SCOUTFS_TRANS_H_
#define _SCOUTFS_TRANS_H_
/* the server will attempt to fill data allocs for each trans */
#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)
void scoutfs_trans_write_func(struct work_struct *work);
int scoutfs_trans_sync(struct super_block *sb, int wait);
int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
void scoutfs_trans_restart_sync_deadline(struct super_block *sb);
int scoutfs_hold_trans(struct super_block *sb);
int scoutfs_hold_trans(struct super_block *sb, bool allocing);
bool scoutfs_trans_held(void);
void scoutfs_release_trans(struct super_block *sb);
u64 scoutfs_trans_sample_seq(struct super_block *sb);

View File

@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
&tgs) != 0)
memset(&tgs, 0, sizeof(tgs));
ret = scoutfs_hold_trans(sb);
ret = scoutfs_hold_trans(sb, false);
if (ret < 0)
break;
release = true;

1
tests/.gitignore vendored
View File

@@ -5,3 +5,4 @@ src/handle_cat
src/bulk_create_paths
src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop

View File

@@ -7,7 +7,8 @@ BIN := src/createmany \
src/handle_cat \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs
src/find_xattrs \
src/create_xattr_loop
DEPS := $(wildcard src/*.d)

View File

@@ -71,6 +71,7 @@ t_filter_dmesg()
re="$re|scoutfs .* quorum .* error"
re="$re|scoutfs .* error reading quorum block"
re="$re|scoutfs .* error .* writing quorum block"
re="$re|scoutfs .* error .* while checking to delete inode"
egrep -v "($re)"
}

8
tests/golden/enospc Normal file
View File

@@ -0,0 +1,8 @@
== prepare directories and files
== fallocate until enospc
== remove all the files and verify free data blocks
== make small meta fs
== create large xattrs until we fill up metadata
== remove files with xattrs after enospc
== make sure we can create again
== cleanup small meta fs

View File

@@ -0,0 +1,4 @@
== test our inode existence function
== unlinked and opened inodes still exist
== orphan from failed evict deletion is picked up
== orphaned inos in all mounts all deleted

View File

@@ -7,6 +7,7 @@ simple-release-extents.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
lock-refleak.sh
@@ -29,6 +30,7 @@ cross-mount-data-free.sh
persistent-item-vers.sh
setup-error-teardown.sh
fence-and-reclaim.sh
orphan-inodes.sh
mount-unmount-race.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh

View File

@@ -0,0 +1,113 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
static void exit_usage(void)
{
printf(" -h/-? output this usage message and exit\n"
" -c <count> number of xattrs to create\n"
" -n <string> xattr name prefix, -NR is appended\n"
" -p <path> string with path to file with xattrs\n"
" -s <size> xattr value size\n");
exit(1);
}
int main(int argc, char **argv)
{
char *pref = NULL;
char *path = NULL;
char *val;
char *name;
unsigned long long count = 0;
unsigned long long size = 0;
unsigned long long i;
int ret;
int c;
while ((c = getopt(argc, argv, "+c:n:p:s:")) != -1) {
switch (c) {
case 'c':
count = strtoull(optarg, NULL, 0);
break;
case 'n':
pref = strdup(optarg);
break;
case 'p':
path = strdup(optarg);
break;
case 's':
size = strtoull(optarg, NULL, 0);
break;
case '?':
printf("unknown argument: %c\n", optind);
case 'h':
exit_usage();
}
}
if (count == 0) {
printf("specify count of xattrs to create with -c\n");
exit(1);
}
if (count == ULLONG_MAX) {
printf("invalid -c count\n");
exit(1);
}
if (size == 0) {
printf("specify xattrs value size with -s\n");
exit(1);
}
if (size == ULLONG_MAX || size < 2) {
printf("invalid -s size\n");
exit(1);
}
if (path == NULL) {
printf("specify path to file with -p\n");
exit(1);
}
if (pref == NULL) {
printf("specify xattr name prefix string with -n\n");
exit(1);
}
ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1;
name = malloc(ret);
if (!name) {
printf("couldn't allocate xattr name buffer\n");
exit(1);
}
val = malloc(size);
if (!val) {
printf("couldn't allocate xattr value buffer\n");
exit(1);
}
memset(val, 'a', size - 1);
val[size - 1] = '\0';
for (i = 0; i < count; i++) {
sprintf(name, "%s-%llu", pref, i);
ret = setxattr(path, name, val, size, 0);
if (ret) {
printf("returned %d errno %d (%s)\n",
ret, errno, strerror(errno));
return 1;
}
}
return 0;
}

100
tests/tests/enospc.sh Normal file
View File

@@ -0,0 +1,100 @@
#
# test hitting enospc by filling with data or metadata and
# then recovering by removing what we filled.
#
# Type Size Total Used Free Use%
#MetaData 64KB 1048576 32782 1015794 3
# Data 4KB 16777152 0 16777152 0
free_blocks() {
local md="$1"
local mnt="$2"
scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
}
t_require_commands scoutfs stat fallocate create_xattr_loop setfattr
echo "== prepare directories and files"
for n in $(t_fs_nrs); do
eval path="\$T_D${n}/dir-$n/file-$n"
mkdir -p $(dirname $path)
touch $path
done
sync
echo "== fallocate until enospc"
before=$(free_blocks Data "$T_M0")
finished=0
while [ $finished != 1 ]; do
for n in $(t_fs_nrs); do
eval path="\$T_D${n}/dir-$n/file-$n"
off=$(stat -c "%s" "$path")
LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
err="$?"
if grep -qi "no space" $T_TMP.fallocate; then
finished=1
break
fi
if [ "$err" != "0" ]; then
t_fail "fallocate failed with $err"
fi
done
done
echo "== remove all the files and verify free data blocks"
for n in $(t_fs_nrs); do
eval dir="\$T_D${n}/dir-$n"
rm -rf "$dir"
done
sync
after=$(free_blocks Data "$T_M0")
# nothing else should be modifying data blocks
test "$before" == "$after" || \
t_fail "$after free data blocks after rm, expected $before"
# XXX this is all pretty manual, would be nice to have helpers
echo "== make small meta fs"
# meta device just big enough for reserves and the metadata we'll fill
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
t_fail "mkfs failed"
SCR="/mnt/scoutfs.enospc"
mkdir -p "$SCR"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
echo "== create large xattrs until we fill up metadata"
mkdir -p "$SCR/xattrs"
for f in $(seq 1 100000); do
file="$SCR/xattrs/file-$f"
touch "$file"
LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
err="$?"
if grep -qi "no space" $T_TMP.cxl; then
echo "enospc at f $f" >> $T_TMP.cxl
break
fi
if [ "$err" != "0" ]; then
t_fail "create_xattr_loop failed with $err"
fi
done
echo "== remove files with xattrs after enospc"
rm -rf "$SCR/xattrs"
echo "== make sure we can create again"
file="$SCR/file-after"
touch $file
setfattr -n user.scoutfs-enospc -v 1 "$file"
sync
rm -f "$file"
echo "== cleanup small meta fs"
umount "$SCR"
rmdir "$SCR"
t_pass

View File

@@ -0,0 +1,77 @@
#
# make sure we clean up orphaned inodes
#
t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2
#
# usually bash prints an annoying output message when jobs
# are killed. We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
exec {ERR}>&2 2>/dev/null
kill "$@"
wait "$@"
exec 2>&$ERR {ERR}>&-
}
#
# We don't have a great way to test that inode items still exist. We
# don't prevent opening handles with nlink 0 today, so we'll use that.
# This would have to change to some other method.
#
inode_exists()
{
local ino="$1"
handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
}
echo "== test our inode existance function"
path="$T_D0/file"
touch "$path"
ino=$(stat -c "%i" "$path")
inode_exists $ino || echo "$ino didn't exist"
echo "== unlinked and opened inodes still exist"
sleep 1000000 < "$path" &
pid="$!"
rm -f "$path"
inode_exists $ino || echo "$ino didn't exist"
echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
silent_kill $pid
sleep 55
inode_exists $ino && echo "$ino still exists"
echo "== orphaned inos in all mounts all deleted"
pids=""
inos=""
for nr in $(t_fs_nrs); do
eval path="\$T_D${nr}/file-$nr"
touch "$path"
inos="$inos $(stat -c %i $path)"
sleep 1000000 < "$path" &
pids="$pids $!"
rm -f "$path"
done
sync
silent_kill $pids
for nr in $(t_fs_nrs); do
t_force_umount $nr
done
t_mount_all
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
# wait for orphan scans to run
sleep 55
for ino in $inos; do
inode_exists $ino && echo "$ino still exists"
done
t_pass

View File

@@ -32,10 +32,18 @@ A path within a ScoutFS filesystem.
.PD
.TP
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force]"
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]"
.sp
Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
separate block devices for its metadata and data storage, two are required.
The internal structures and nature of metadata and data transactions
lead to minimum viable device sizes.
.B mkfs
will check both devices and fail with an error if either is under the
minimum size. If
.B --allow-small-size
is given then sizes under the minimum will be
allowed after printing an informational warning.
.sp
If
.B --force
@@ -81,6 +89,10 @@ kibibytes, mebibytes, etc.
.B "-d, --max-data-size SIZE"
Same as previous, but for limiting the size of the data device.
.TP
.B "-A, --allow-small-size"
Allows use of specified device sizes less than the minimum. This can
result in bad behaviour and is only intended for testing.
.TP
.B "-z, --data-alloc-zone-blocks BLOCKS"
Set the data_alloc_zone_blocks volume option, as described in
.BR scoutfs (5).

View File

@@ -6,12 +6,13 @@
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <errno.h>
#include <stdbool.h>
#include "sparse.h"
#include "dev.h"
int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret)
{
struct stat st;
@@ -63,10 +64,13 @@ int device_size(char *path, int fd,
if (size < min_size) {
fprintf(stderr,
BASE_SIZE_FMT" %s too small for min "
BASE_SIZE_FMT" %s device\n",
BASE_SIZE_FMT" %s device%s\n",
BASE_SIZE_ARGS(size), target_type,
BASE_SIZE_ARGS(min_size), use_type);
return -EINVAL;
BASE_SIZE_ARGS(min_size), use_type,
allow_small_size ? ", allowing with -A" : "");
if (!allow_small_size)
return -EINVAL;
}
*size_ret = size;

View File

@@ -1,6 +1,8 @@
#ifndef _DEV_H_
#define _DEV_H_
#include <stdbool.h>
#define BASE_SIZE_FMT "%.2f%s"
#define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)
@@ -8,7 +10,7 @@
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)
int device_size(char *path, int fd,
u64 min_size, u64 max_size,
u64 min_size, u64 max_size, bool allow_small_size,
char *use_type, u64 *size_ret);
float size_flt(u64 nr, unsigned size);
char *size_str(u64 nr, unsigned size);

View File

@@ -86,6 +86,11 @@ static int do_df(struct df_args *args)
data_free += ade[i].blocks;
}
if (meta_free >= sfm.reserved_meta_blocks)
meta_free -= sfm.reserved_meta_blocks;
else
meta_free = 0;
snprintf(cells[0][0], CHARS, "Type");
snprintf(cells[0][1], CHARS, "Size");
snprintf(cells[0][2], CHARS, "Total");

View File

@@ -135,6 +135,7 @@ struct mkfs_args {
unsigned long long max_data_size;
u64 data_alloc_zone_blocks;
bool force;
bool allow_small_size;
int nr_slots;
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
@@ -215,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args)
goto out;
}
ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
args->max_meta_size, "meta", &meta_size);
/* minimum meta device size to make reserved blocks reasonably large */
ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
args->max_meta_size, args->allow_small_size, "meta", &meta_size);
if (ret)
goto out;
ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
args->max_data_size, "data", &data_size);
/* .. then arbitrarily the same minimum data device size */
ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
args->max_data_size, args->allow_small_size, "data", &data_size);
if (ret)
goto out;
@@ -520,6 +523,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
prev_val, args->max_data_size);
break;
}
case 'A':
args->allow_small_size = true;
break;
case 'z': /* data-alloc-zone-blocks */
{
ret = parse_u64(arg, &args->data_alloc_zone_blocks);
@@ -559,6 +565,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
{ "allow-small-size", 'A', NULL, 0, "Allow specified meta/data devices less than minimum, still warns"},
{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mounts writes to a zone (4KB blocks)"},

View File

@@ -158,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
return print_inode_index;
if (zone == SCOUTFS_RID_ZONE) {
if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE)
return print_orphan;
}
@@ -245,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)
#define AL_HEAD_F \
AL_REF_F" total_nr %llu first_nr %u"
AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
#define AL_HEAD_A(p) \
AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
le32_to_cpu((p)->first_nr)
le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)
#define ALCROOT_F \
BTROOT_F" total_len %llu"
BTROOT_F" total_len %llu flags 0x%x"
#define ALCROOT_A(ar) \
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_A(sre) \