mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-06 12:06:26 +00:00
Compare commits
37 Commits
zab/bag_o_
...
v1.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
029a684c25 | ||
|
|
f2679d9598 | ||
|
|
bddca171ee | ||
|
|
18171b8543 | ||
|
|
d846eec5e8 | ||
|
|
e2c90339c5 | ||
|
|
4a0b14a4f2 | ||
|
|
90518a0fbd | ||
|
|
cd23cc61ca | ||
|
|
a67ea30bb7 | ||
|
|
f3b7c683f0 | ||
|
|
8decc54467 | ||
|
|
5adcf7677f | ||
|
|
07f03d499f | ||
|
|
c5068efef0 | ||
|
|
66678dc63b | ||
|
|
b2834d3c28 | ||
|
|
cff50bec6b | ||
|
|
4d6350b3b0 | ||
|
|
48966b42bb | ||
|
|
97cb8ad50d | ||
|
|
ae08a797ae | ||
|
|
2634fadfcb | ||
|
|
0c1f19556d | ||
|
|
19caae3da8 | ||
|
|
2989afbf46 | ||
|
|
730a84af92 | ||
|
|
5b77133c3b | ||
|
|
329ac0347d | ||
|
|
15d7eec1f9 | ||
|
|
cff17a4cae | ||
|
|
9fa2c6af89 | ||
|
|
e067961714 | ||
|
|
7a96e03148 | ||
|
|
e9b3cc873a | ||
|
|
5f2259c48f | ||
|
|
e14912974d |
@@ -2,9 +2,31 @@ Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.x
|
||||
v1.2
|
||||
\
|
||||
*TBD*
|
||||
*Mar 14, 2022*
|
||||
|
||||
* **Fix deadlock between fallocate() and read() system calls**
|
||||
\
|
||||
Fixed a lock inversion that could cause two tasks to deadlock if they
|
||||
performed fallocate() and read() on a file at the same time. The
|
||||
deadlock was uninterruptible so the machine needed to be rebooted. This
|
||||
was relatively rare as fallocate() is usually used to prepare files
|
||||
before they're used.
|
||||
|
||||
* **Fix instability from heavy file deletion workloads**
|
||||
\
|
||||
Fixed rare circumstances under which background file deletion cleanup
|
||||
tasks could try to delete a file while it is being deleted by another
|
||||
task. Heavy load across multiple nodes, either many files being deleted
|
||||
or large files being deleted, increased the chances of this happening.
|
||||
Heavy staging could cause this problem because staging can create many
|
||||
internal temporary files that need to be deleted.
|
||||
|
||||
---
|
||||
v1.1
|
||||
\
|
||||
*Feb 4, 2022*
|
||||
|
||||
|
||||
* **Add scoutfs(1) change-quorum-config command**
|
||||
|
||||
@@ -477,12 +477,15 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
struct super_block *sb = client->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
const bool am_quorum = opts->quorum_slot_nr >= 0;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_net_greeting greet;
|
||||
struct sockaddr_in sin;
|
||||
bool am_quorum;
|
||||
int ret;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
am_quorum = opts.quorum_slot_nr >= 0;
|
||||
|
||||
/* can unmount once server farewell handling removes our item */
|
||||
if (client->sending_farewell &&
|
||||
lookup_mounted_client_item(sb, sbi->rid) == 0) {
|
||||
|
||||
@@ -152,11 +152,11 @@
|
||||
EXPAND_COUNTER(net_recv_messages) \
|
||||
EXPAND_COUNTER(net_unknown_request) \
|
||||
EXPAND_COUNTER(orphan_scan) \
|
||||
EXPAND_COUNTER(orphan_scan_attempts) \
|
||||
EXPAND_COUNTER(orphan_scan_cached) \
|
||||
EXPAND_COUNTER(orphan_scan_error) \
|
||||
EXPAND_COUNTER(orphan_scan_item) \
|
||||
EXPAND_COUNTER(orphan_scan_omap_set) \
|
||||
EXPAND_COUNTER(orphan_scan_read) \
|
||||
EXPAND_COUNTER(quorum_elected) \
|
||||
EXPAND_COUNTER(quorum_fence_error) \
|
||||
EXPAND_COUNTER(quorum_fence_leader) \
|
||||
|
||||
@@ -983,9 +983,6 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
u64 last;
|
||||
s64 ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
/* XXX support more flags */
|
||||
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
@@ -1003,18 +1000,22 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_mutex;
|
||||
|
||||
inode_dio_wait(inode);
|
||||
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
||||
(offset + len > i_size_read(inode))) {
|
||||
ret = inode_newsize_ok(inode, offset + len);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
}
|
||||
|
||||
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
@@ -1024,7 +1025,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
|
||||
ret = fallocate_extents(sb, inode, iblock, last, lock);
|
||||
|
||||
@@ -1050,17 +1051,19 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
}
|
||||
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
|
||||
iblock += ret;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
out_extent:
|
||||
up_write(&si->extent_sem);
|
||||
out_mutex:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
out:
|
||||
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -511,7 +511,7 @@ out:
|
||||
else if (ino == 0)
|
||||
inode = NULL;
|
||||
else
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
inode = scoutfs_iget(sb, ino, 0, 0);
|
||||
|
||||
/*
|
||||
* We can't splice dir aliases into the dcache. dir entries
|
||||
@@ -720,7 +720,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
|
||||
struct list_head *ind_locks)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode;
|
||||
struct inode *inode = NULL;
|
||||
u64 ind_seq;
|
||||
int ret = 0;
|
||||
u64 ino;
|
||||
@@ -765,11 +765,9 @@ retry:
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
inode = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_dirty_inode_item(dir, *dir_lock);
|
||||
out:
|
||||
@@ -787,6 +785,8 @@ out_unlock:
|
||||
*orph_lock = NULL;
|
||||
}
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
@@ -1319,11 +1319,11 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
insert_inode_hash(inode);
|
||||
/* XXX need to set i_op/fop before here for sec callbacks */
|
||||
d_instantiate(dentry, inode);
|
||||
inode = NULL;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0) {
|
||||
/* XXX remove inode items */
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock,
|
||||
NULL, name_len);
|
||||
@@ -1334,6 +1334,9 @@ out:
|
||||
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1923,10 +1926,8 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
|
||||
if (ret < 0) {
|
||||
iput(inode);
|
||||
if (ret < 0)
|
||||
goto out; /* XXX returning error but items created */
|
||||
}
|
||||
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
si->crtime = inode->i_mtime;
|
||||
@@ -1939,7 +1940,6 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
|
||||
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
iput(inode);
|
||||
|
||||
out:
|
||||
scoutfs_release_trans(sb);
|
||||
@@ -1948,6 +1948,9 @@ out:
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb,
|
||||
trace_scoutfs_fh_to_dentry(sb, fh_type, sfid);
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type))
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0, SCOUTFS_IGF_LINKED);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb,
|
||||
|
||||
if (scoutfs_valid_fileid(fh_type) &&
|
||||
fh_type == FILEID_SCOUTFS_WITH_PARENT)
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0);
|
||||
inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0, SCOUTFS_IGF_LINKED);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
@@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
|
||||
scoutfs_dir_free_backref_path(sb, &list);
|
||||
trace_scoutfs_get_parent(sb, inode, ino);
|
||||
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
inode = scoutfs_iget(sb, ino, 0, SCOUTFS_IGF_LINKED);
|
||||
|
||||
return d_obtain_alias(inode);
|
||||
}
|
||||
|
||||
@@ -395,12 +395,13 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
|
||||
int scoutfs_fence_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct fence_info *fi;
|
||||
int ret;
|
||||
|
||||
/* can only fence if we can be elected by quorum */
|
||||
if (opts->quorum_slot_nr == -1) {
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr == -1) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
533
kmod/src/inode.c
533
kmod/src/inode.c
@@ -66,10 +66,6 @@ struct inode_sb_info {
|
||||
|
||||
struct delayed_work orphan_scan_dwork;
|
||||
|
||||
/* serialize multiple inode ->evict trying to delete same ino's items */
|
||||
spinlock_t deleting_items_lock;
|
||||
struct list_head deleting_items_list;
|
||||
|
||||
struct work_struct iput_work;
|
||||
struct llist_head iput_llist;
|
||||
};
|
||||
@@ -276,7 +272,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
set_item_info(si, cinode);
|
||||
}
|
||||
|
||||
static void init_inode_key(struct scoutfs_key *key, u64 ino)
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
|
||||
{
|
||||
*key = (struct scoutfs_key) {
|
||||
.sk_zone = SCOUTFS_FS_ZONE,
|
||||
@@ -296,8 +292,7 @@ static void init_inode_key(struct scoutfs_key *key, u64 ino)
|
||||
* fields because they should have already had a locked refreshed inode
|
||||
* to be dereferencing its contents.
|
||||
*/
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags)
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
@@ -317,7 +312,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
if (atomic64_read(&si->last_refreshed) == refresh_gen)
|
||||
return 0;
|
||||
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
|
||||
mutex_lock(&si->item_mutex);
|
||||
if (atomic64_read(&si->last_refreshed) < refresh_gen) {
|
||||
@@ -663,22 +658,12 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
|
||||
} while (read_seqcount_retry(&si->seqcount, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* We have inversions between getting cluster locks while performing
|
||||
* final deletion on a freeing inode and waiting on a freeing inode
|
||||
* while holding a cluster lock.
|
||||
*
|
||||
* We can avoid these deadlocks by hiding freeing inodes in our hash
|
||||
* lookup function. We're fine with either returning null or populating
|
||||
* a new inode overlapping with eviction freeing a previous instance of
|
||||
* the inode.
|
||||
*/
|
||||
static int scoutfs_iget_test(struct inode *inode, void *arg)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
u64 *ino = arg;
|
||||
|
||||
return (si->ino == *ino) && !(inode->i_state & I_FREEING);
|
||||
return si->ino == *ino;
|
||||
}
|
||||
|
||||
static int scoutfs_iget_set(struct inode *inode, void *arg)
|
||||
@@ -692,49 +677,93 @@ static int scoutfs_iget_set(struct inode *inode, void *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
|
||||
/*
|
||||
* There's a risk of a deadlock between lock invalidation and eviction.
|
||||
* Invalidation blocks locks while looking up inodes. Eviction blocks
|
||||
* inode lookups while trying to get a lock.
|
||||
*
|
||||
* We have an inode lookup variant which will never block waiting for an
|
||||
* inode. This is more aggressive than base ilookup5_nowait() which
|
||||
* will, you know, wait for inodes that are being freed. We have our
|
||||
* test function hide those inodes from find_inode so that it won't wait
|
||||
* on them.
|
||||
*
|
||||
* These semantics are sufficiently weird that we use a big giant scary
|
||||
* looking function name to deter use.
|
||||
*/
|
||||
static int ilookup_test_nonewfree(struct inode *inode, void *arg)
|
||||
{
|
||||
return ilookup5(sb, ino, scoutfs_iget_test, &ino);
|
||||
return scoutfs_iget_test(inode, arg) &&
|
||||
!(inode->i_state & (I_NEW | I_WILL_FREE | I_FREEING));
|
||||
}
|
||||
struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino)
|
||||
{
|
||||
return ilookup5_nowait(sb, ino, ilookup_test_nonewfree, &ino);
|
||||
}
|
||||
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf)
|
||||
/*
|
||||
* Final iput can delete an unused inode's items which can take multiple
|
||||
* locked transactions. iget (which can call iput in error cases) and
|
||||
* iput must not be called with locks or transactions held.
|
||||
*/
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
|
||||
{
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode;
|
||||
struct inode *inode = NULL;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
|
||||
&ino);
|
||||
/* wait for vfs inode (I_FREEING in particular) before acquiring cluster lock */
|
||||
inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino);
|
||||
if (!inode) {
|
||||
inode = ERR_PTR(-ENOMEM);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (inode->i_state & I_NEW) {
|
||||
/* XXX ensure refresh, instead clear in drop_inode? */
|
||||
si = SCOUTFS_I(inode);
|
||||
atomic64_set(&si->last_refreshed, 0);
|
||||
inode->i_version = 0;
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock, 0);
|
||||
if (ret == 0)
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
if (ret) {
|
||||
iget_failed(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
} else {
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
}
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* check nlink both for new and after refreshing */
|
||||
if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (inode->i_state & I_NEW) {
|
||||
ret = scoutfs_omap_set(sb, ino);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
set_inode_ops(inode);
|
||||
unlock_new_inode(inode);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (ret < 0) {
|
||||
if (inode) {
|
||||
if (inode->i_state & I_NEW)
|
||||
iget_failed(inode);
|
||||
else
|
||||
iput(inode);
|
||||
}
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
@@ -803,7 +832,7 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (!ret)
|
||||
@@ -1022,7 +1051,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list);
|
||||
BUG_ON(ret);
|
||||
|
||||
init_inode_key(&key, ino);
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
|
||||
err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (err) {
|
||||
@@ -1382,10 +1411,14 @@ out:
|
||||
/*
|
||||
* Allocate and initialize a new inode. The caller is responsible for
|
||||
* creating links to it and updating it. @dir can be null.
|
||||
*
|
||||
* This is called with locks and a transaction because it creates the
|
||||
* inode item. We can't call iput on the new inode on error. We
|
||||
* return the inode to the caller *including on error* for them to put
|
||||
* once they've released the transaction.
|
||||
*/
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev, u64 ino,
|
||||
struct scoutfs_lock *lock)
|
||||
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
|
||||
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret)
|
||||
{
|
||||
struct scoutfs_inode_info *si;
|
||||
struct scoutfs_key key;
|
||||
@@ -1394,8 +1427,10 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
int ret;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
if (!inode) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
si = SCOUTFS_I(inode);
|
||||
si->ino = ino;
|
||||
@@ -1421,22 +1456,19 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
set_inode_ops(inode);
|
||||
|
||||
store_inode(&sinode, inode);
|
||||
init_inode_key(&key, scoutfs_ino(inode));
|
||||
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
|
||||
|
||||
ret = scoutfs_omap_inc(sb, ino);
|
||||
ret = scoutfs_omap_set(sb, ino);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (ret < 0)
|
||||
scoutfs_omap_dec(sb, ino);
|
||||
scoutfs_omap_clear(sb, ino);
|
||||
out:
|
||||
if (ret) {
|
||||
iput(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
*inode_ret = inode;
|
||||
|
||||
return inode;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void init_orphan_key(struct scoutfs_key *key, u64 ino)
|
||||
@@ -1471,44 +1503,6 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
|
||||
return scoutfs_item_delete_force(sb, &key, lock);
|
||||
}
|
||||
|
||||
struct deleting_ino_entry {
|
||||
struct list_head head;
|
||||
u64 ino;
|
||||
};
|
||||
|
||||
static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
|
||||
{
|
||||
struct deleting_ino_entry *tmp;
|
||||
bool added = true;
|
||||
|
||||
spin_lock(&inf->deleting_items_lock);
|
||||
|
||||
list_for_each_entry(tmp, &inf->deleting_items_list, head) {
|
||||
if (tmp->ino == ino) {
|
||||
added = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (added) {
|
||||
del->ino = ino;
|
||||
list_add_tail(&del->head, &inf->deleting_items_list);
|
||||
}
|
||||
|
||||
spin_unlock(&inf->deleting_items_lock);
|
||||
|
||||
return added;
|
||||
}
|
||||
|
||||
static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
|
||||
{
|
||||
if (del->ino) {
|
||||
spin_lock(&inf->deleting_items_lock);
|
||||
list_del_init(&del->head);
|
||||
spin_unlock(&inf->deleting_items_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all the items associated with a given inode. This is only
|
||||
* called once nlink has dropped to zero and nothing has the inode open
|
||||
@@ -1517,22 +1511,10 @@ static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entr
|
||||
* orphan item will continue triggering attempts to finish previous
|
||||
* partial deletion until all deletion is complete and the orphan item
|
||||
* is removed.
|
||||
*
|
||||
* Currently this can be called multiple times for multiple cached
|
||||
* inodes for a given ino number (ilookup avoids freeing inodes to avoid
|
||||
* cluster lock<->inode flag waiting inversions). Some items are not
|
||||
* safe to delete concurrently, for example concurrent data truncation
|
||||
* could free extents multiple times. We use a very silly list of inos
|
||||
* being deleted. Duplicates just return success. If the first
|
||||
* deletion ends up failing orphan deletion will come back around later
|
||||
* and retry.
|
||||
*/
|
||||
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
|
||||
struct scoutfs_lock *orph_lock)
|
||||
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
|
||||
{
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
struct deleting_ino_entry del = {{NULL, }};
|
||||
struct scoutfs_inode sinode;
|
||||
struct scoutfs_key key;
|
||||
LIST_HEAD(ind_locks);
|
||||
bool release = false;
|
||||
@@ -1541,30 +1523,10 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
|
||||
u64 size;
|
||||
int ret;
|
||||
|
||||
if (!added_deleting_ino(inf, &del, ino)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
|
||||
init_inode_key(&key, ino);
|
||||
|
||||
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
|
||||
lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* XXX corruption, inode probably won't be freed without repair */
|
||||
if (le32_to_cpu(sinode.nlink)) {
|
||||
scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mode = le32_to_cpu(sinode.mode);
|
||||
size = le64_to_cpu(sinode.size);
|
||||
mode = le32_to_cpu(sinode->mode);
|
||||
size = le64_to_cpu(sinode->size);
|
||||
trace_scoutfs_delete_inode(sb, ino, mode, size);
|
||||
|
||||
/* remove data items in their own transactions */
|
||||
@@ -1582,7 +1544,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
|
||||
/* then delete the small known number of remaining inode items */
|
||||
retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
|
||||
prepare_index_deletion(sb, &ind_locks, ino, mode, sinode) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
@@ -1591,7 +1553,7 @@ retry:
|
||||
|
||||
release = true;
|
||||
|
||||
ret = remove_index_items(sb, ino, &sinode, &ind_locks);
|
||||
ret = remove_index_items(sb, ino, sinode, &ind_locks);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -1601,15 +1563,21 @@ retry:
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_item_delete(sb, &key, lock);
|
||||
if (ret)
|
||||
/* make sure inode item and orphan are deleted together */
|
||||
ret = scoutfs_item_dirty(sb, &key, lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
|
||||
if (ret == 0)
|
||||
scoutfs_forest_dec_inode_count(sb);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_item_delete(sb, &key, lock);
|
||||
BUG_ON(ret != 0); /* dirtying should have guaranteed success */
|
||||
|
||||
scoutfs_forest_dec_inode_count(sb);
|
||||
|
||||
out:
|
||||
del_deleting_ino(inf, &del);
|
||||
if (release)
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
@@ -1617,48 +1585,192 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct inode_deletion_lock_data {
|
||||
wait_queue_head_t waitq;
|
||||
atomic64_t seq;
|
||||
struct scoutfs_open_ino_map map;
|
||||
unsigned long trying[DIV_ROUND_UP(SCOUTFS_OPEN_INO_MAP_BITS, BITS_PER_LONG)];
|
||||
};
|
||||
|
||||
/*
|
||||
* iput_final has already written out the dirty pages to the inode
|
||||
* before we get here. We're left with a clean inode that we have to
|
||||
* tear down. We use locking and open inode number bitmaps to decide if
|
||||
* we should finally destroy an inode that is no longer open nor
|
||||
* reachable through directory entries.
|
||||
* Get a lock data struct that has the current omap from this hold of
|
||||
* the lock. The lock data is saved on the lock so it can be used
|
||||
* multiple times until the lock is refreshed. Only one task will send
|
||||
* an omap request at a time, and errors are only returned by each task
|
||||
* as it gets a response to its send.
|
||||
*/
|
||||
static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct inode_deletion_lock_data **ldata_ret, u64 group_nr)
|
||||
{
|
||||
struct inode_deletion_lock_data *ldata;
|
||||
u64 seq;
|
||||
int ret;
|
||||
|
||||
/* we're storing omap maps in locks, they need to cover the same number of inodes */
|
||||
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
|
||||
|
||||
/* allocate a new lock data struct as needed */
|
||||
while ((ldata = cmpxchg(&lock->inode_deletion_data, NULL, NULL)) == NULL) {
|
||||
ldata = kzalloc(sizeof(struct inode_deletion_lock_data), GFP_NOFS);
|
||||
if (!ldata) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
atomic64_set(&ldata->seq, lock->write_seq - 1); /* ensure refresh */
|
||||
init_waitqueue_head(&ldata->waitq);
|
||||
|
||||
/* the lock kfrees the inode_deletion_data pointer along with the lock */
|
||||
if (cmpxchg(&lock->inode_deletion_data, NULL, ldata) == NULL)
|
||||
break;
|
||||
else
|
||||
kfree(ldata);
|
||||
}
|
||||
|
||||
/* make sure that the lock's data is current */
|
||||
while ((seq = atomic64_read(&ldata->seq)) != lock->write_seq) {
|
||||
if (seq != U64_MAX && atomic64_cmpxchg(&ldata->seq, seq, U64_MAX) == seq) {
|
||||
/* ask the server for current omap */
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
|
||||
if (ret == 0)
|
||||
atomic64_set(&ldata->seq, lock->write_seq);
|
||||
else
|
||||
atomic64_set(&ldata->seq, lock->write_seq - 1);
|
||||
wake_up(&ldata->waitq);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
} else {
|
||||
/* wait for someone else who's sent a request */
|
||||
wait_event(ldata->waitq, atomic64_read(&ldata->seq) != U64_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
ldata = NULL;
|
||||
*ldata_ret = ldata;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to delete all the items for an unused inode number. This is the
|
||||
* relatively slow path that uses cluster locks, network requests, and
|
||||
* IO to ensure correctness. Callers should try hard to avoid calling
|
||||
* when there's no work to do.
|
||||
*
|
||||
* Because lookup ignores freeing inodes we can get here from multiple
|
||||
* instances of an inode that is being deleted. Orphan scanning in
|
||||
* particular can race with deletion. delete_inode_items() resolves
|
||||
* concurrent attempts.
|
||||
* Inode references are added under cluster locks. In-memory vfs cache
|
||||
* references are added under read cluster locks and are visible in omap
|
||||
* bitmaps. Directory entry references are added under write cluster
|
||||
* locks and are visible in the inode's nlink. Orphan items exist
|
||||
* whenever nlink == 0 and are maintained under write cluster locks.
|
||||
* Directory entries can be added to an inode with nlink == 0 to
|
||||
* instantiate tmpfile inodes into the name space. Cached inodes will
|
||||
* not be created for inodes with an nlink of 0.
|
||||
*
|
||||
* Combining all this we know that it's safe to delete an inode's items
|
||||
* when we hold an exclusive write cluster lock, the inode has nlink ==
|
||||
* 0, and an omap request protected by the lock doesn't have the inode's
|
||||
* bit set.
|
||||
*
|
||||
* This is called by orphan scanning and vfs inode cache eviction after
|
||||
* they've checked that the inode could really be deleted. We serialize
|
||||
* on a bit in the lock data so that we only have one deletion attempt
|
||||
* per inode under this mount's cluster lock.
|
||||
*/
|
||||
static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
{
|
||||
struct inode_deletion_lock_data *ldata = NULL;
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode sinode;
|
||||
struct scoutfs_key key;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
ret = get_current_lock_data(sb, lock, &ldata, group_nr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* only one local attempt per inode at a time */
|
||||
if (test_and_set_bit(bit_nr, ldata->trying)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* can't delete if it's cached in local or remote mounts */
|
||||
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inode_init_key(&key, ino);
|
||||
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (le32_to_cpu(sinode.nlink) > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
|
||||
out:
|
||||
if (ldata)
|
||||
clear_bit(bit_nr, ldata->trying);
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* As we drop an inode we need to decide to try and delete its items or
|
||||
* not, which is expensive. The two common cases we want to get right
|
||||
* both have cluster lock coverage and don't want to delete. Dropping
|
||||
* unused inodes during read lock invalidation has the current lock and
|
||||
* sees a nonzero nlink and knows not to delete. Final iput after a
|
||||
* local unlink also has a lock, sees a zero nlink, and tries to perform
|
||||
* item deletion in the task that dropped the last link, as users
|
||||
* expect.
|
||||
*
|
||||
* Evicting an inode outside of cluster locking is the odd slow path
|
||||
* that involves lock contention during use the worst cross-mount
|
||||
* open-unlink/delete case.
|
||||
*/
|
||||
void scoutfs_evict_inode(struct inode *inode)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_lock *orph_lock;
|
||||
struct scoutfs_lock *lock;
|
||||
int ret;
|
||||
|
||||
trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode),
|
||||
inode->i_nlink, is_bad_inode(inode));
|
||||
trace_scoutfs_evict_inode(sb, ino, inode->i_nlink, is_bad_inode(inode));
|
||||
|
||||
if (is_bad_inode(inode))
|
||||
goto clear;
|
||||
if (!is_bad_inode(inode)) {
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
/* clear before trying to delete tests */
|
||||
scoutfs_omap_clear(sb, ino);
|
||||
|
||||
ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
|
||||
if (ret > 0) {
|
||||
ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
}
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
|
||||
ret, ino);
|
||||
if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0)
|
||||
try_delete_inode_items(sb, scoutfs_ino(inode));
|
||||
}
|
||||
|
||||
scoutfs_omap_dec(sb, ino);
|
||||
|
||||
clear:
|
||||
clear_inode(inode);
|
||||
}
|
||||
|
||||
@@ -1734,18 +1846,26 @@ void scoutfs_inode_queue_iput(struct inode *inode)
|
||||
/*
|
||||
* All mounts are performing this work concurrently. We introduce
|
||||
* significant jitter between them to try and keep them from all
|
||||
* bunching up and working on the same inodes.
|
||||
* bunching up and working on the same inodes. We always try to delay
|
||||
* for at least one jiffy if precision tricks us into calculating no
|
||||
* delay.
|
||||
*/
|
||||
static void schedule_orphan_dwork(struct inode_sb_info *inf)
|
||||
void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
|
||||
{
|
||||
#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
|
||||
#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
|
||||
unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
|
||||
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
struct scoutfs_mount_options opts;
|
||||
unsigned long low;
|
||||
unsigned long high;
|
||||
unsigned long delay;
|
||||
|
||||
if (!inf->stopped) {
|
||||
delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
|
||||
prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
|
||||
schedule_delayed_work(&inf->orphan_scan_dwork, delay);
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
low = (opts.orphan_scan_delay_ms * 80) / 100;
|
||||
high = (opts.orphan_scan_delay_ms * 120) / 100;
|
||||
delay = msecs_to_jiffies(low + prandom_u32_max(high - low)) ?: 1;
|
||||
|
||||
mod_delayed_work(system_wq, &inf->orphan_scan_dwork, delay);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1753,11 +1873,10 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf)
|
||||
* Find and delete inodes whose only remaining reference is the
|
||||
* persistent orphan item that was created as they were unlinked.
|
||||
*
|
||||
* Orphan items are created as the final directory entry referring to an
|
||||
* inode is deleted. They're deleted as the final cached inode is
|
||||
* evicted and the inode items are destroyed. They can linger if all
|
||||
* the cached inodes pinning the inode fail to delete as they are
|
||||
* evicted from the cache -- either through crashing or errors.
|
||||
* Orphan items are maintained for inodes that have an nlink of 0.
|
||||
* Typically this is from unlink, but tmpfiles are created with orphans.
|
||||
* They're deleted as the final cached inode is evicted and the inode
|
||||
* items are destroyed.
|
||||
*
|
||||
* This work runs in all mounts in the background looking for those
|
||||
* orphaned inodes that weren't fully deleted.
|
||||
@@ -1766,20 +1885,16 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf)
|
||||
* only find orphan items that made it to the fs root after being merged
|
||||
* from a mount's log btree. This naturally avoids orphan items that
|
||||
* exist while inodes have been unlinked but are still cached, including
|
||||
* O_TMPFILE inodes that are actively used during normal operations.
|
||||
* tmpfile inodes that are actively used during normal operations.
|
||||
* Scanning the read-only persistent fs root uses cached blocks and
|
||||
* avoids the lock contention we'd cause if we tried to use the
|
||||
* consistent item cache. The downside is that it adds a bit of
|
||||
* latency. If an orphan was created in error it'll take until the
|
||||
* mount's log btree is finalized and merged. A crash will have the log
|
||||
* btree merged after it is fenced.
|
||||
* latency.
|
||||
*
|
||||
* Once we find candidate orphan items we can first check our local
|
||||
* inode cache for inodes that are already on their way to eviction and
|
||||
* can be skipped. Then we ask the server for the open map containing
|
||||
* the inode. Only if we don't have it cached, and no one else does, do
|
||||
* we try and read it into our cache and evict it to trigger the final
|
||||
* inode deletion process.
|
||||
* Once we find candidate orphan items we first check our local omap for
|
||||
* a locally cached inode. Then we ask the server for the open map
|
||||
* containing the inode. Only if we don't see any cached users do we do
|
||||
* the expensive work of acquiring locks to try and delete the items.
|
||||
*/
|
||||
static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
{
|
||||
@@ -1791,7 +1906,6 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_key last;
|
||||
struct scoutfs_key key;
|
||||
struct inode *inode;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
u64 ino;
|
||||
@@ -1830,17 +1944,14 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
scoutfs_inc_counter(sb, orphan_scan_item);
|
||||
ino = le64_to_cpu(key.sko_ino);
|
||||
|
||||
/* locally cached inodes will already be deleted */
|
||||
inode = scoutfs_ilookup(sb, ino);
|
||||
if (inode) {
|
||||
/* locally cached inodes will try to delete as they evict */
|
||||
if (scoutfs_omap_test(sb, ino)) {
|
||||
scoutfs_inc_counter(sb, orphan_scan_cached);
|
||||
iput(inode);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* get an omap that covers the orphaned ino */
|
||||
group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
|
||||
bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
if (le64_to_cpu(omap.args.group_nr) != group_nr) {
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
|
||||
@@ -1848,25 +1959,15 @@ static void inode_orphan_scan_worker(struct work_struct *work)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* don't need to evict if someone else has it open (cached) */
|
||||
/* remote cached inodes will also try to delete */
|
||||
if (test_bit_le(bit_nr, omap.bits)) {
|
||||
scoutfs_inc_counter(sb, orphan_scan_omap_set);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* try to cached and evict unused inode to delete, can be racing */
|
||||
inode = scoutfs_iget(sb, ino, 0);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ENOENT)
|
||||
continue;
|
||||
else
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inc_counter(sb, orphan_scan_read);
|
||||
SCOUTFS_I(inode)->drop_invalidated = true;
|
||||
iput(inode);
|
||||
/* seemingly orphaned and unused, get locks and check for sure */
|
||||
scoutfs_inc_counter(sb, orphan_scan_attempts);
|
||||
ret = try_delete_inode_items(sb, ino);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
@@ -1875,7 +1976,7 @@ out:
|
||||
if (ret < 0)
|
||||
scoutfs_inc_counter(sb, orphan_scan_error);
|
||||
|
||||
schedule_orphan_dwork(inf);
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1983,8 +2084,6 @@ int scoutfs_inode_setup(struct super_block *sb)
|
||||
spin_lock_init(&inf->dir_ino_alloc.lock);
|
||||
spin_lock_init(&inf->ino_alloc.lock);
|
||||
INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
|
||||
spin_lock_init(&inf->deleting_items_lock);
|
||||
INIT_LIST_HEAD(&inf->deleting_items_list);
|
||||
INIT_WORK(&inf->iput_work, iput_worker);
|
||||
init_llist_head(&inf->iput_llist);
|
||||
|
||||
@@ -2000,9 +2099,7 @@ int scoutfs_inode_setup(struct super_block *sb)
|
||||
*/
|
||||
void scoutfs_inode_start(struct super_block *sb)
|
||||
{
|
||||
DECLARE_INODE_SB_INFO(sb, inf);
|
||||
|
||||
schedule_orphan_dwork(inf);
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -80,9 +80,13 @@ int scoutfs_drop_inode(struct inode *inode);
|
||||
void scoutfs_evict_inode(struct inode *inode);
|
||||
void scoutfs_inode_queue_iput(struct inode *inode);
|
||||
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf);
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
|
||||
#define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
|
||||
struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino);
|
||||
struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino);
|
||||
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino);
|
||||
void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
|
||||
u32 minor, u64 ino);
|
||||
int scoutfs_inode_index_start(struct super_block *sb, u64 *seq);
|
||||
@@ -102,9 +106,8 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
struct list_head *ind_locks);
|
||||
|
||||
int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev, u64 ino,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
|
||||
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret);
|
||||
|
||||
void scoutfs_inode_set_meta_seq(struct inode *inode);
|
||||
void scoutfs_inode_set_data_seq(struct inode *inode);
|
||||
@@ -117,14 +120,14 @@ u64 scoutfs_inode_data_version(struct inode *inode);
|
||||
void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off);
|
||||
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
|
||||
int flags);
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
|
||||
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
|
||||
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb);
|
||||
|
||||
void scoutfs_inode_queue_writeback(struct inode *inode);
|
||||
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
|
||||
|
||||
@@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
|
||||
if (sblock > eblock)
|
||||
return -EINVAL;
|
||||
|
||||
inode = scoutfs_ilookup(sb, args.ino);
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino);
|
||||
if (!inode) {
|
||||
ret = -ESTALE;
|
||||
goto out;
|
||||
@@ -1320,6 +1320,84 @@ out:
|
||||
return ret ?: count;
|
||||
}
|
||||
|
||||
static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct super_block *sb = file_inode(file)->i_sb;
|
||||
struct scoutfs_ioctl_get_allocated_inos __user *ugai = (void __user *)arg;
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_key key;
|
||||
struct scoutfs_key end;
|
||||
u64 __user *uinos;
|
||||
u64 bytes;
|
||||
u64 ino;
|
||||
int nr;
|
||||
int ret;
|
||||
|
||||
if (!(file->f_mode & FMODE_READ)) {
|
||||
ret = -EBADF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (copy_from_user(&gai, ugai, sizeof(gai))) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((gai.inos_ptr & (sizeof(__u64) - 1)) || (gai.inos_bytes < sizeof(__u64))) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_inode_init_key(&key, gai.start_ino);
|
||||
scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
|
||||
uinos = (void __user *)gai.inos_ptr;
|
||||
bytes = gai.inos_bytes;
|
||||
nr = 0;
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
while (bytes >= sizeof(*uinos)) {
|
||||
|
||||
ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (key.sk_zone != SCOUTFS_FS_ZONE) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* all fs items are owned by allocated inodes, and _first is always ino */
|
||||
ino = le64_to_cpu(key._sk_first);
|
||||
if (put_user(ino, uinos)) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
uinos++;
|
||||
bytes -= sizeof(*uinos);
|
||||
if (++nr == INT_MAX)
|
||||
break;
|
||||
|
||||
scoutfs_inode_init_key(&key, ino + 1);
|
||||
}
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
out:
|
||||
return ret ?: nr;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -1353,6 +1431,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_resize_devices(file, arg);
|
||||
case SCOUTFS_IOC_READ_XATTR_TOTALS:
|
||||
return scoutfs_ioc_read_xattr_totals(file, arg);
|
||||
case SCOUTFS_IOC_GET_ALLOCATED_INOS:
|
||||
return scoutfs_ioc_get_allocated_inos(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
@@ -520,4 +520,43 @@ struct scoutfs_ioctl_xattr_total {
|
||||
#define SCOUTFS_IOC_READ_XATTR_TOTALS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 15, struct scoutfs_ioctl_read_xattr_totals)
|
||||
|
||||
/*
|
||||
* This fills the caller's inos array with inode numbers that are in use
|
||||
* after the start ino, within an internal inode group.
|
||||
*
|
||||
* This only makes a promise about the state of the inode numbers within
|
||||
* the first and last numbers returned by one call. At one time, all of
|
||||
* those inodes were still allocated. They could have changed before
|
||||
* the call returned. And any numbers outside of the first and last
|
||||
* (or single) are undefined.
|
||||
*
|
||||
* This doesn't iterate over all allocated inodes, it only probes a
|
||||
* single group that the start inode is within. This interface was
|
||||
* first introduced to support tests that needed to find out about a
|
||||
* specific inode, while having some other similarly niche uses. It is
|
||||
* unsuitable for a consistent iteration over all the inode numbers in
|
||||
* use.
|
||||
*
|
||||
* This test of inode items doesn't serialize with the inode lifetime
|
||||
* mechanism. It only tells you the numbers of inodes that were once
|
||||
* active in the system and haven't yet been fully deleted. The inode
|
||||
* numbers returned could have been in the process of being deleted and
|
||||
* were already unreachable even before the call started.
|
||||
*
|
||||
* @start_ino: the first inode number that could be returned
|
||||
* @inos_ptr: pointer to an aligned array of 64bit inode numbers
|
||||
* @inos_bytes: the number of bytes available in the inos_ptr array
|
||||
*
|
||||
* Returns errors or the count of inode numbers returned, quite possibly
|
||||
* including 0.
|
||||
*/
|
||||
struct scoutfs_ioctl_get_allocated_inos {
|
||||
__u64 start_ino;
|
||||
__u64 inos_ptr;
|
||||
__u64 inos_bytes;
|
||||
};
|
||||
|
||||
#define SCOUTFS_IOC_GET_ALLOCATED_INOS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 16, struct scoutfs_ioctl_get_allocated_inos)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -142,7 +142,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode;
|
||||
|
||||
inode = scoutfs_ilookup(sb, ino);
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, ino);
|
||||
if (inode) {
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
@@ -255,7 +255,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
|
||||
BUG_ON(!list_empty(&lock->shrink_head));
|
||||
BUG_ON(!list_empty(&lock->cov_list));
|
||||
|
||||
scoutfs_omap_free_lock_data(lock->omap_data);
|
||||
kfree(lock->inode_deletion_data);
|
||||
kfree(lock);
|
||||
}
|
||||
|
||||
@@ -291,7 +291,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
|
||||
lock->mode = SCOUTFS_LOCK_NULL;
|
||||
|
||||
atomic64_set(&lock->forest_bloom_nr, 0);
|
||||
spin_lock_init(&lock->omap_spinlock);
|
||||
|
||||
trace_scoutfs_lock_alloc(sb, lock);
|
||||
|
||||
@@ -1050,7 +1049,7 @@ int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int
|
||||
goto out;
|
||||
|
||||
if (flags & SCOUTFS_LKF_REFRESH_INODE) {
|
||||
ret = scoutfs_inode_refresh(inode, *lock, flags);
|
||||
ret = scoutfs_inode_refresh(inode, *lock);
|
||||
if (ret < 0) {
|
||||
scoutfs_unlock(sb, *lock, mode);
|
||||
*lock = NULL;
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID
|
||||
|
||||
struct scoutfs_omap_lock;
|
||||
struct inode_deletion_lock_data;
|
||||
|
||||
/*
|
||||
* A few fields (start, end, refresh_gen, write_seq, granted_mode)
|
||||
@@ -47,9 +47,8 @@ struct scoutfs_lock {
|
||||
/* the forest tracks which log tree last saw bloom bit updates */
|
||||
atomic64_t forest_bloom_nr;
|
||||
|
||||
/* open ino mapping has a valid map for a held write lock */
|
||||
spinlock_t omap_spinlock;
|
||||
struct scoutfs_omap_lock_data *omap_data;
|
||||
/* inode deletion tracks some state per lock */
|
||||
struct inode_deletion_lock_data *inode_deletion_data;
|
||||
};
|
||||
|
||||
struct scoutfs_lock_coverage {
|
||||
|
||||
@@ -675,28 +675,18 @@ static void scoutfs_net_recv_worker(struct work_struct *work)
|
||||
|
||||
scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry);
|
||||
|
||||
/*
|
||||
* We want to drain the proc_workq in order to ensure that
|
||||
* that the inflight lock recovery work is fully flushed out
|
||||
* so that we can prevent the client/server racing trying to
|
||||
* do lock recovery and processing farewell at the same time.
|
||||
*/
|
||||
if (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn)
|
||||
drain_workqueue(conn->proc_workq);
|
||||
|
||||
/*
|
||||
* Initial received greetings and farewell are processed
|
||||
* Initial received greetings are processed
|
||||
* synchronously before any other incoming messages.
|
||||
*
|
||||
* Incoming requests or responses to the lock client are
|
||||
* called synchronously to avoid reordering.
|
||||
*/
|
||||
if (nh.cmd == SCOUTFS_NET_CMD_GREETING ||
|
||||
(nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) ||
|
||||
(nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn))
|
||||
scoutfs_net_proc_worker(&mrecv->proc_work);
|
||||
else
|
||||
queue_work(conn->proc_workq, &mrecv->proc_work);
|
||||
queue_work(conn->workq, &mrecv->proc_work);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
@@ -861,7 +851,6 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
|
||||
}
|
||||
|
||||
destroy_workqueue(conn->workq);
|
||||
destroy_workqueue(conn->proc_workq);
|
||||
scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry);
|
||||
kfree(conn->info);
|
||||
trace_scoutfs_conn_destroy_free(conn);
|
||||
@@ -1161,8 +1150,6 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
|
||||
/* wait for socket and proc work to finish, includes chained work */
|
||||
drain_workqueue(conn->workq);
|
||||
|
||||
drain_workqueue(conn->proc_workq);
|
||||
|
||||
/* tear down the sock now that all work is done */
|
||||
if (conn->sock) {
|
||||
sock_release(conn->sock);
|
||||
@@ -1360,7 +1347,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
conn->workq = alloc_workqueue("scoutfs_net_workq_%s",
|
||||
conn->workq = alloc_workqueue("scoutfs_net_%s",
|
||||
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
|
||||
name_suffix);
|
||||
if (!conn->workq) {
|
||||
@@ -1369,16 +1356,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
conn->proc_workq = alloc_workqueue("scoutfs_net_proc_workq_%s",
|
||||
WQ_UNBOUND | WQ_NON_REENTRANT, 0,
|
||||
name_suffix);
|
||||
if (!conn->proc_workq) {
|
||||
destroy_workqueue(conn->workq);
|
||||
kfree(conn->info);
|
||||
kfree(conn);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
conn->sb = sb;
|
||||
conn->notify_up = notify_up;
|
||||
conn->notify_down = notify_down;
|
||||
@@ -1411,14 +1388,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
return conn;
|
||||
}
|
||||
|
||||
/* Give the caller the client info of the connection. This is used by
|
||||
* server processing the server_farewell, and lock response/recovery.
|
||||
*/
|
||||
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn)
|
||||
{
|
||||
return conn->info;
|
||||
}
|
||||
|
||||
/*
|
||||
* Give the caller the client rid of the connection. This used by rare
|
||||
* server processing callers who want to send async responses after
|
||||
@@ -1803,23 +1772,6 @@ int scoutfs_net_response_node(struct super_block *sb,
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* The response function that was submitted with the request is not
|
||||
* called if the request is canceled here.
|
||||
*/
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id)
|
||||
{
|
||||
struct message_send *msend;
|
||||
|
||||
spin_lock(&conn->lock);
|
||||
msend = find_request(conn, cmd, id);
|
||||
if (msend)
|
||||
complete_send(conn, msend);
|
||||
spin_unlock(&conn->lock);
|
||||
}
|
||||
|
||||
struct sync_request_completion {
|
||||
struct completion comp;
|
||||
void *resp;
|
||||
|
||||
@@ -63,7 +63,6 @@ struct scoutfs_net_connection {
|
||||
atomic64_t recv_seq;
|
||||
|
||||
struct workqueue_struct *workq;
|
||||
struct workqueue_struct *proc_workq;
|
||||
struct work_struct listen_work;
|
||||
struct work_struct connect_work;
|
||||
struct work_struct send_work;
|
||||
@@ -116,7 +115,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
scoutfs_net_notify_t notify_up,
|
||||
scoutfs_net_notify_t notify_down, size_t info_size,
|
||||
scoutfs_net_request_t *req_funcs, char *name_suffix);
|
||||
void *scoutfs_net_client_info(struct scoutfs_net_connection *conn);
|
||||
u64 scoutfs_net_client_rid(struct scoutfs_net_connection *conn);
|
||||
int scoutfs_net_connect(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
@@ -136,9 +134,6 @@ int scoutfs_net_submit_request_node(struct super_block *sb,
|
||||
u64 rid, u8 cmd, void *arg, u16 arg_len,
|
||||
scoutfs_net_response_t resp_func,
|
||||
void *resp_data, u64 *id_ret);
|
||||
void scoutfs_net_cancel_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id);
|
||||
int scoutfs_net_sync_request(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, void *arg, unsigned arg_len,
|
||||
|
||||
296
kmod/src/omap.c
296
kmod/src/omap.c
@@ -30,27 +30,22 @@
|
||||
/*
|
||||
* As a client removes an inode from its cache with an nlink of 0 it
|
||||
* needs to decide if it is the last client using the inode and should
|
||||
* fully delete all its items. It needs to know if other mounts still
|
||||
* have the inode in use.
|
||||
* fully delete all the inode's items. It needs to know if other mounts
|
||||
* still have the inode in use.
|
||||
*
|
||||
* We need a way to communicate between mounts that an inode is open.
|
||||
* We need a way to communicate between mounts that an inode is in use.
|
||||
* We don't want to pay the synchronous per-file locking round trip
|
||||
* costs associated with per-inode open locks that you'd typically see
|
||||
* in systems to solve this problem.
|
||||
* in systems to solve this problem. The first prototypes of this
|
||||
* tracked open file handles so this was coined the open map, though it
|
||||
* now tracks cached inodes.
|
||||
*
|
||||
* Instead clients maintain open bitmaps that cover groups of inodes.
|
||||
* As inodes enter the cache their bit is set, and as the inode is
|
||||
* evicted the bit is cleared. As an inode is evicted messages are sent
|
||||
* around the cluster to get the current bitmaps for that inode's group
|
||||
* from all active mounts. If the inode's bit is clear then it can be
|
||||
* deleted.
|
||||
*
|
||||
* We associate the open bitmaps with our cluster locking of inode
|
||||
* groups to cache these open bitmaps. As long as we have the lock then
|
||||
* nlink can't be changed on any remote mounts. Specifically, it can't
|
||||
* increase from 0 so any clear bits can gain references on remote
|
||||
* mounts. As long as we have the lock, all clear bits in the group for
|
||||
* inodes with 0 nlink can be deleted.
|
||||
* Clients maintain bitmaps that cover groups of inodes. As inodes
|
||||
* enter the cache their bit is set and as the inode is evicted the bit
|
||||
* is cleared. As deletion is attempted, either by scanning orphans or
|
||||
* evicting an inode with an nlink of 0, messages are sent around the
|
||||
* cluster to get the current bitmaps for that inode's group from all
|
||||
* active mounts. If the inode's bit is clear then it can be deleted.
|
||||
*
|
||||
* This layer maintains a list of client rids to send messages to. The
|
||||
* server calls us as clients enter and leave the cluster. We can't
|
||||
@@ -85,14 +80,12 @@ struct omap_info {
|
||||
struct omap_info *name = SCOUTFS_SB(sb)->omap_info
|
||||
|
||||
/*
|
||||
* The presence of an inode in the inode cache increases the count of
|
||||
* its inode number's position within its lock group. These structs
|
||||
* track the counts for all the inodes in a lock group and maintain a
|
||||
* bitmap whose bits are set for each non-zero count.
|
||||
* The presence of an inode in the inode sets its bit in the lock
|
||||
* group's bitmap.
|
||||
*
|
||||
* We don't want to add additional global synchronization of inode cache
|
||||
* maintenance so these are tracked in an rcu hash table. Once their
|
||||
* total count reaches zero they're removed from the hash and queued for
|
||||
* total reaches zero they're removed from the hash and queued for
|
||||
* freeing and readers should ignore them.
|
||||
*/
|
||||
struct omap_group {
|
||||
@@ -102,7 +95,6 @@ struct omap_group {
|
||||
u64 nr;
|
||||
spinlock_t lock;
|
||||
unsigned int total;
|
||||
unsigned int *counts;
|
||||
__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
|
||||
};
|
||||
|
||||
@@ -111,8 +103,7 @@ do { \
|
||||
__typeof__(group) _grp = (group); \
|
||||
__typeof__(bit_nr) _nr = (bit_nr); \
|
||||
\
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr, \
|
||||
_nr < 0 ? -1 : _grp->counts[_nr]); \
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
@@ -134,18 +125,6 @@ struct omap_request {
|
||||
struct scoutfs_open_ino_map map;
|
||||
};
|
||||
|
||||
/*
|
||||
* In each inode group cluster lock we store data to track the open ino
|
||||
* map which tracks all the inodes that the cluster lock covers. When
|
||||
* the seq shows that the map is stale we send a request to update it.
|
||||
*/
|
||||
struct scoutfs_omap_lock_data {
|
||||
u64 seq;
|
||||
bool req_in_flight;
|
||||
wait_queue_head_t waitq;
|
||||
struct scoutfs_open_ino_map map;
|
||||
};
|
||||
|
||||
static inline void init_rid_list(struct omap_rid_list *list)
|
||||
{
|
||||
INIT_LIST_HEAD(&list->head);
|
||||
@@ -232,7 +211,7 @@ static void free_rids(struct omap_rid_list *list)
|
||||
}
|
||||
}
|
||||
|
||||
static void calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
{
|
||||
*group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
|
||||
*bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
|
||||
@@ -242,21 +221,13 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
{
|
||||
struct omap_group *group;
|
||||
|
||||
BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE);
|
||||
|
||||
group = kzalloc(sizeof(struct omap_group), GFP_NOFS);
|
||||
if (group) {
|
||||
group->sb = sb;
|
||||
group->nr = group_nr;
|
||||
spin_lock_init(&group->lock);
|
||||
|
||||
group->counts = (void *)get_zeroed_page(GFP_NOFS);
|
||||
if (!group->counts) {
|
||||
kfree(group);
|
||||
group = NULL;
|
||||
} else {
|
||||
trace_group(sb, alloc, group, -1);
|
||||
}
|
||||
trace_group(sb, alloc, group, -1);
|
||||
}
|
||||
|
||||
return group;
|
||||
@@ -265,7 +236,6 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
static void free_group(struct super_block *sb, struct omap_group *group)
|
||||
{
|
||||
trace_group(sb, free, group, -1);
|
||||
free_page((unsigned long)group->counts);
|
||||
kfree(group);
|
||||
}
|
||||
|
||||
@@ -283,13 +253,16 @@ static const struct rhashtable_params group_ht_params = {
|
||||
};
|
||||
|
||||
/*
|
||||
* Track an cached inode in its group. Our increment can be racing with
|
||||
* a final decrement that removes the group from the hash, sets total to
|
||||
* Track an cached inode in its group. Our set can be racing with a
|
||||
* final clear that removes the group from the hash, sets total to
|
||||
* UINT_MAX, and calls rcu free. We can retry until the dead group is
|
||||
* no longer visible in the hash table and we can insert a new allocated
|
||||
* group.
|
||||
*
|
||||
* The caller must ensure that the bit is clear, -EEXIST will be
|
||||
* returned otherwise.
|
||||
*/
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino)
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
@@ -298,7 +271,7 @@ int scoutfs_omap_inc(struct super_block *sb, u64 ino)
|
||||
bool found;
|
||||
int ret = 0;
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
retry:
|
||||
found = false;
|
||||
@@ -308,10 +281,10 @@ retry:
|
||||
spin_lock(&group->lock);
|
||||
if (group->total < UINT_MAX) {
|
||||
found = true;
|
||||
if (group->counts[bit_nr]++ == 0) {
|
||||
set_bit_le(bit_nr, group->bits);
|
||||
if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits)))
|
||||
ret = -EEXIST;
|
||||
else
|
||||
group->total++;
|
||||
}
|
||||
}
|
||||
trace_group(sb, inc, group, bit_nr);
|
||||
spin_unlock(&group->lock);
|
||||
@@ -342,29 +315,50 @@ retry:
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool scoutfs_omap_test(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
bool ret = false;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
rcu_read_lock();
|
||||
group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
|
||||
if (group) {
|
||||
spin_lock(&group->lock);
|
||||
ret = !!test_bit_le(bit_nr, group->bits);
|
||||
spin_unlock(&group->lock);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrement a previously incremented ino count. Not finding a count
|
||||
* implies imbalanced inc/dec or bugs freeing groups. We only free
|
||||
* groups here as the last dec drops the group's total count to 0.
|
||||
* Clear a previously set ino bit. Trying to clear a bit that's already
|
||||
* clear implies imbalanced set/clear or bugs freeing groups. We only
|
||||
* free groups here as the last clear drops the group's total to 0.
|
||||
*/
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino)
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
rcu_read_lock();
|
||||
group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
|
||||
if (group) {
|
||||
spin_lock(&group->lock);
|
||||
WARN_ON_ONCE(group->counts[bit_nr] == 0);
|
||||
WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits));
|
||||
WARN_ON_ONCE(group->total == 0);
|
||||
WARN_ON_ONCE(group->total == UINT_MAX);
|
||||
if (--group->counts[bit_nr] == 0) {
|
||||
clear_bit_le(bit_nr, group->bits);
|
||||
if (test_and_clear_bit_le(bit_nr, group->bits)) {
|
||||
if (--group->total == 0) {
|
||||
group->total = UINT_MAX;
|
||||
rhashtable_remove_fast(&ominf->group_ht, &group->ht_head,
|
||||
@@ -664,8 +658,7 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id,
|
||||
|
||||
/*
|
||||
* The client is receiving a request from the server for its map for the
|
||||
* given group. Look up the group and copy the bits to the map for
|
||||
* non-zero open counts.
|
||||
* given group. Look up the group and copy the bits to the map.
|
||||
*
|
||||
* The mount originating the request for this bitmap has the inode group
|
||||
* write locked. We can't be adding links to any inodes in the group
|
||||
@@ -814,179 +807,6 @@ void scoutfs_omap_server_shutdown(struct super_block *sb)
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
bool in_flight;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
in_flight = ldata->req_in_flight;
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
return in_flight;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure the map covered by the cluster lock is current. The caller
|
||||
* holds the cluster lock so once we store lock_data on the cluster lock
|
||||
* it won't be freed and the write_seq in the cluster lock won't change.
|
||||
*
|
||||
* The omap_spinlock protects the omap_data in the cluster lock. We
|
||||
* have to drop it if we have to block to allocate lock_data, send a
|
||||
* request for a new map, or wait for a request in flight to finish.
|
||||
*/
|
||||
static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr)
|
||||
{
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
bool send_req;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
ldata = lock->omap_data;
|
||||
if (ldata == NULL) {
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS);
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
if (!ldata) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (lock->omap_data == NULL) {
|
||||
ldata->seq = lock->write_seq - 1; /* ensure refresh */
|
||||
init_waitqueue_head(&ldata->waitq);
|
||||
|
||||
lock->omap_data = ldata;
|
||||
} else {
|
||||
kfree(ldata);
|
||||
ldata = lock->omap_data;
|
||||
}
|
||||
}
|
||||
|
||||
while (ldata->seq != lock->write_seq) {
|
||||
/* only one waiter sends a request at a time */
|
||||
if (!ldata->req_in_flight) {
|
||||
ldata->req_in_flight = true;
|
||||
send_req = true;
|
||||
} else {
|
||||
send_req = false;
|
||||
}
|
||||
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
if (send_req)
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
|
||||
else
|
||||
wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata));
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
/* only sender can return error, other waiters retry */
|
||||
if (send_req) {
|
||||
ldata->req_in_flight = false;
|
||||
if (ret == 0)
|
||||
ldata->seq = lock->write_seq;
|
||||
wake_up(&ldata->waitq);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
if (ret == 0)
|
||||
*ldata_ret = ldata;
|
||||
else
|
||||
*ldata_ret = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return 1 and give the caller their locks when they should delete the
|
||||
* inode items. It's safe to delete the inode items when it is no
|
||||
* longer reachable and nothing is referencing it.
|
||||
*
|
||||
* The inode is unreachable when nlink hits zero. Cluster locks protect
|
||||
* modification and testing of nlink. We use the ino_lock_cov covrage
|
||||
* to short circuit the common case of having a locked inode that hasn't
|
||||
* been deleted. If it isn't locked, we have to acquire the lock to
|
||||
* refresh the inode to see its current nlink.
|
||||
*
|
||||
* Then we use an open inode bitmap that covers all the inodes in the
|
||||
* lock group to determine if the inode is present in any other mount's
|
||||
* caches. We refresh it by asking the server for all clients' maps and
|
||||
* then store it in the lock. As long as we hold the lock nothing can
|
||||
* increase nlink from zero and let people get a reference to the inode.
|
||||
*/
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
/* lock group and omap constants are defined independently */
|
||||
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
|
||||
|
||||
if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
/* only one request to refresh the map at a time */
|
||||
ret = get_current_lock_data(sb, lock, &ldata, group_nr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* can delete caller's zero nlink inode if it's not cached in other mounts */
|
||||
ret = !test_bit_le(bit_nr, ldata->map.bits);
|
||||
out:
|
||||
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
|
||||
|
||||
if (ret > 0) {
|
||||
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
|
||||
if (err < 0)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
if (ret <= 0) {
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
lock = NULL;
|
||||
}
|
||||
|
||||
*lock_ret = lock;
|
||||
*orph_lock_ret = orph_lock;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
if (ldata) {
|
||||
WARN_ON_ONCE(ldata->req_in_flight);
|
||||
WARN_ON_ONCE(waitqueue_active(&ldata->waitq));
|
||||
kfree(ldata);
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_omap_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
#ifndef _SCOUTFS_OMAP_H_
|
||||
#define _SCOUTFS_OMAP_H_
|
||||
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino);
|
||||
bool scoutfs_omap_test(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
|
||||
struct scoutfs_open_ino_map_args *args);
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr);
|
||||
|
||||
int scoutfs_omap_add_rid(struct super_block *sb, u64 rid);
|
||||
int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid);
|
||||
|
||||
@@ -26,22 +26,30 @@
|
||||
#include "msg.h"
|
||||
#include "options.h"
|
||||
#include "super.h"
|
||||
#include "inode.h"
|
||||
|
||||
enum {
|
||||
Opt_metadev_path,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
static const match_table_t tokens = {
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
|
||||
struct options_sb_info {
|
||||
struct dentry *debugfs_dir;
|
||||
struct options_info {
|
||||
seqlock_t seqlock;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_sysfs_attrs sysfs_attrs;
|
||||
};
|
||||
|
||||
u32 scoutfs_option_u32(struct super_block *sb, int token)
|
||||
{
|
||||
WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
#define DECLARE_OPTIONS_INFO(sb, name) \
|
||||
struct options_info *name = SCOUTFS_SB(sb)->options_info
|
||||
|
||||
static int parse_bdev_path(struct super_block *sb, substring_t *substr,
|
||||
char **bdev_path_ret)
|
||||
@@ -89,8 +97,29 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed)
|
||||
static void free_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
kfree(opts->metadev_path);
|
||||
}
|
||||
|
||||
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
|
||||
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
memset(opts, 0, sizeof(*opts));
|
||||
opts->quorum_slot_nr = -1;
|
||||
opts->orphan_scan_delay_ms = DEFAULT_ORPHAN_SCAN_DELAY_MS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse the option string into our options struct. This can allocate
|
||||
* memory in the struct. The caller is responsible for always calling
|
||||
* free_options() when the struct is destroyed, including when we return
|
||||
* an error.
|
||||
*/
|
||||
static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts)
|
||||
{
|
||||
substring_t args[MAX_OPT_ARGS];
|
||||
int nr;
|
||||
@@ -98,49 +127,61 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
char *p;
|
||||
int ret;
|
||||
|
||||
/* Set defaults */
|
||||
memset(parsed, 0, sizeof(*parsed));
|
||||
parsed->quorum_slot_nr = -1;
|
||||
|
||||
while ((p = strsep(&options, ",")) != NULL) {
|
||||
if (!*p)
|
||||
continue;
|
||||
|
||||
token = match_token(p, tokens, args);
|
||||
switch (token) {
|
||||
case Opt_quorum_slot_nr:
|
||||
|
||||
if (parsed->quorum_slot_nr != -1) {
|
||||
case Opt_metadev_path:
|
||||
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case Opt_orphan_scan_delay_ms:
|
||||
if (opts->orphan_scan_delay_ms != -1) {
|
||||
scoutfs_err(sb, "multiple orphan_scan_delay_ms options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 ||
|
||||
nr < MIN_ORPHAN_SCAN_DELAY_MS || nr > MAX_ORPHAN_SCAN_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid orphan_scan_delay_ms option, must be between %lu and %lu",
|
||||
MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->orphan_scan_delay_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_quorum_slot_nr:
|
||||
if (opts->quorum_slot_nr != -1) {
|
||||
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 || nr < 0 ||
|
||||
nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
if (ret < 0 || nr < 0 || nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u",
|
||||
SCOUTFS_QUORUM_MAX_SLOTS - 1);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
parsed->quorum_slot_nr = nr;
|
||||
opts->quorum_slot_nr = nr;
|
||||
break;
|
||||
case Opt_metadev_path:
|
||||
|
||||
ret = parse_bdev_path(sb, &args[0],
|
||||
&parsed->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
default:
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
|
||||
p);
|
||||
break;
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
if (!parsed->metadev_path) {
|
||||
if (!opts->metadev_path) {
|
||||
scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -148,40 +189,181 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
unsigned int seq;
|
||||
|
||||
if (WARN_ON_ONCE(optinf == NULL)) {
|
||||
/* trying to use options before early setup or after destroy */
|
||||
init_default_options(opts);
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&optinf->seqlock);
|
||||
memcpy(opts, &optinf->opts, sizeof(struct scoutfs_mount_options));
|
||||
} while (read_seqretry(&optinf->seqlock, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Early setup that parses and stores the options so that the rest of
|
||||
* setup can use them. Full options setup that relies on other
|
||||
* components will be done later.
|
||||
*/
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct options_sb_info *osi;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct options_info *optinf;
|
||||
int ret;
|
||||
|
||||
osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL);
|
||||
if (!osi)
|
||||
return -ENOMEM;
|
||||
init_default_options(&opts);
|
||||
|
||||
sbi->options = osi;
|
||||
ret = parse_options(sb, options, &opts);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root);
|
||||
if (!osi->debugfs_dir) {
|
||||
optinf = kzalloc(sizeof(struct options_info), GFP_KERNEL);
|
||||
if (!optinf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
seqlock_init(&optinf->seqlock);
|
||||
scoutfs_sysfs_init_attrs(sb, &optinf->sysfs_attrs);
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts = opts;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
sbi->options_info = optinf;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret)
|
||||
if (ret < 0)
|
||||
free_options(&opts);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
|
||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||
if (opts.quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts.metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t orphan_scan_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.orphan_scan_delay_ms);
|
||||
}
|
||||
static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[20]; /* more than enough for octal -U32_MAX */
|
||||
long val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtol(nullterm, 0, &val);
|
||||
if (ret < 0 || val < MIN_ORPHAN_SCAN_DELAY_MS || val > MAX_ORPHAN_SCAN_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid orphan_scan_delay_ms value written to options sysfs file, must be between %lu and %lu",
|
||||
MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.orphan_scan_delay_ms = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
|
||||
return count;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
|
||||
|
||||
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts.quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &optinf->sysfs_attrs, options_attrs, "mount_options");
|
||||
if (ret < 0)
|
||||
scoutfs_options_destroy(sb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We remove the sysfs files early in unmount so that they can't try to call other subsystems
|
||||
* as they're being destroyed.
|
||||
*/
|
||||
void scoutfs_options_stop(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
|
||||
if (optinf)
|
||||
scoutfs_sysfs_destroy_attrs(sb, &optinf->sysfs_attrs);
|
||||
}
|
||||
|
||||
void scoutfs_options_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct options_sb_info *osi = sbi->options;
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
|
||||
if (osi) {
|
||||
if (osi->debugfs_dir)
|
||||
debugfs_remove_recursive(osi->debugfs_dir);
|
||||
kfree(osi);
|
||||
sbi->options = NULL;
|
||||
scoutfs_options_stop(sb);
|
||||
|
||||
if (optinf) {
|
||||
free_options(&optinf->opts);
|
||||
kfree(optinf);
|
||||
sbi->options_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,23 +5,19 @@
|
||||
#include <linux/in.h>
|
||||
#include "format.h"
|
||||
|
||||
enum scoutfs_mount_options {
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_metadev_path,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
struct mount_options {
|
||||
int quorum_slot_nr;
|
||||
struct scoutfs_mount_options {
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
};
|
||||
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed);
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
||||
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options);
|
||||
int scoutfs_options_setup(struct super_block *sb);
|
||||
void scoutfs_options_stop(struct super_block *sb);
|
||||
void scoutfs_options_destroy(struct super_block *sb);
|
||||
|
||||
u32 scoutfs_option_u32(struct super_block *sb, int token);
|
||||
#define scoutfs_option_bool scoutfs_option_u32
|
||||
|
||||
#endif /* _SCOUTFS_OPTIONS_H_ */
|
||||
|
||||
@@ -116,6 +116,7 @@ struct quorum_info {
|
||||
struct socket *sock;
|
||||
bool shutdown;
|
||||
|
||||
int our_quorum_slot_nr;
|
||||
unsigned long flags;
|
||||
int votes_needed;
|
||||
|
||||
@@ -160,9 +161,7 @@ static ktime_t heartbeat_timeout(void)
|
||||
static int create_socket(struct super_block *sb)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct socket *sock = NULL;
|
||||
struct sockaddr_in sin;
|
||||
int addrlen;
|
||||
@@ -176,7 +175,7 @@ static int create_socket(struct super_block *sb)
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);
|
||||
|
||||
addrlen = sizeof(sin);
|
||||
ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
|
||||
@@ -207,7 +206,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
int only)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
ktime_t now;
|
||||
int i;
|
||||
@@ -216,7 +214,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
.fsid = super->hdr.fsid,
|
||||
.term = cpu_to_le64(term),
|
||||
.type = type,
|
||||
.from = opts->quorum_slot_nr,
|
||||
.from = qinf->our_quorum_slot_nr,
|
||||
};
|
||||
struct kvec kv = {
|
||||
.iov_base = &qmes,
|
||||
@@ -238,7 +236,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i) ||
|
||||
(only >= 0 && i != only) || i == opts->quorum_slot_nr)
|
||||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
@@ -476,8 +474,8 @@ static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_
|
||||
*/
|
||||
static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
|
||||
{
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr;
|
||||
struct scoutfs_quorum_block blk;
|
||||
int ret;
|
||||
|
||||
@@ -622,7 +620,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
{
|
||||
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
|
||||
struct super_block *sb = qinf->sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
struct quorum_status qst;
|
||||
@@ -724,7 +721,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
qst.term++;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
set_bit(opts->quorum_slot_nr, &qst.vote_bits);
|
||||
set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE,
|
||||
qst.term);
|
||||
qst.timeout = election_timeout();
|
||||
@@ -954,7 +951,6 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts;
|
||||
struct quorum_status qst;
|
||||
struct last_msg last;
|
||||
struct timespec64 ts;
|
||||
@@ -971,7 +967,7 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
ret = 0;
|
||||
|
||||
snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n",
|
||||
opts->quorum_slot_nr);
|
||||
qinf->our_quorum_slot_nr);
|
||||
snprintf_ret(buf, size, &ret, "term %llu\n",
|
||||
qst.term);
|
||||
snprintf_ret(buf, size, &ret, "role %d (%s)\n",
|
||||
@@ -1048,7 +1044,6 @@ static inline bool valid_ipv4_port(__be16 port)
|
||||
static int verify_quorum_slots(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct sockaddr_in other;
|
||||
@@ -1099,7 +1094,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!quorum_slot_present(super, opts->quorum_slot_nr)) {
|
||||
if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
|
||||
char *str = slots;
|
||||
*str = '\0';
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
@@ -1114,7 +1109,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
}
|
||||
}
|
||||
scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s",
|
||||
opts->quorum_slot_nr, slots);
|
||||
qinf->our_quorum_slot_nr, slots);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -1137,11 +1132,12 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
int scoutfs_quorum_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct quorum_info *qinf;
|
||||
int ret;
|
||||
|
||||
if (opts->quorum_slot_nr < 0)
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr < 0)
|
||||
return 0;
|
||||
|
||||
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
|
||||
@@ -1153,6 +1149,8 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
spin_lock_init(&qinf->show_lock);
|
||||
INIT_WORK(&qinf->work, scoutfs_quorum_worker);
|
||||
scoutfs_sysfs_init_attrs(sb, &qinf->ssa);
|
||||
/* static for the lifetime of the mount */
|
||||
qinf->our_quorum_slot_nr = opts.quorum_slot_nr;
|
||||
|
||||
sbi->quorum_info = qinf;
|
||||
qinf->sb = sb;
|
||||
|
||||
@@ -2620,9 +2620,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page,
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
int bit_nr),
|
||||
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
@@ -2630,7 +2630,6 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__field(__u64, group_nr)
|
||||
__field(unsigned int, group_total)
|
||||
__field(int, bit_nr)
|
||||
__field(int, bit_count)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -2639,43 +2638,42 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__entry->group_nr = group_nr;
|
||||
__entry->group_total = group_total;
|
||||
__entry->bit_nr = bit_nr;
|
||||
__entry->bit_count = bit_count;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d",
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d",
|
||||
SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total,
|
||||
__entry->bit_nr, __entry->bit_count)
|
||||
__entry->bit_nr)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_omap_should_delete,
|
||||
|
||||
@@ -122,7 +122,6 @@ struct server_info {
|
||||
struct server_client_info {
|
||||
u64 rid;
|
||||
struct list_head head;
|
||||
bool received_farewell;
|
||||
};
|
||||
|
||||
static __le64 *first_valopt(struct scoutfs_volume_options *valopt)
|
||||
@@ -1506,15 +1505,11 @@ static int server_lock(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct server_client_info *sci = scoutfs_net_client_info(conn);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
|
||||
if (arg_len != sizeof(struct scoutfs_net_lock))
|
||||
return -EINVAL;
|
||||
|
||||
if (sci->received_farewell)
|
||||
return scoutfs_net_response(sb, conn, cmd, id, -EINVAL, NULL, 0);
|
||||
|
||||
return scoutfs_lock_server_request(sb, rid, id, arg);
|
||||
}
|
||||
|
||||
@@ -1523,15 +1518,11 @@ static int lock_response(struct super_block *sb,
|
||||
void *resp, unsigned int resp_len,
|
||||
int error, void *data)
|
||||
{
|
||||
struct server_client_info *sci = scoutfs_net_client_info(conn);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
|
||||
if (resp_len != sizeof(struct scoutfs_net_lock))
|
||||
return -EINVAL;
|
||||
|
||||
if (sci->received_farewell)
|
||||
return 0;
|
||||
|
||||
return scoutfs_lock_server_response(sb, rid, resp);
|
||||
}
|
||||
|
||||
@@ -1569,15 +1560,11 @@ static int lock_recover_response(struct super_block *sb,
|
||||
void *resp, unsigned int resp_len,
|
||||
int error, void *data)
|
||||
{
|
||||
struct server_client_info *sci = scoutfs_net_client_info(conn);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
|
||||
if (invalid_recover(resp, resp_len))
|
||||
return -EINVAL;
|
||||
|
||||
if (sci->received_farewell)
|
||||
return 0;
|
||||
|
||||
return scoutfs_lock_server_recover_response(sb, rid, resp);
|
||||
}
|
||||
|
||||
@@ -2468,15 +2455,14 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
}
|
||||
|
||||
/* find the completion's original saved request */
|
||||
ret = next_log_merge_item(sb, &super->log_merge,
|
||||
SCOUTFS_LOG_MERGE_REQUEST_ZONE,
|
||||
rid, le64_to_cpu(comp->seq),
|
||||
&orig_req, sizeof(orig_req));
|
||||
if (WARN_ON_ONCE(ret == 0 && (comp->rid != orig_req.rid ||
|
||||
comp->seq != orig_req.seq)))
|
||||
ret = -ENOENT; /* inconsistency */
|
||||
ret = next_log_merge_item(sb, &super->log_merge, SCOUTFS_LOG_MERGE_REQUEST_ZONE,
|
||||
rid, le64_to_cpu(comp->seq), &orig_req, sizeof(orig_req));
|
||||
if (ret == 0 && (comp->rid != orig_req.rid || comp->seq != orig_req.seq))
|
||||
ret = -ENOENT;
|
||||
if (ret < 0) {
|
||||
err_str = "finding orig request";
|
||||
/* ENOENT is expected for resent processed completion */
|
||||
if (ret != -ENOENT)
|
||||
err_str = "finding orig request";
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -2546,7 +2532,7 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
out:
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
|
||||
if (ret < 0)
|
||||
if (ret < 0 && err_str)
|
||||
scoutfs_err(sb, "error %d committing log merge: %s", ret, err_str);
|
||||
|
||||
err = scoutfs_server_apply_commit(sb, ret);
|
||||
@@ -3462,6 +3448,18 @@ static void farewell_worker(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Responses that are ready to send can be further delayed by
|
||||
* moving them back to the reqs list.
|
||||
*/
|
||||
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
||||
/* finish lock recovery before destroying locks, fenced if too long */
|
||||
if (scoutfs_recov_is_pending(sb, fw->rid, SCOUTFS_RECOV_LOCKS)) {
|
||||
list_move_tail(&fw->entry, &reqs);
|
||||
quo_reqs++;
|
||||
}
|
||||
}
|
||||
|
||||
/* clean up resources for mounts before sending responses */
|
||||
list_for_each_entry_safe(fw, tmp, &send, entry) {
|
||||
ret = reclaim_rid(sb, fw->rid);
|
||||
@@ -3529,11 +3527,9 @@ static int server_farewell(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct server_client_info *sci = scoutfs_net_client_info(conn);
|
||||
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct farewell_request *fw;
|
||||
int ret;
|
||||
|
||||
if (arg_len != 0)
|
||||
return -EINVAL;
|
||||
@@ -3551,20 +3547,6 @@ static int server_farewell(struct super_block *sb,
|
||||
list_add_tail(&fw->entry, &server->farewell_requests);
|
||||
spin_unlock(&server->farewell_lock);
|
||||
|
||||
/*
|
||||
* Tear down client lock server state and set that we recieved farewell
|
||||
* to ensure that we do not race between client and server trying to process
|
||||
* lock recovery at the same time (race). We also want to mark that the recovery
|
||||
* finished so that if client's try to send stuff later; the server doesnt care.
|
||||
*/
|
||||
sci->received_farewell = true;
|
||||
ret = scoutfs_lock_server_farewell(sb, rid);
|
||||
if (ret < 0) {
|
||||
kfree(fw);
|
||||
return ret;
|
||||
}
|
||||
scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS);
|
||||
|
||||
queue_farewell_work(server);
|
||||
|
||||
/* response will be sent later */
|
||||
@@ -3656,8 +3638,14 @@ static void finished_recovery(struct super_block *sb)
|
||||
|
||||
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
if (scoutfs_recov_finish(sb, rid, which) > 0)
|
||||
finished_recovery(sb);
|
||||
|
||||
/* rid's farewell response might be sent after it finishes lock recov */
|
||||
if (which & SCOUTFS_RECOV_LOCKS)
|
||||
queue_farewell_work(server);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3854,8 +3842,8 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_net_connection *conn = NULL;
|
||||
struct scoutfs_mount_options opts;
|
||||
DECLARE_WAIT_QUEUE_HEAD(waitq);
|
||||
struct sockaddr_in sin;
|
||||
bool alloc_init = false;
|
||||
@@ -3864,7 +3852,8 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
|
||||
trace_scoutfs_server_work_enter(sb, 0, 0);
|
||||
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_options_read(sb, &opts);
|
||||
scoutfs_quorum_slot_sin(super, opts.quorum_slot_nr, &sin);
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
|
||||
@@ -132,44 +132,6 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
if (opts->quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr);
|
||||
seq_printf(seq, ",metadev_path=%s", opts->metadev_path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t quorum_server_nr_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_server_nr);
|
||||
|
||||
static struct attribute *mount_options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(quorum_server_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int scoutfs_sync_fs(struct super_block *sb, int wait)
|
||||
{
|
||||
trace_scoutfs_sync_fs(sb, wait);
|
||||
@@ -246,13 +208,11 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
scoutfs_destroy_triggers(sb);
|
||||
scoutfs_fence_destroy(sb);
|
||||
scoutfs_options_destroy(sb);
|
||||
scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
|
||||
debugfs_remove(sbi->debug_root);
|
||||
scoutfs_destroy_counters(sb);
|
||||
scoutfs_destroy_sysfs(sb);
|
||||
scoutfs_metadev_close(sb);
|
||||
|
||||
kfree(sbi->opts.metadev_path);
|
||||
kfree(sbi);
|
||||
|
||||
sb->s_fs_info = NULL;
|
||||
@@ -282,7 +242,7 @@ static const struct super_operations scoutfs_super_ops = {
|
||||
.destroy_inode = scoutfs_destroy_inode,
|
||||
.sync_fs = scoutfs_sync_fs,
|
||||
.statfs = scoutfs_statfs,
|
||||
.show_options = scoutfs_show_options,
|
||||
.show_options = scoutfs_options_show,
|
||||
.put_super = scoutfs_put_super,
|
||||
.umount_begin = scoutfs_umount_begin,
|
||||
};
|
||||
@@ -511,9 +471,9 @@ out:
|
||||
|
||||
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi;
|
||||
struct mount_options opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct block_device *meta_bdev;
|
||||
struct scoutfs_sb_info *sbi;
|
||||
struct inode *inode;
|
||||
int ret;
|
||||
|
||||
@@ -541,13 +501,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
spin_lock_init(&sbi->next_ino_lock);
|
||||
spin_lock_init(&sbi->data_wait_root.lock);
|
||||
sbi->data_wait_root.root = RB_ROOT;
|
||||
scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa);
|
||||
|
||||
ret = scoutfs_parse_options(sb, data, &opts);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sbi->opts = opts;
|
||||
/* parse options early for use during setup */
|
||||
ret = scoutfs_options_early_setup(sb, data);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
|
||||
if (ret != SCOUTFS_BLOCK_SM_SIZE) {
|
||||
@@ -556,9 +515,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
}
|
||||
|
||||
meta_bdev =
|
||||
blkdev_get_by_path(sbi->opts.metadev_path,
|
||||
SCOUTFS_META_BDEV_MODE, sb);
|
||||
meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb);
|
||||
if (IS_ERR(meta_bdev)) {
|
||||
scoutfs_err(sb, "could not open metadev: error %ld",
|
||||
PTR_ERR(meta_bdev));
|
||||
@@ -578,8 +535,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
scoutfs_setup_sysfs(sb) ?:
|
||||
scoutfs_setup_counters(sb) ?:
|
||||
scoutfs_options_setup(sb) ?:
|
||||
scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
|
||||
mount_options_attrs, "mount_options") ?:
|
||||
scoutfs_setup_triggers(sb) ?:
|
||||
scoutfs_fence_setup(sb) ?:
|
||||
scoutfs_block_setup(sb) ?:
|
||||
@@ -601,7 +556,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
|
||||
/* this interruptible iget lets hung mount be aborted with ctl-c */
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE);
|
||||
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE, 0);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
if (ret == -ERESTARTSYS)
|
||||
@@ -652,6 +607,7 @@ static void scoutfs_kill_sb(struct super_block *sb)
|
||||
}
|
||||
|
||||
if (SCOUTFS_HAS_SBI(sb)) {
|
||||
scoutfs_options_stop(sb);
|
||||
scoutfs_inode_orphan_stop(sb);
|
||||
scoutfs_lock_unmount_begin(sb);
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ struct scoutfs_sb_info {
|
||||
|
||||
spinlock_t next_ino_lock;
|
||||
|
||||
struct options_info *options_info;
|
||||
struct data_info *data_info;
|
||||
struct inode_sb_info *inode_sb_info;
|
||||
struct btree_info *btree_info;
|
||||
@@ -74,10 +75,6 @@ struct scoutfs_sb_info {
|
||||
struct scoutfs_counters *counters;
|
||||
struct scoutfs_triggers *triggers;
|
||||
|
||||
struct mount_options opts;
|
||||
struct options_sb_info *options;
|
||||
struct scoutfs_sysfs_attrs mopts_ssa;
|
||||
|
||||
struct dentry *debug_root;
|
||||
|
||||
bool forced_unmount;
|
||||
|
||||
@@ -640,6 +640,7 @@ void scoutfs_shutdown_trans(struct super_block *sb)
|
||||
tri->write_workq = NULL;
|
||||
}
|
||||
|
||||
scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri);
|
||||
scoutfs_block_writer_forget_all(sb, &tri->wri);
|
||||
|
||||
kfree(tri);
|
||||
|
||||
1
tests/.gitignore
vendored
1
tests/.gitignore
vendored
@@ -3,6 +3,7 @@ src/createmany
|
||||
src/dumb_renameat2
|
||||
src/dumb_setxattr
|
||||
src/handle_cat
|
||||
src/handle_fsetxattr
|
||||
src/bulk_create_paths
|
||||
src/find_xattrs
|
||||
src/stage_tmpfile
|
||||
|
||||
@@ -6,6 +6,7 @@ BIN := src/createmany \
|
||||
src/dumb_renameat2 \
|
||||
src/dumb_setxattr \
|
||||
src/handle_cat \
|
||||
src/handle_fsetxattr \
|
||||
src/bulk_create_paths \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
|
||||
@@ -56,8 +56,11 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .*: all clients recovered"
|
||||
re="$re|scoutfs .* error: client rid.*lock recovery timed out"
|
||||
|
||||
# some tests mount w/o options
|
||||
# we test bad devices and options
|
||||
re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
|
||||
re="$re|scoutfs .* error: meta_super META flag not set"
|
||||
re="$re|scoutfs .* error: could not open metadev:.*"
|
||||
re="$re|scoutfs .* error: Unknown or malformed option,.*"
|
||||
|
||||
# in debugging kernels we can slow things down a bit
|
||||
re="$re|hrtimer: interrupt took .*"
|
||||
|
||||
@@ -362,3 +362,49 @@ t_wait_for_leader() {
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
t_set_sysfs_mount_option() {
|
||||
local nr="$1"
|
||||
local name="$2"
|
||||
local val="$3"
|
||||
local opt="$(t_sysfs_path $nr)/mount_options/$name"
|
||||
|
||||
echo "$val" > "$opt"
|
||||
}
|
||||
|
||||
t_set_all_sysfs_mount_options() {
|
||||
local name="$1"
|
||||
local val="$2"
|
||||
local i
|
||||
|
||||
for i in $(t_fs_nrs); do
|
||||
t_set_sysfs_mount_option $i $name $val
|
||||
done
|
||||
}
|
||||
|
||||
declare -A _saved_opts
|
||||
t_save_all_sysfs_mount_options() {
|
||||
local name="$1"
|
||||
local ind
|
||||
local opt
|
||||
local i
|
||||
|
||||
for i in $(t_fs_nrs); do
|
||||
opt="$(t_sysfs_path $i)/mount_options/$name"
|
||||
ind="$name_$i"
|
||||
|
||||
_saved_opts[$ind]="$(cat $opt)"
|
||||
done
|
||||
}
|
||||
|
||||
t_restore_all_sysfs_mount_options() {
|
||||
local name="$1"
|
||||
local ind
|
||||
local i
|
||||
|
||||
for i in $(t_fs_nrs); do
|
||||
ind="$name_$i"
|
||||
|
||||
t_set_sysfs_mount_option $i $name "${_saved_opts[$ind]}"
|
||||
done
|
||||
}
|
||||
|
||||
6
tests/golden/basic-bad-mounts
Normal file
6
tests/golden/basic-bad-mounts
Normal file
@@ -0,0 +1,6 @@
|
||||
== prepare devices, mount point, and logs
|
||||
== bad devices, bad options
|
||||
== swapped devices
|
||||
== both meta devices
|
||||
== both data devices
|
||||
== good volume, bad option and good options
|
||||
3
tests/golden/fallocate
Normal file
3
tests/golden/fallocate
Normal file
@@ -0,0 +1,3 @@
|
||||
== creating reasonably large per-mount files
|
||||
== 10s of racing cold reads and fallocate nop
|
||||
== cleaning up files
|
||||
@@ -2,3 +2,4 @@
|
||||
== unlinked and opened inodes still exist
|
||||
== orphan from failed evict deletion is picked up
|
||||
== orphaned inos in all mounts all deleted
|
||||
== 30s of racing evict deletion, orphan scanning, and open by handle
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
export-get-name-parent.sh
|
||||
basic-block-counts.sh
|
||||
basic-bad-mounts.sh
|
||||
inode-items-updated.sh
|
||||
simple-inode-index.sh
|
||||
simple-staging.sh
|
||||
simple-release-extents.sh
|
||||
fallocate.sh
|
||||
setattr_more.sh
|
||||
offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
|
||||
189
tests/src/handle_fsetxattr.c
Normal file
189
tests/src/handle_fsetxattr.c
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <inttypes.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <endian.h>
|
||||
#include <time.h>
|
||||
#include <linux/types.h>
|
||||
#include <sys/xattr.h>
|
||||
|
||||
#define FILEID_SCOUTFS 0x81
|
||||
#define FILEID_SCOUTFS_WITH_PARENT 0x82
|
||||
|
||||
struct our_handle {
|
||||
struct file_handle handle;
|
||||
/*
|
||||
* scoutfs file handle can be ino or ino/parent. The
|
||||
* handle_type field of struct file_handle denotes which
|
||||
* version is in use. We only use the ino variant here.
|
||||
*/
|
||||
__le64 scoutfs_ino;
|
||||
};
|
||||
|
||||
#define DEFAULT_NAME "user.handle_fsetxattr"
|
||||
#define DEFAULT_VALUE "value"
|
||||
|
||||
static void exit_usage(void)
|
||||
{
|
||||
printf(" -h/-? output this usage message and exit\n"
|
||||
" -e keep trying on enoent, consider success an error\n"
|
||||
" -i <num> 64bit inode number for handle open, can be multiple\n"
|
||||
" -m <string> scoutfs mount path string for ioctl fd\n"
|
||||
" -n <string> optional xattr name string, defaults to \""DEFAULT_NAME"\"\n"
|
||||
" -s <num> loop for num seconds, defaults to 0 for one iteration"
|
||||
" -v <string> optional xattr value string, defaults to \""DEFAULT_VALUE"\"\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct our_handle handle;
|
||||
struct timespec ts;
|
||||
bool enoent_success_err = false;
|
||||
uint64_t seconds = 0;
|
||||
char *value = NULL;
|
||||
char *name = NULL;
|
||||
char *mnt = NULL;
|
||||
int nr_inos = 0;
|
||||
uint64_t *inos;
|
||||
uint64_t i;
|
||||
int *fds;
|
||||
int mntfd;
|
||||
int fd;
|
||||
int ret;
|
||||
char c;
|
||||
int j;
|
||||
|
||||
/* can't have more inos than args */
|
||||
inos = calloc(argc, sizeof(inos[0]));
|
||||
fds = calloc(argc, sizeof(fds[0]));
|
||||
if (!inos || !fds) {
|
||||
perror("calloc");
|
||||
exit(1);
|
||||
}
|
||||
for (i = 0; i < argc; i++)
|
||||
fds[i] = -1;
|
||||
|
||||
while ((c = getopt(argc, argv, "+ei:m:n:s:v:")) != -1) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
enoent_success_err = true;
|
||||
break;
|
||||
case 'i':
|
||||
inos[nr_inos] = strtoll(optarg, NULL, 0);
|
||||
nr_inos++;
|
||||
break;
|
||||
case 'm':
|
||||
mnt = strdup(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
name = strdup(optarg);
|
||||
break;
|
||||
case 's':
|
||||
seconds = strtoll(optarg, NULL, 0);
|
||||
break;
|
||||
case 'v':
|
||||
value = strdup(optarg);
|
||||
break;
|
||||
case '?':
|
||||
printf("unknown argument: %c\n", optind);
|
||||
case 'h':
|
||||
exit_usage();
|
||||
}
|
||||
}
|
||||
|
||||
if (nr_inos == 0) {
|
||||
printf("specify non-zero inode number with -i\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!mnt) {
|
||||
printf("specify scoutfs mount path for ioctl with -p\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (name == NULL)
|
||||
name = DEFAULT_NAME;
|
||||
if (value == NULL)
|
||||
value = DEFAULT_VALUE;
|
||||
|
||||
mntfd = open(mnt, O_RDONLY);
|
||||
if (mntfd == -1) {
|
||||
perror("opening mountpoint");
|
||||
return 1;
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
seconds += ts.tv_sec;
|
||||
|
||||
for (i = 0; ; i++) {
|
||||
for (j = 0; j < nr_inos; j++) {
|
||||
fd = fds[j];
|
||||
|
||||
if (fd < 0) {
|
||||
handle.handle.handle_bytes = sizeof(struct our_handle);
|
||||
handle.handle.handle_type = FILEID_SCOUTFS;
|
||||
handle.scoutfs_ino = htole64(inos[j]);
|
||||
|
||||
fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR);
|
||||
if (fd == -1) {
|
||||
if (!enoent_success_err || errno != ENOENT) {
|
||||
perror("open_by_handle_at");
|
||||
return 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fds[j] = fd;
|
||||
}
|
||||
|
||||
ret = fsetxattr(fd, name, value, strlen(value), 0);
|
||||
if (ret < 0) {
|
||||
perror("fsetxattr");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ((i % 10) == 0) {
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
if (ts.tv_sec >= seconds)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (enoent_success_err) {
|
||||
bool able = false;
|
||||
for (i = 0; i < nr_inos; i++) {
|
||||
if (fds[i] >= 0) {
|
||||
printf("was able to open ino %"PRIu64"\n", inos[i]);
|
||||
able = true;
|
||||
}
|
||||
}
|
||||
if (able)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* not bothering to close or free */
|
||||
return 0;
|
||||
}
|
||||
36
tests/tests/basic-bad-mounts.sh
Normal file
36
tests/tests/basic-bad-mounts.sh
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
mount_fail()
|
||||
{
|
||||
local mnt=${!#}
|
||||
|
||||
echo "mounting $@" >> $T_TMP.mount.out
|
||||
mount -t scoutfs "$@" >> $T_TMP.mount.out 2>&1
|
||||
if [ $? == 0 ]; then
|
||||
umount "$mnt" || t_fail "couldn't unmount"
|
||||
t_fail "bad mount succeeded"
|
||||
fi
|
||||
}
|
||||
|
||||
echo "== prepare devices, mount point, and logs"
|
||||
SCR="/mnt/scoutfs.extra"
|
||||
mkdir -p "$SCR"
|
||||
> $T_TMP.mount.out
|
||||
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|
||||
|| t_fail "mkfs failed"
|
||||
|
||||
echo "== bad devices, bad options"
|
||||
mount_fail -o _bad /dev/null /dev/null "$SCR"
|
||||
|
||||
echo "== swapped devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both meta devices"
|
||||
mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both data devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
echo "== good volume, bad option and good options"
|
||||
mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
t_pass
|
||||
38
tests/tests/fallocate.sh
Normal file
38
tests/tests/fallocate.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
t_require_commands fallocate cat
|
||||
|
||||
echo "== creating reasonably large per-mount files"
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/file-$n"
|
||||
|
||||
LC_ALL=C fallocate -l 128MiB "$path" || \
|
||||
t_fail "initial creating fallocate failed"
|
||||
done
|
||||
|
||||
#
|
||||
# we had lock inversions between read and fallocate, dropping
|
||||
# the cache each time forces waiting for IO during the calls
|
||||
# with the inverted locks held so we have a better chance
|
||||
# of the deadlock happening.
|
||||
#
|
||||
DURATION=10
|
||||
echo "== ${DURATION}s of racing cold reads and fallocate nop"
|
||||
END=$((SECONDS + DURATION))
|
||||
while [ $SECONDS -le $END ]; do
|
||||
|
||||
echo 3 > /proc/sys/vm/drop_caches
|
||||
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/file-$n"
|
||||
|
||||
LC_ALL=C fallocate -o 0 -l 4KiB "$path" &
|
||||
cat "$path" > /dev/null &
|
||||
done
|
||||
|
||||
wait || t_fail "fallocate or cat failed"
|
||||
done
|
||||
|
||||
echo "== cleaning up files"
|
||||
rm -f "$T_D0"/file-*
|
||||
|
||||
t_pass
|
||||
@@ -26,9 +26,17 @@ inode_exists()
|
||||
{
|
||||
local ino="$1"
|
||||
|
||||
handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
|
||||
scoutfs get-allocated-inos -i "$ino" -s -p "$T_M0" > $T_TMP.inos.log 2>&1
|
||||
test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino"
|
||||
}
|
||||
|
||||
t_save_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
restore_delays()
|
||||
{
|
||||
t_restore_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
}
|
||||
trap restore_delays EXIT
|
||||
|
||||
echo "== test our inode existance function"
|
||||
path="$T_D0/file"
|
||||
touch "$path"
|
||||
@@ -37,6 +45,7 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
|
||||
echo "== unlinked and opened inodes still exist"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pid="$!"
|
||||
rm -f "$path"
|
||||
inode_exists $ino || echo "$ino didn't exist"
|
||||
@@ -44,7 +53,8 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
echo "== orphan from failed evict deletion is picked up"
|
||||
# pending kill signal stops evict from getting locks and deleting
|
||||
silent_kill $pid
|
||||
sleep 55
|
||||
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
|
||||
sleep 5
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
|
||||
echo "== orphaned inos in all mounts all deleted"
|
||||
@@ -55,6 +65,7 @@ for nr in $(t_fs_nrs); do
|
||||
touch "$path"
|
||||
inos="$inos $(stat -c %i $path)"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
@@ -69,9 +80,63 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
# wait for orphan scans to run
|
||||
sleep 55
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
|
||||
# also have to wait for delayed log merge work from mount
|
||||
sleep 15
|
||||
for ino in $inos; do
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
done
|
||||
|
||||
RUNTIME=30
|
||||
echo "== ${RUNTIME}s of racing evict deletion, orphan scanning, and open by handle"
|
||||
|
||||
# exclude last client mount
|
||||
last=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
last=$nr
|
||||
done
|
||||
|
||||
END=$((SECONDS + RUNTIME))
|
||||
while [ $SECONDS -lt $END ]; do
|
||||
# hold open per-mount unlinked files
|
||||
pids=""
|
||||
ino_args=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_D${nr}/racing-$nr"
|
||||
touch "$path"
|
||||
ino_args="$ino_args -i $(stat -c %i $path)"
|
||||
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for sleep to start and open input :/
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
|
||||
# remount excluded last client to force log merging and make orphan visible
|
||||
sync
|
||||
t_umount $last
|
||||
t_mount $last
|
||||
|
||||
# get all mounts scanning orphans at high frequency
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 100
|
||||
|
||||
# spin having tasks in each mount trying to open/fsetxattr all inos
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_M${nr}"
|
||||
handle_fsetxattr -e $ino_args -m "$path" -s 2 &
|
||||
done
|
||||
|
||||
# trigger eviction deletion of each file in each mount
|
||||
silent_kill $pids
|
||||
|
||||
wait || t_fail "handle_fsetxattr failed"
|
||||
|
||||
# slow down orphan scanning for the next iteration
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms $(((RUNTIME * 2) * 1000))
|
||||
done
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -21,6 +21,21 @@ contains the filesystem's metadata.
|
||||
.sp
|
||||
This option is required.
|
||||
.TP
|
||||
.B orphan_scan_delay_ms=<number>
|
||||
This option sets the average expected delay, in milliseconds, between
|
||||
each mount's scan of the global orphaned inode list. Jitter is added to
|
||||
avoid contention so each individual delay between scans is a random
|
||||
value up to 20% less than or greater than this average expected delay.
|
||||
.sp
|
||||
The minimum value for this option is 100ms which is very short and is
|
||||
only reasonable for testing or experiments. The default is 10000ms (10
|
||||
seconds) and the maximum is 60000ms (1 minute).
|
||||
.sp
|
||||
This option can be changed in an active mount by writing to its file in
|
||||
the options directory in the mount's sysfs directory. Writing a new
|
||||
value will cause the next pending orphan scan to be rescheduled
|
||||
with the newly written delay time.
|
||||
.TP
|
||||
.B quorum_slot_nr=<number>
|
||||
The quorum_slot_nr option assigns a quorum member slot to the mount.
|
||||
The mount will use the slot assignment to claim exclusive ownership of
|
||||
|
||||
@@ -15,7 +15,7 @@ environment variable. If that variable is also absent the current working
|
||||
directory will be used.
|
||||
|
||||
.TP
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline] META-DEVICE DATA-DEVICE"
|
||||
.sp
|
||||
Change the format version of an existing file system. The maxmimum
|
||||
supported version is used by default. A specific version in the range
|
||||
@@ -25,7 +25,7 @@ output of --help.
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-F, --offline META-DEVICE DATA-DEVICE"
|
||||
.B "-F, --offline"
|
||||
Change the format version by writing directly to the metadata and data
|
||||
devices. Like mkfs, this writes directly to the devices without
|
||||
protection and must only be used on completely unmounted devices. The
|
||||
@@ -43,7 +43,7 @@ the super blocks on both devices.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "change-quorum-config {-Q|--quorum-slot} NR,ADDR,PORT [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.BI "change-quorum-config {-Q|--quorum-slot NR,ADDR,PORT} [-F|--offline] META-DEVICE"
|
||||
.sp
|
||||
Change the quorum configuration for an existing file system. The new
|
||||
configuration completely replaces the old configuration. Any slots
|
||||
@@ -61,7 +61,7 @@ multiple arguments as described in the
|
||||
.B mkfs
|
||||
command.
|
||||
.TP
|
||||
.B "-F, --offline META-DEVICE"
|
||||
.B "-F, --offline"
|
||||
Perform the change offline by updating the superblock in the metadata
|
||||
device. The command will read the super block and refuse to make the
|
||||
change if it sees any evidence that the metadata device is currently in
|
||||
@@ -617,6 +617,33 @@ command is used first.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]"
|
||||
.sp
|
||||
This debugging command prints allocated inode numbers. It only prints
|
||||
inodes
|
||||
found in the group that contains the starting inode. The printed inode
|
||||
numbers aren't necessarily reachable. They could be anywhere in the
|
||||
process from being unlinked to finally deleted when their items
|
||||
were found.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-i, --ino INO"
|
||||
The first 64bit inode number which could be printed.
|
||||
.TP
|
||||
.B "-s, --single"
|
||||
Only print the single starting inode when it is allocated, all other allocated
|
||||
inode numbers will be ignored.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
|
||||
.SH SEE ALSO
|
||||
.BR scoutfs (5),
|
||||
.BR xattr (7),
|
||||
|
||||
@@ -222,7 +222,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"",
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"Change format version of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"",
|
||||
"META-DEVICE",
|
||||
"Change quorum slots and addresses of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
137
utils/src/get_allocated_inos.c
Normal file
137
utils/src/get_allocated_inos.c
Normal file
@@ -0,0 +1,137 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "cmd.h"
|
||||
|
||||
struct get_allocated_inos_args {
|
||||
char *path;
|
||||
u64 ino;
|
||||
bool have_ino;
|
||||
bool single;
|
||||
};
|
||||
|
||||
static int do_get_allocated_inos(struct get_allocated_inos_args *args)
|
||||
{
|
||||
struct scoutfs_ioctl_get_allocated_inos gai;
|
||||
u64 *inos = NULL;
|
||||
int fd = -1;
|
||||
u64 bytes;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (args->single)
|
||||
bytes = sizeof(*inos);
|
||||
else
|
||||
bytes = SCOUTFS_LOCK_INODE_GROUP_NR * sizeof(*inos);
|
||||
|
||||
inos = malloc(bytes);
|
||||
if (!inos) {
|
||||
fprintf(stderr, "inode number array allocation failed\n");
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
fd = get_path(args->path, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
memset(&gai, 0, sizeof(gai));
|
||||
gai.start_ino = args->ino;
|
||||
gai.inos_ptr = (unsigned long)inos;
|
||||
gai.inos_bytes = bytes;
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_GET_ALLOCATED_INOS, &gai);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "get_allocated_inos ioctl failed: "
|
||||
"%s (%d)\n", strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->single && ret > 0 && inos[0] != args->ino)
|
||||
ret = 0;
|
||||
|
||||
for (i = 0; i < ret; i++)
|
||||
printf("%llu\n", inos[i]);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
free(inos);
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct get_allocated_inos_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'i':
|
||||
ret = parse_u64(arg, &args->ino);
|
||||
if (ret)
|
||||
return ret;
|
||||
args->have_ino = true;
|
||||
case 'p':
|
||||
args->path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case 's':
|
||||
args->single = true;
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->have_ino)
|
||||
argp_error(state, "must provide --ino starting inode number option");
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "ino", 'i', "NUMBER", 0, "Start from 64bit inode number (required)"},
|
||||
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
|
||||
{ "single", 's', NULL, 0, "Only print single specific inode number argument"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
NULL,
|
||||
"Print allocated inode numbers from starting inode number"
|
||||
};
|
||||
|
||||
static int get_allocated_inos_cmd(int argc, char **argv)
|
||||
{
|
||||
|
||||
struct get_allocated_inos_args get_allocated_inos_args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &get_allocated_inos_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_get_allocated_inos(&get_allocated_inos_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) get_allocated_inos_ctor(void)
|
||||
{
|
||||
cmd_register_argp("get-allocated-inos", &argp, GROUP_DEBUG, get_allocated_inos_cmd);
|
||||
}
|
||||
Reference in New Issue
Block a user