mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-28 06:12:03 +00:00
Compare commits
108 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0b919e2ba7 | ||
|
|
bb5267f0c9 | ||
|
|
6d4916954b | ||
|
|
8e067b3d3f | ||
|
|
87500e8bb5 | ||
|
|
41174867ed | ||
|
|
276fbebdac | ||
|
|
03df993e14 | ||
|
|
701f1a9538 | ||
|
|
71ed4512dc | ||
|
|
57dff347a6 | ||
|
|
fb7cb057c4 | ||
|
|
1b924c501e | ||
|
|
aed4313995 | ||
|
|
61d86f7718 | ||
|
|
717b56698a | ||
|
|
c92a7ff705 | ||
|
|
d05489c670 | ||
|
|
4806e8a7b3 | ||
|
|
b74f3f577d | ||
|
|
d5ddf1ecac | ||
|
|
e27ea22fe4 | ||
|
|
51fe5a4ceb | ||
|
|
3847c4fe63 | ||
|
|
ef2daf8857 | ||
|
|
064409eb62 | ||
|
|
ddc5d9f04d | ||
|
|
433a80c6fc | ||
|
|
78405bb5fd | ||
|
|
98e514e5f4 | ||
|
|
29538a9f45 | ||
|
|
1826048ca3 | ||
|
|
798fbb793e | ||
|
|
d7b16419ef | ||
|
|
f13aba78b1 | ||
|
|
3220c2055c | ||
|
|
1cbc927ccb | ||
|
|
acb94dd9b7 | ||
|
|
233fbb39f3 | ||
|
|
198d3cda32 | ||
|
|
e8c64b4217 | ||
|
|
89b64ae1f7 | ||
|
|
fc8a5a1b5c | ||
|
|
d4c793e010 | ||
|
|
8a3058818c | ||
|
|
ba9a106f72 | ||
|
|
310725eb72 | ||
|
|
51a8236316 | ||
|
|
f3dd00895b | ||
|
|
49df98f5a8 | ||
|
|
15cf3c4134 | ||
|
|
1abe97351d | ||
|
|
f757e29915 | ||
|
|
31e474c5fa | ||
|
|
dcf8202d7c | ||
|
|
ae55fa3153 | ||
|
|
7f9f21317c | ||
|
|
0d4bf83da3 | ||
|
|
0a6b1fb304 | ||
|
|
fb7e43dd23 | ||
|
|
45d90a5ae4 | ||
|
|
48f1305a8a | ||
|
|
cd4d6502b8 | ||
|
|
dff366e1a4 | ||
|
|
ca526e2bc0 | ||
|
|
e423d42106 | ||
|
|
82d2be2e4a | ||
|
|
4102b760d0 | ||
|
|
65654ee7c0 | ||
|
|
b2d6ceeb9c | ||
|
|
d8231016f8 | ||
|
|
3c2b329675 | ||
|
|
96ad8dd510 | ||
|
|
44f38a31ec | ||
|
|
fb2ff753ad | ||
|
|
bb3db7e272 | ||
|
|
c94b072925 | ||
|
|
26ae9c6e04 | ||
|
|
c8d7221ec5 | ||
|
|
7fd03dc311 | ||
|
|
4e8a088cc5 | ||
|
|
9c751c1197 | ||
|
|
875583b7ef | ||
|
|
38e5aa77c4 | ||
|
|
57a1d75e52 | ||
|
|
51d19d797f | ||
|
|
029a684c25 | ||
|
|
f2679d9598 | ||
|
|
bddca171ee | ||
|
|
18171b8543 | ||
|
|
d846eec5e8 | ||
|
|
e2c90339c5 | ||
|
|
4a0b14a4f2 | ||
|
|
90518a0fbd | ||
|
|
cd23cc61ca | ||
|
|
a67ea30bb7 | ||
|
|
f3b7c683f0 | ||
|
|
8decc54467 | ||
|
|
5adcf7677f | ||
|
|
07f03d499f | ||
|
|
c5068efef0 | ||
|
|
66678dc63b | ||
|
|
b2834d3c28 | ||
|
|
cff50bec6b | ||
|
|
4d6350b3b0 | ||
|
|
48966b42bb | ||
|
|
97cb8ad50d | ||
|
|
ae08a797ae |
175
ReleaseNotes.md
175
ReleaseNotes.md
@@ -2,9 +2,180 @@ Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.2-rc
|
||||
v1.10
|
||||
\
|
||||
*TBD*
|
||||
*Dec 7, 2022*
|
||||
|
||||
Fixed a potential directory entry cache management deadlock that could
|
||||
occur when many nodes performed heavy metadata write loads across shared
|
||||
directories and their child subdirectories. The deadlock could halt
|
||||
invalidation progress on a node which could then stop use of locks that
|
||||
needed invalidation on that node which would result in almost all tasks
|
||||
hanging on those locks that would never make progress.
|
||||
|
||||
Fixed a circumstance where metadata change sequence index item
|
||||
modification could leave behind old stale metadata sequence items. The
|
||||
duplication case required concurrent metadata updates across mounts with
|
||||
particular open transaction patterns so the duplicate items are rare.
|
||||
They resulted in a small amount of additional load when walking change
|
||||
indexes but had no effect on correctness.
|
||||
|
||||
Fixed a rare case where sparse file extension might not write partial
|
||||
blocks of zeros which was found in testing. This required using
|
||||
truncate to extend files past file sizes that end in partial blocks
|
||||
along with the right transaction commit and memory reclaim patterns.
|
||||
This never affected regular non-sparse files nor files prepopulated with
|
||||
fallocate.
|
||||
|
||||
---
|
||||
v1.9
|
||||
\
|
||||
*Oct 29, 2022*
|
||||
|
||||
Fix VFS cached directory entry consistency verification that could cause
|
||||
spurious "no such file or directory" (ENOENT) errors from rename over
|
||||
NFS under certain conditions. The problem was only ever with the
|
||||
consistency of in-memory cached dentry objects, persistent data was
|
||||
correct and eventual eviction of the bad cached objects would stop
|
||||
generating the errors.
|
||||
|
||||
---
|
||||
v1.8
|
||||
\
|
||||
*Oct 18, 2022*
|
||||
|
||||
Add support for Linux POSIX Access Control Lists, as described in
|
||||
acl(5). Mount options are added to enable ("acl") and disable ("noacl")
|
||||
support. The default is to support ACLs. ACLs are stored in the
|
||||
existing extended attribute scheme so adding support does not require
|
||||
a format change.
|
||||
|
||||
Add options to control data extent preallocation. The default behavior
|
||||
does not change. The options can relax the limits on preallocation
|
||||
which will then trigger under more write patterns and increase the risk
|
||||
of preallocated space which is never used. The options are described in
|
||||
scoutfs(5).
|
||||
|
||||
---
|
||||
v1.7
|
||||
\
|
||||
*Aug 26, 2022*
|
||||
|
||||
* **Fixed possible persistent errors moving freed data extents**
|
||||
\
|
||||
Fixed a case where the server could hit persistent errors trying to
|
||||
move a client's freed extents in one commit. The client had to free
|
||||
a large number of extents that occupied distant positions in the
|
||||
global free extent btree. Very large fragmented files could cause
|
||||
this. The server now moves the freed extents in multiple commits and
|
||||
can always ensure forward progress.
|
||||
|
||||
* **Fixed possible persistent errors from freed duplicate extents**
|
||||
\
|
||||
Background orphan deletion wasn't properly synchronizing with
|
||||
foreground tasks deleting very large files. If a deletion took long
|
||||
enough then background deletion could also attempt to delete inode items
|
||||
while the deletion was making progress. This could create duplicate
|
||||
deletions of data extent items which causes the server to abort when
|
||||
it later discovers the duplicate extents as it merges free lists.
|
||||
|
||||
---
|
||||
v1.6
|
||||
\
|
||||
*Jul 7, 2022*
|
||||
|
||||
* **Fix memory leaks in rare corner cases**
|
||||
\
|
||||
Analysis tools found a few corner cases that leaked small structures,
|
||||
generally around error handling or startup and shutdown.
|
||||
|
||||
* **Add --skip-likely-huge scoutfs print command option**
|
||||
\
|
||||
Add an option to scoutfs print to reduce the size of the output
|
||||
so that it can be used to see system-wide metadata without being
|
||||
overwhelmed by file-level details.
|
||||
|
||||
---
|
||||
v1.5
|
||||
\
|
||||
*Jun 21, 2022*
|
||||
|
||||
* **Fix persistent error during server startup**
|
||||
\
|
||||
Fixed a case where the server would always hit a consistent error on
|
||||
startup, preventing the system from mounting. This required a rare
|
||||
but valid state across the clients.
|
||||
|
||||
* **Fix a client hang that would lead to fencing**
|
||||
\
|
||||
The client module's use of in-kernel networking was missing annotation
|
||||
that could lead to communication hanging. The server would fence the
|
||||
client when it stopped communicating. This could be identified by the
|
||||
server fencing a client after it disconnected with no attempt by the
|
||||
client to reconnect.
|
||||
|
||||
---
|
||||
v1.4
|
||||
\
|
||||
*May 6, 2022*
|
||||
|
||||
* **Fix possible client crash during server failover**
|
||||
\
|
||||
Fixed a narrow window during server failover and lock recovery that
|
||||
could cause a client mount to believe that it had an inconsistent item
|
||||
cache and panic. This required very specific lock state and messaging
|
||||
patterns between multiple mounts and multiple servers which made it
|
||||
unlikely to occur in the field.
|
||||
|
||||
---
|
||||
v1.3
|
||||
\
|
||||
*Apr 7, 2022*
|
||||
|
||||
* **Fix rare server instability under heavy load**
|
||||
\
|
||||
Fixed a case of server instability under heavy load due to concurrent
|
||||
work fully exhausting metadata block allocation pools reserved for a
|
||||
single server transaction. This would cause brief interruption as the
|
||||
server shut down and the next server started up and made progress as
|
||||
pending work was retried.
|
||||
|
||||
* **Fix slow fencing preventing server startup**
|
||||
\
|
||||
If a server had to process many fence requests with a slow fencing
|
||||
mechanism it could be interrupted before it finished. The server
|
||||
now makes sure heartbeat messages are sent while it is making progress
|
||||
on fencing requests so that other quorum members don't interrupt the
|
||||
process.
|
||||
|
||||
* **Performance improvement in getxattr and setxattr**
|
||||
\
|
||||
Kernel allocation patterns in the getxattr and setxattr
|
||||
implementations were causing significant contention between CPUs. Their
|
||||
allocation strategy was changed so that concurrent tasks can call these
|
||||
xattr methods without degrading performance.
|
||||
|
||||
---
|
||||
v1.2
|
||||
\
|
||||
*Mar 14, 2022*
|
||||
|
||||
* **Fix deadlock between fallocate() and read() system calls**
|
||||
\
|
||||
Fixed a lock inversion that could cause two tasks to deadlock if they
|
||||
performed fallocate() and read() on a file at the same time. The
|
||||
deadlock was uninterruptible so the machine needed to be rebooted. This
|
||||
was relatively rare as fallocate() is usually used to prepare files
|
||||
before they're used.
|
||||
|
||||
* **Fix instability from heavy file deletion workloads**
|
||||
\
|
||||
Fixed rare circumstances under which background file deletion cleanup
|
||||
tasks could try to delete a file while it is being deleted by another
|
||||
task. Heavy load across multiple nodes, either many files being deleted
|
||||
or large files being deleted, increased the chances of this happening.
|
||||
Heavy staging could cause this problem because staging can create many
|
||||
internal temporary files that need to be deleted.
|
||||
|
||||
---
|
||||
v1.1
|
||||
|
||||
@@ -8,6 +8,7 @@ CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
|
||||
-include $(src)/Makefile.kernelcompat
|
||||
|
||||
scoutfs-y += \
|
||||
acl.o \
|
||||
avl.o \
|
||||
alloc.o \
|
||||
block.o \
|
||||
|
||||
@@ -34,3 +34,12 @@ endif
|
||||
ifneq (,$(shell grep 'FMODE_KABI_ITERATE' include/linux/fs.h))
|
||||
ccflags-y += -DKC_FMODE_KABI_ITERATE
|
||||
endif
|
||||
|
||||
#
|
||||
# v4.7-rc2-23-g0d4d717f2583
|
||||
#
|
||||
# Added user_ns argument to posix_acl_valid
|
||||
#
|
||||
ifneq (,$(shell grep 'posix_acl_valid.*user_ns,' include/linux/posix_acl.h))
|
||||
ccflags-y += -DKC_POSIX_ACL_VALID_USER_NS
|
||||
endif
|
||||
|
||||
355
kmod/src/acl.c
Normal file
355
kmod/src/acl.c
Normal file
@@ -0,0 +1,355 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/posix_acl.h>
|
||||
#include <linux/posix_acl_xattr.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "super.h"
|
||||
#include "scoutfs_trace.h"
|
||||
#include "xattr.h"
|
||||
#include "acl.h"
|
||||
#include "inode.h"
|
||||
#include "trans.h"
|
||||
|
||||
/*
|
||||
* POSIX draft ACLs are stored as full xattr items with the entries
|
||||
* encoded as the kernel's posix_acl_xattr_{header,entry} value structs.
|
||||
*
|
||||
* They're accessed and modified via user facing synthetic xattrs, iops
|
||||
* calls from the kernel, during inode mode changes, and during inode
|
||||
* creation.
|
||||
*
|
||||
* ACL access devolves into xattr access which is relatively expensive
|
||||
* so we maintain the cached native form in the vfs inode. We drop the
|
||||
* cache in lock invalidation which means that cached acl access must
|
||||
* always be performed under cluster locking.
|
||||
*/
|
||||
|
||||
/*
 * Translate a POSIX ACL type into the name of the xattr that stores it.
 * When the caller provides name_len it is set to the length of the name
 * without its terminating null.  Unknown types return -EINVAL and leave
 * both outputs untouched.
 */
static int acl_xattr_name_len(int type, char **name, size_t *name_len)
{
	char *xattr_name;
	size_t len;

	if (type == ACL_TYPE_ACCESS) {
		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
		len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
	} else if (type == ACL_TYPE_DEFAULT) {
		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
		len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1;
	} else {
		return -EINVAL;
	}

	*name = xattr_name;
	if (name_len)
		*name_len = len;

	return 0;
}
|
||||
|
||||
struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct posix_acl *acl;
|
||||
char *value = NULL;
|
||||
char *name;
|
||||
int ret;
|
||||
|
||||
if (!IS_POSIXACL(inode))
|
||||
return NULL;
|
||||
|
||||
acl = get_cached_acl(inode, type);
|
||||
if (acl != ACL_NOT_CACHED)
|
||||
return acl;
|
||||
|
||||
ret = acl_xattr_name_len(type, &name, NULL);
|
||||
if (ret < 0)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
ret = scoutfs_xattr_get_locked(inode, name, NULL, 0, lock);
|
||||
if (ret > 0) {
|
||||
value = kzalloc(ret, GFP_NOFS);
|
||||
if (!value)
|
||||
ret = -ENOMEM;
|
||||
else
|
||||
ret = scoutfs_xattr_get_locked(inode, name, value, ret, lock);
|
||||
}
|
||||
if (ret > 0) {
|
||||
acl = posix_acl_from_xattr(&init_user_ns, value, ret);
|
||||
} else if (ret == -ENODATA || ret == 0) {
|
||||
acl = NULL;
|
||||
} else {
|
||||
acl = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
/* can set null negative cache */
|
||||
if (!IS_ERR(acl))
|
||||
set_cached_acl(inode, type, acl);
|
||||
|
||||
kfree(value);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct posix_acl *acl;
|
||||
int ret;
|
||||
|
||||
if (!IS_POSIXACL(inode))
|
||||
return NULL;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
|
||||
if (ret < 0) {
|
||||
acl = ERR_PTR(ret);
|
||||
} else {
|
||||
acl = scoutfs_get_acl_locked(inode, type, lock);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
}
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller has acquired the locks and dirtied the inode, they'll
|
||||
* update the inode item if we return 0.
|
||||
*/
|
||||
int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
|
||||
struct scoutfs_lock *lock, struct list_head *ind_locks)
|
||||
{
|
||||
static const struct scoutfs_xattr_prefix_tags tgs = {0,}; /* never scoutfs. prefix */
|
||||
bool set_mode = false;
|
||||
char *value = NULL;
|
||||
umode_t new_mode;
|
||||
size_t name_len;
|
||||
char *name;
|
||||
int size = 0;
|
||||
int ret;
|
||||
|
||||
ret = acl_xattr_name_len(type, &name, &name_len);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
switch (type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
if (acl) {
|
||||
ret = posix_acl_update_mode(inode, &new_mode, &acl);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
set_mode = true;
|
||||
}
|
||||
break;
|
||||
case ACL_TYPE_DEFAULT:
|
||||
if (!S_ISDIR(inode->i_mode)) {
|
||||
ret = acl ? -EINVAL : 0;
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (acl) {
|
||||
size = posix_acl_xattr_size(acl->a_count);
|
||||
value = kmalloc(size, GFP_NOFS);
|
||||
if (!value) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_xattr_set_locked(inode, name, name_len, value, size, 0, &tgs,
|
||||
lock, NULL, ind_locks);
|
||||
if (ret == 0 && set_mode) {
|
||||
inode->i_mode = new_mode;
|
||||
if (!value) {
|
||||
/* can be setting an acl that only affects mode, didn't need xattr */
|
||||
inode_inc_iversion(inode);
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (!ret)
|
||||
set_cached_acl(inode, type, acl);
|
||||
|
||||
kfree(value);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
LIST_HEAD(ind_locks);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock) ?:
|
||||
scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
|
||||
if (ret == 0) {
|
||||
ret = scoutfs_dirty_inode_item(inode, lock) ?:
|
||||
scoutfs_set_acl_locked(inode, acl, type, lock, &ind_locks);
|
||||
if (ret == 0)
|
||||
scoutfs_update_inode_item(inode, lock, &ind_locks);
|
||||
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
}
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Get an acl through its user facing xattr.  The acl is encoded into
 * the caller's value buffer and the result of that encoding is
 * returned.  Returns -EOPNOTSUPP if acls aren't enabled for the inode
 * and -ENODATA if the inode has no acl of the given type.
 */
int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value, size_t size,
			  int type)
{
	struct inode *inode = dentry->d_inode;
	struct posix_acl *acl;
	int ret;

	if (!IS_POSIXACL(inode))
		return -EOPNOTSUPP;

	acl = scoutfs_get_acl(inode, type);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (!acl)
		return -ENODATA;

	ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
	posix_acl_release(acl);

	return ret;
}
|
||||
|
||||
/*
 * Set an acl through its user facing xattr.  A non-null value is
 * decoded into an acl and validated before being set; a null value
 * sets a null acl.  Returns -EPERM if the task isn't allowed to
 * change the inode and -EOPNOTSUPP if acls aren't enabled for it.
 */
int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *value, size_t size,
			  int flags, int type)
{
	struct inode *inode = dentry->d_inode;
	struct posix_acl *acl = NULL;
	int ret;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (!IS_POSIXACL(inode))
		return -EOPNOTSUPP;

	if (value) {
		acl = posix_acl_from_xattr(&init_user_ns, value, size);
		if (IS_ERR(acl))
			return PTR_ERR(acl);
	}

	/* decoding can produce a null acl which has nothing to validate */
	ret = acl ? kc_posix_acl_valid(&init_user_ns, acl) : 0;
	if (ret == 0)
		ret = scoutfs_set_acl(inode, acl, type);

	posix_acl_release(acl);

	return ret;
}
|
||||
|
||||
/*
|
||||
* Apply the parent's default acl to new inodes access acl and inherit
|
||||
* it as the default for new directories. The caller holds locks and a
|
||||
* transaction.
|
||||
*/
|
||||
int scoutfs_init_acl_locked(struct inode *inode, struct inode *dir,
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *dir_lock,
|
||||
struct list_head *ind_locks)
|
||||
{
|
||||
struct posix_acl *acl = NULL;
|
||||
int ret = 0;
|
||||
|
||||
if (!S_ISLNK(inode->i_mode)) {
|
||||
if (IS_POSIXACL(dir)) {
|
||||
acl = scoutfs_get_acl_locked(dir, ACL_TYPE_DEFAULT, dir_lock);
|
||||
if (IS_ERR(acl))
|
||||
return PTR_ERR(acl);
|
||||
}
|
||||
|
||||
if (!acl)
|
||||
inode->i_mode &= ~current_umask();
|
||||
}
|
||||
|
||||
if (IS_POSIXACL(dir) && acl) {
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
ret = scoutfs_set_acl_locked(inode, acl, ACL_TYPE_DEFAULT,
|
||||
lock, ind_locks);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret > 0)
|
||||
ret = scoutfs_set_acl_locked(inode, acl, ACL_TYPE_ACCESS,
|
||||
lock, ind_locks);
|
||||
} else {
|
||||
cache_no_acl(inode);
|
||||
}
|
||||
out:
|
||||
posix_acl_release(acl);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the access ACL based on a newly set mode. If we return an
|
||||
* error then the xattr wasn't changed.
|
||||
*
|
||||
* Annoyingly, setattr_copy has logic that transforms the final set mode
|
||||
* that we want to use to update the acl. But we don't want to modify
|
||||
* the other inode fields while discovering the resulting mode. We're
|
||||
* relying on acl_chmod not caring about the transformation (currently
|
||||
* just clears sgid). It would be better if we could get the resulting
|
||||
* mode to give to acl_chmod without modifying the other inode fields.
|
||||
*
|
||||
* The caller has the inode mutex, a cluster lock, transaction, and will
|
||||
* update the inode item if we return success.
|
||||
*/
|
||||
int scoutfs_acl_chmod_locked(struct inode *inode, struct iattr *attr,
|
||||
struct scoutfs_lock *lock, struct list_head *ind_locks)
|
||||
{
|
||||
struct posix_acl *acl;
|
||||
int ret = 0;
|
||||
|
||||
if (!IS_POSIXACL(inode) || !(attr->ia_valid & ATTR_MODE))
|
||||
return 0;
|
||||
|
||||
if (S_ISLNK(inode->i_mode))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
acl = scoutfs_get_acl_locked(inode, ACL_TYPE_ACCESS, lock);
|
||||
if (IS_ERR_OR_NULL(acl))
|
||||
return PTR_ERR(acl);
|
||||
|
||||
ret = posix_acl_chmod(&acl, GFP_KERNEL, attr->ia_mode);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = scoutfs_set_acl_locked(inode, acl, ACL_TYPE_ACCESS, lock, ind_locks);
|
||||
posix_acl_release(acl);
|
||||
return ret;
|
||||
}
|
||||
18
kmod/src/acl.h
Normal file
18
kmod/src/acl.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef _SCOUTFS_ACL_H_
|
||||
#define _SCOUTFS_ACL_H_
|
||||
|
||||
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type);
|
||||
struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock);
|
||||
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
|
||||
int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
|
||||
struct scoutfs_lock *lock, struct list_head *ind_locks);
|
||||
int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value, size_t size,
|
||||
int type);
|
||||
int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *value, size_t size,
|
||||
int flags, int type);
|
||||
int scoutfs_acl_chmod_locked(struct inode *inode, struct iattr *attr,
|
||||
struct scoutfs_lock *lock, struct list_head *ind_locks);
|
||||
int scoutfs_init_acl_locked(struct inode *inode, struct inode *dir,
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *dir_lock,
|
||||
struct list_head *ind_locks);
|
||||
#endif
|
||||
@@ -84,6 +84,21 @@ static u64 smallest_order_length(u64 len)
|
||||
return 1ULL << (free_extent_order(len) * 3);
|
||||
}
|
||||
|
||||
/*
|
||||
* An extent modification dirties three distinct leaves of an allocator
|
||||
* btree as it adds and removes the blkno and size sorted items for the
|
||||
* old and new lengths of the extent. Dirtying the paths to these
|
||||
* leaves can grow the tree and grow/shrink neighbours at each level.
|
||||
* We over-estimate the number of blocks allocated and freed (the paths
|
||||
* share a root, growth doesn't free) to err on the simpler and safer
|
||||
* side. The overhead is minimal given the relatively large list blocks
|
||||
* and relatively short allocator trees.
|
||||
*/
|
||||
static u32 extent_mod_blocks(u32 height)
|
||||
{
|
||||
return ((1 + height) * 2) * 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free extents don't have flags and are stored in two indexes sorted by
|
||||
* block location and by length order, largest first. The location key
|
||||
@@ -877,6 +892,13 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
|
||||
* -ENOENT is returned if we run out of extents in the source tree
|
||||
* before moving the total.
|
||||
*
|
||||
* If meta_budget is non-zero then -EINPROGRESS can be returned if the
|
||||
* caller's budget is consumed in the allocator during this call
|
||||
* (though not necessarily by us, we don't have per-thread tracking of
|
||||
* allocator consumption :/). The call can still have made progress and
|
||||
* caller is expected to commit the dirty trees and examine the resulting
|
||||
* modified trees to see if they need to continue moving extents.
|
||||
*
|
||||
* The caller can specify that extents in the source tree should first
|
||||
* be found based on their zone bitmaps. We'll first try to find
|
||||
* extents in the exclusive zones, then vacant zones, and then we'll
|
||||
@@ -891,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks)
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
|
||||
{
|
||||
struct alloc_ext_args args = {
|
||||
.alloc = alloc,
|
||||
@@ -899,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent ext;
|
||||
u32 avail_start = 0;
|
||||
u32 freed_start = 0;
|
||||
u64 moved = 0;
|
||||
u64 count;
|
||||
int ret = 0;
|
||||
@@ -909,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
vacant = NULL;
|
||||
}
|
||||
|
||||
if (meta_budget != 0)
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
|
||||
|
||||
while (moved < total) {
|
||||
count = total - moved;
|
||||
|
||||
@@ -941,6 +968,14 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (meta_budget != 0 &&
|
||||
scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
|
||||
extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
ret = -EINPROGRESS;
|
||||
break;
|
||||
}
|
||||
|
||||
/* searching set start/len, finish initializing alloced extent */
|
||||
ext.map = found.map ? ext.start - found.start + found.map : 0;
|
||||
ext.flags = found.flags;
|
||||
@@ -1065,15 +1100,6 @@ out:
|
||||
* than completely exhausting the avail list or overflowing the freed
|
||||
* list.
|
||||
*
|
||||
* An extent modification dirties three distinct leaves of an allocator
|
||||
* btree as it adds and removes the blkno and size sorted items for the
|
||||
* old and new lengths of the extent. Dirtying the paths to these
|
||||
* leaves can grow the tree and grow/shrink neighbours at each level.
|
||||
* We over-estimate the number of blocks allocated and freed (the paths
|
||||
* share a root, growth doesn't free) to err on the simpler and safer
|
||||
* side. The overhead is minimal given the relatively large list blocks
|
||||
* and relatively short allocator trees.
|
||||
*
|
||||
* The caller tells us how many extents they're about to modify and how
|
||||
* many other additional blocks they may cow manually. And finally, the
|
||||
* caller could be the first to dirty the avail and freed blocks in the
|
||||
@@ -1082,7 +1108,7 @@ out:
|
||||
static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
|
||||
{
|
||||
u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents;
|
||||
u32 tree_blocks = extent_mod_blocks(root->root.height) * extents;
|
||||
u32 most = 1 + tree_blocks + addl_blocks;
|
||||
|
||||
if (le32_to_cpu(alloc->avail.first_nr) < most) {
|
||||
@@ -1318,6 +1344,38 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
return lo;
|
||||
}
|
||||
|
||||
/*
 * Sample the allocator's avail block count and freed list space.  Both
 * values are read inside the allocator's seqlock read section, which is
 * retried until the pair was read without a concurrent writer.
 */
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space)
{
	unsigned int seq;
	u32 avail;
	u32 space;

	do {
		seq = read_seqbegin(&alloc->seqlock);
		avail = le32_to_cpu(alloc->avail.first_nr);
		space = list_block_space(alloc->freed.first_nr);
	} while (read_seqretry(&alloc->seqlock, seq));

	/* only publish the stable sample */
	*avail_total = avail;
	*freed_space = space;
}
|
||||
|
||||
/*
|
||||
* Returns true if the caller's consumption of nr from either avail or
|
||||
* freed would end up exceeding their budget relative to the starting
|
||||
* remaining snapshot they took.
|
||||
*/
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr)
|
||||
{
|
||||
u32 avail_use;
|
||||
u32 freed_use;
|
||||
u32 avail;
|
||||
u32 freed;
|
||||
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail, &freed);
|
||||
|
||||
avail_use = avail_start - avail;
|
||||
freed_use = freed_start - freed;
|
||||
|
||||
return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
|
||||
}
|
||||
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag)
|
||||
{
|
||||
|
||||
@@ -19,14 +19,11 @@
|
||||
(128ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
|
||||
|
||||
/*
|
||||
* The largest aligned region that we'll try to allocate at the end of
|
||||
* the file as it's extended. This is also limited to the current file
|
||||
* size so we can only waste at most twice the total file size when
|
||||
* files are less than this. We try to keep this around the point of
|
||||
* diminishing returns in streaming performance of common data devices
|
||||
* to limit waste.
|
||||
* The default size that we'll try to preallocate. This is trying to
|
||||
* hit the limit of large efficient device writes while minimizing
|
||||
* wasted preallocation that is never used.
|
||||
*/
|
||||
#define SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT \
|
||||
#define SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS \
|
||||
(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
|
||||
|
||||
/*
|
||||
@@ -131,7 +128,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks);
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
|
||||
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
|
||||
u64 start, u64 len);
|
||||
@@ -158,6 +155,9 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
|
||||
|
||||
bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 nr);
|
||||
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr);
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag);
|
||||
|
||||
|
||||
@@ -2449,7 +2449,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *root, int alloc_low)
|
||||
struct scoutfs_btree_root *root, int free_budget)
|
||||
{
|
||||
u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -2459,11 +2459,15 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_avl_node *node;
|
||||
struct scoutfs_avl_node *next;
|
||||
struct scoutfs_key par_next;
|
||||
int nr_freed = 0;
|
||||
int nr_par;
|
||||
int level;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (WARN_ON_ONCE(free_budget <= 0))
|
||||
return -EINVAL;
|
||||
|
||||
if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
|
||||
return -EIO; /* XXX corruption */
|
||||
|
||||
@@ -2538,8 +2542,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
while (node) {
|
||||
|
||||
/* make sure we can always free parents after leaves */
|
||||
if (scoutfs_alloc_meta_low(sb, alloc,
|
||||
alloc_low + nr_par + 1)) {
|
||||
if ((nr_freed + 1 + nr_par) > free_budget) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
@@ -2553,6 +2556,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
le64_to_cpu(ref.blkno));
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
nr_freed++;
|
||||
|
||||
node = scoutfs_avl_next(&bt->item_root, node);
|
||||
if (node) {
|
||||
@@ -2568,6 +2572,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
blknos[i]);
|
||||
ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
|
||||
BUG_ON(ret); /* checked meta low, freed should fit */
|
||||
nr_freed++;
|
||||
}
|
||||
|
||||
/* restart walk past the subtree we just freed */
|
||||
|
||||
@@ -125,7 +125,7 @@ int scoutfs_btree_free_blocks(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *root, int alloc_low);
|
||||
struct scoutfs_btree_root *root, int free_budget);
|
||||
|
||||
void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);
|
||||
|
||||
|
||||
@@ -477,12 +477,15 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
struct super_block *sb = client->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
const bool am_quorum = opts->quorum_slot_nr >= 0;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_net_greeting greet;
|
||||
struct sockaddr_in sin;
|
||||
bool am_quorum;
|
||||
int ret;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
am_quorum = opts.quorum_slot_nr >= 0;
|
||||
|
||||
/* can unmount once server farewell handling removes our item */
|
||||
if (client->sending_farewell &&
|
||||
lookup_mounted_client_item(sb, sbi->rid) == 0) {
|
||||
|
||||
@@ -75,8 +75,6 @@
|
||||
EXPAND_COUNTER(data_write_begin_enobufs_retry) \
|
||||
EXPAND_COUNTER(dentry_revalidate_error) \
|
||||
EXPAND_COUNTER(dentry_revalidate_invalid) \
|
||||
EXPAND_COUNTER(dentry_revalidate_locked) \
|
||||
EXPAND_COUNTER(dentry_revalidate_orphan) \
|
||||
EXPAND_COUNTER(dentry_revalidate_rcu) \
|
||||
EXPAND_COUNTER(dentry_revalidate_root) \
|
||||
EXPAND_COUNTER(dentry_revalidate_valid) \
|
||||
@@ -152,11 +150,12 @@
|
||||
EXPAND_COUNTER(net_recv_messages) \
|
||||
EXPAND_COUNTER(net_unknown_request) \
|
||||
EXPAND_COUNTER(orphan_scan) \
|
||||
EXPAND_COUNTER(orphan_scan_attempts) \
|
||||
EXPAND_COUNTER(orphan_scan_cached) \
|
||||
EXPAND_COUNTER(orphan_scan_error) \
|
||||
EXPAND_COUNTER(orphan_scan_item) \
|
||||
EXPAND_COUNTER(orphan_scan_omap_set) \
|
||||
EXPAND_COUNTER(orphan_scan_read) \
|
||||
EXPAND_COUNTER(quorum_candidate_server_stopping) \
|
||||
EXPAND_COUNTER(quorum_elected) \
|
||||
EXPAND_COUNTER(quorum_fence_error) \
|
||||
EXPAND_COUNTER(quorum_fence_leader) \
|
||||
|
||||
158
kmod/src/data.c
158
kmod/src/data.c
@@ -366,27 +366,27 @@ static inline u64 ext_last(struct scoutfs_extent *ext)
|
||||
|
||||
/*
|
||||
* The caller is writing to a logical iblock that doesn't have an
|
||||
* allocated extent.
|
||||
* allocated extent. The caller has searched for an extent containing
|
||||
* iblock. If it already existed then it must be unallocated and
|
||||
* offline.
|
||||
*
|
||||
* We always allocate an extent starting at the logical iblock. The
|
||||
* caller has searched for an extent containing iblock. If it already
|
||||
* existed then it must be unallocated and offline.
|
||||
* We implement two preallocation strategies. Typically we only
|
||||
* preallocate for simple streaming writes and limit preallocation while
|
||||
* the file is small. The largest efficient allocation size is
|
||||
* typically large enough that it would be unreasonable to allocate that
|
||||
* much for all small files.
|
||||
*
|
||||
* Preallocation is used if we're strictly contiguously extending
|
||||
* writes. That is, if the logical block offset equals the number of
|
||||
* online blocks. We try to preallocate the number of blocks existing
|
||||
* so that small files don't waste inordinate amounts of space and large
|
||||
* files will eventually see large extents. This only works for
|
||||
* contiguous single stream writes or stages of files from the first
|
||||
* block. It doesn't work for concurrent stages, releasing behind
|
||||
* staging, sparse files, multi-node writes, etc. fallocate() is always
|
||||
* a better tool to use.
|
||||
* Optionally, we can simply preallocate large empty aligned regions.
|
||||
* This can waste a lot of space for small or sparse files but is
|
||||
* reasonable when a file population is known to be large and dense but
|
||||
* known to be written with non-streaming write patterns.
|
||||
*/
|
||||
static int alloc_block(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_extent *ext, u64 iblock,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
DECLARE_DATA_INFO(sb, datinf);
|
||||
struct scoutfs_mount_options opts;
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct data_ext_args args = {
|
||||
.ino = ino,
|
||||
@@ -394,17 +394,22 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
|
||||
.lock = lock,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent pre;
|
||||
struct scoutfs_extent pre = {0,};
|
||||
bool undo_pre = false;
|
||||
u64 blkno = 0;
|
||||
u64 online;
|
||||
u64 offline;
|
||||
u8 flags;
|
||||
u64 start;
|
||||
u64 count;
|
||||
u64 rem;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
/* can only allocate over existing unallocated offline extent */
|
||||
if (WARN_ON_ONCE(ext->len &&
|
||||
!(iblock >= ext->start && iblock <= ext_last(ext) &&
|
||||
@@ -413,66 +418,106 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
|
||||
|
||||
mutex_lock(&datinf->mutex);
|
||||
|
||||
scoutfs_inode_get_onoff(inode, &online, &offline);
|
||||
/* default to single allocation at the written block */
|
||||
start = iblock;
|
||||
count = 1;
|
||||
/* copy existing flags for preallocated regions */
|
||||
flags = ext->len ? ext->flags : 0;
|
||||
|
||||
if (ext->len) {
|
||||
/* limit preallocation to remaining existing (offline) extent */
|
||||
/*
|
||||
* Assume that offline writers are going to be writing
|
||||
* all the offline extents and try to preallocate the
|
||||
* rest of the unwritten extent.
|
||||
*/
|
||||
count = ext->len - (iblock - ext->start);
|
||||
flags = ext->flags;
|
||||
|
||||
} else if (opts.data_prealloc_contig_only) {
|
||||
/*
|
||||
* Only preallocate when a quick test of the online
|
||||
* block counts looks like we're a simple streaming
|
||||
* write. Try to write until the next extent but limit
|
||||
* the preallocation size to the number of online
|
||||
* blocks.
|
||||
*/
|
||||
scoutfs_inode_get_onoff(inode, &online, &offline);
|
||||
if (iblock > 1 && iblock == online) {
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
|
||||
iblock, 1, &found);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto out;
|
||||
if (found.len && found.start > iblock)
|
||||
count = found.start - iblock;
|
||||
else
|
||||
count = opts.data_prealloc_blocks;
|
||||
|
||||
count = min(iblock, count);
|
||||
}
|
||||
|
||||
} else {
|
||||
/* otherwise alloc to next extent */
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
|
||||
iblock, 1, &found);
|
||||
/*
|
||||
* Preallocation of aligned regions only preallocates if
|
||||
* the aligned region contains no extents at all. This
|
||||
* could be fooled by offline sparse extents but we
|
||||
* don't want to iterate over all offline extents in the
|
||||
* aligned region.
|
||||
*/
|
||||
div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
|
||||
start = iblock - rem;
|
||||
count = opts.data_prealloc_blocks;
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto out;
|
||||
if (found.len && found.start > iblock)
|
||||
count = found.start - iblock;
|
||||
else
|
||||
count = SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT;
|
||||
flags = 0;
|
||||
if (found.len && found.start < start + count)
|
||||
count = 1;
|
||||
}
|
||||
|
||||
/* overall prealloc limit */
|
||||
count = min_t(u64, count, SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT);
|
||||
|
||||
/* only strictly contiguous extending writes will try to preallocate */
|
||||
if (iblock > 1 && iblock == online)
|
||||
count = min(iblock, count);
|
||||
else
|
||||
count = 1;
|
||||
count = min_t(u64, count, opts.data_prealloc_blocks);
|
||||
|
||||
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
|
||||
&datinf->dalloc, count, &blkno, &count);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/*
|
||||
* An aligned prealloc attempt that gets a smaller extent can
|
||||
* fail to cover iblock, make sure that it does. This is a
|
||||
* pathological case so we don't try to move the window past
|
||||
* iblock. Just enough to cover it, which we know is safe.
|
||||
*/
|
||||
if (start + count <= iblock)
|
||||
start += (iblock - (start + count) + 1);
|
||||
|
||||
if (count > 1) {
|
||||
pre.start = iblock + 1;
|
||||
pre.len = count - 1;
|
||||
pre.map = blkno + 1;
|
||||
pre.start = start;
|
||||
pre.len = count;
|
||||
pre.map = blkno;
|
||||
pre.flags = flags | SEF_UNWRITTEN;
|
||||
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, pre.start,
|
||||
pre.len, pre.map, pre.flags);
|
||||
if (ret < 0) {
|
||||
err = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
|
||||
1, 0, flags);
|
||||
BUG_ON(err); /* couldn't restore original */
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
undo_pre = true;
|
||||
}
|
||||
|
||||
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno + (iblock - start), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* tell the caller we have a single block, could check next? */
|
||||
ext->start = iblock;
|
||||
ext->len = 1;
|
||||
ext->map = blkno;
|
||||
ext->map = blkno + (iblock - start);
|
||||
ext->flags = 0;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0 && blkno > 0) {
|
||||
if (undo_pre) {
|
||||
err = scoutfs_ext_set(sb, &data_ext_ops, &args,
|
||||
pre.start, pre.len, 0, flags);
|
||||
BUG_ON(err); /* leaked preallocated extent */
|
||||
}
|
||||
err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
|
||||
&datinf->data_freed, blkno, count);
|
||||
BUG_ON(err); /* leaked free blocks */
|
||||
@@ -586,8 +631,8 @@ static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh, int create)
|
||||
int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh,
|
||||
int create)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
int ret;
|
||||
@@ -983,9 +1028,6 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
u64 last;
|
||||
s64 ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
/* XXX support more flags */
|
||||
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
@@ -1003,18 +1045,22 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_mutex;
|
||||
|
||||
inode_dio_wait(inode);
|
||||
|
||||
down_write(&si->extent_sem);
|
||||
|
||||
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
||||
(offset + len > i_size_read(inode))) {
|
||||
ret = inode_newsize_ok(inode, offset + len);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
}
|
||||
|
||||
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
@@ -1024,7 +1070,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
|
||||
ret = fallocate_extents(sb, inode, iblock, last, lock);
|
||||
|
||||
@@ -1050,17 +1096,19 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
}
|
||||
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
goto out_extent;
|
||||
|
||||
iblock += ret;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
out_extent:
|
||||
up_write(&si->extent_sem);
|
||||
out_mutex:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
out:
|
||||
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -43,6 +43,9 @@ extern const struct file_operations scoutfs_file_fops;
|
||||
struct scoutfs_alloc;
|
||||
struct scoutfs_block_writer;
|
||||
|
||||
int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh,
|
||||
int create);
|
||||
|
||||
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
|
||||
u64 ino, u64 iblock, u64 last, bool offline,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
440
kmod/src/dir.c
440
kmod/src/dir.c
@@ -32,6 +32,7 @@
|
||||
#include "hash.h"
|
||||
#include "omap.h"
|
||||
#include "forest.h"
|
||||
#include "acl.h"
|
||||
#include "counters.h"
|
||||
#include "scoutfs_trace.h"
|
||||
|
||||
@@ -59,8 +60,6 @@
|
||||
* All the entries have a dirent struct with the full name in their
|
||||
* value. The dirent struct contains the name hash and readdir position
|
||||
* so that any item use can reference all the items for a given entry.
|
||||
* This is important for deleting all the items given a dentry that was
|
||||
* populated by lookup.
|
||||
*/
|
||||
|
||||
static unsigned int mode_to_type(umode_t mode)
|
||||
@@ -99,100 +98,12 @@ static unsigned int dentry_type(enum scoutfs_dentry_type type)
|
||||
return DT_UNKNOWN;
|
||||
}
|
||||
|
||||
/*
|
||||
* @lock_cov: tells revalidation that the dentry is still locked and valid.
|
||||
*
|
||||
* @pos, @hash: lets us remove items on final unlink without having to
|
||||
* look them up.
|
||||
*/
|
||||
struct dentry_info {
|
||||
struct scoutfs_lock_coverage lock_cov;
|
||||
u64 hash;
|
||||
u64 pos;
|
||||
};
|
||||
|
||||
static struct kmem_cache *dentry_info_cache;
|
||||
|
||||
static void scoutfs_d_release(struct dentry *dentry)
|
||||
{
|
||||
struct super_block *sb = dentry->d_sb;
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
|
||||
if (di) {
|
||||
scoutfs_lock_del_coverage(sb, &di->lock_cov);
|
||||
kmem_cache_free(dentry_info_cache, di);
|
||||
dentry->d_fsdata = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags);
|
||||
|
||||
static const struct dentry_operations scoutfs_dentry_ops = {
|
||||
.d_release = scoutfs_d_release,
|
||||
const struct dentry_operations scoutfs_dentry_ops = {
|
||||
.d_revalidate = scoutfs_d_revalidate,
|
||||
};
|
||||
|
||||
static int alloc_dentry_info(struct dentry *dentry)
|
||||
{
|
||||
struct dentry_info *di;
|
||||
|
||||
smp_rmb();
|
||||
if (dentry->d_op == &scoutfs_dentry_ops)
|
||||
return 0;
|
||||
|
||||
di = kmem_cache_zalloc(dentry_info_cache, GFP_NOFS);
|
||||
if (!di)
|
||||
return -ENOMEM;
|
||||
|
||||
scoutfs_lock_init_coverage(&di->lock_cov);
|
||||
|
||||
spin_lock(&dentry->d_lock);
|
||||
if (!dentry->d_fsdata) {
|
||||
dentry->d_fsdata = di;
|
||||
smp_wmb();
|
||||
d_set_d_op(dentry, &scoutfs_dentry_ops);
|
||||
}
|
||||
spin_unlock(&dentry->d_lock);
|
||||
|
||||
if (di != dentry->d_fsdata)
|
||||
kmem_cache_free(dentry_info_cache, di);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void update_dentry_info(struct super_block *sb, struct dentry *dentry,
|
||||
u64 hash, u64 pos, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
|
||||
if (WARN_ON_ONCE(di == NULL))
|
||||
return;
|
||||
|
||||
scoutfs_lock_add_coverage(sb, lock, &di->lock_cov);
|
||||
di->hash = hash;
|
||||
di->pos = pos;
|
||||
}
|
||||
|
||||
static u64 dentry_info_hash(struct dentry *dentry)
|
||||
{
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
|
||||
if (WARN_ON_ONCE(di == NULL))
|
||||
return 0;
|
||||
|
||||
return di->hash;
|
||||
}
|
||||
|
||||
static u64 dentry_info_pos(struct dentry *dentry)
|
||||
{
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
|
||||
if (WARN_ON_ONCE(di == NULL))
|
||||
return 0;
|
||||
|
||||
return di->pos;
|
||||
}
|
||||
|
||||
static void init_dirent_key(struct scoutfs_key *key, u8 type, u64 ino,
|
||||
u64 major, u64 minor)
|
||||
{
|
||||
@@ -317,62 +228,105 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that the caller's dentry still precisely matches our dirent
|
||||
* items.
|
||||
*
|
||||
* The caller has a dentry that the vfs revalidated before they acquired
|
||||
* their locks. If the dentry is still covered by a lock we immediately
|
||||
* return 0. If not, we check items and return -ENOENT if a positive
|
||||
* dentry no longer matches the items or -EEXIST if a negative entry's
|
||||
* name now has an item.
|
||||
*/
|
||||
static int verify_entry(struct super_block *sb, u64 dir_ino, struct dentry *dentry,
|
||||
struct scoutfs_lock *lock)
|
||||
static int lookup_dentry_dirent(struct super_block *sb, u64 dir_ino, struct dentry *dentry,
|
||||
struct scoutfs_dirent *dent_ret,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
return lookup_dirent(sb, dir_ino, dentry->d_name.name, dentry->d_name.len,
|
||||
dirent_name_hash(dentry->d_name.name, dentry->d_name.len),
|
||||
dent_ret, lock);
|
||||
}
|
||||
|
||||
static u64 dentry_parent_ino(struct dentry *dentry)
|
||||
{
|
||||
struct dentry *parent = NULL;
|
||||
struct inode *dir;
|
||||
u64 dir_ino = 0;
|
||||
|
||||
if ((parent = dget_parent(dentry)) && (dir = parent->d_inode))
|
||||
dir_ino = scoutfs_ino(dir);
|
||||
|
||||
dput(parent);
|
||||
return dir_ino;
|
||||
}
|
||||
|
||||
/* negative dentries return 0, our root ino is non-zero (1) */
|
||||
static u64 dentry_ino(struct dentry *dentry)
|
||||
{
|
||||
return dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0;
|
||||
}
|
||||
|
||||
static void set_dentry_fsdata(struct dentry *dentry, struct scoutfs_lock *lock)
|
||||
{
|
||||
void *now = (void *)(unsigned long)lock->refresh_gen;
|
||||
void *was;
|
||||
|
||||
/* didn't want to alloc :/ */
|
||||
BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(u64));
|
||||
BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(long));
|
||||
|
||||
do {
|
||||
was = dentry->d_fsdata;
|
||||
} while (cmpxchg(&dentry->d_fsdata, was, now) != was);
|
||||
}
|
||||
|
||||
static bool test_dentry_fsdata(struct dentry *dentry, u64 refresh)
|
||||
{
|
||||
u64 fsd = (unsigned long)ACCESS_ONCE(dentry->d_fsdata);
|
||||
|
||||
return fsd == refresh;
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate an operation caller's input dentry argument. If the fsdata
|
||||
* is valid then the underlying dirent items couldn't have changed and
|
||||
* we return 0. If fsdata is no longer protected by a lock or its
|
||||
* fields don't match then we check the dirent item. If the dirent item
|
||||
* doesn't match what the caller expected given their dentry fields then
|
||||
* we return an error.
|
||||
*/
|
||||
static int validate_dentry(struct super_block *sb, u64 dir_ino, struct dentry *dentry,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
u64 ino = dentry_ino(dentry);
|
||||
struct scoutfs_dirent dent = {0,};
|
||||
const char *name;
|
||||
u64 dentry_ino;
|
||||
int name_len;
|
||||
u64 hash;
|
||||
int ret;
|
||||
|
||||
if (scoutfs_lock_is_covered(sb, &di->lock_cov))
|
||||
return 0;
|
||||
if (test_dentry_fsdata(dentry, lock->refresh_gen)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
dentry_ino = dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0;
|
||||
name = dentry->d_name.name;
|
||||
name_len = dentry->d_name.len;
|
||||
hash = dirent_name_hash(name, name_len);
|
||||
|
||||
ret = lookup_dirent(sb, dir_ino, name, name_len, hash, &dent, lock);
|
||||
ret = lookup_dentry_dirent(sb, dir_ino, dentry, &dent, lock);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
return ret;
|
||||
goto out;
|
||||
|
||||
if (dentry_ino != le64_to_cpu(dent.ino) || di->hash != le64_to_cpu(dent.hash) ||
|
||||
di->pos != le64_to_cpu(dent.pos)) {
|
||||
if (dentry_ino)
|
||||
ret = -ENOENT;
|
||||
else
|
||||
ret = -EEXIST;
|
||||
/* use negative zeroed dent when lookup gave -ENOENT */
|
||||
if (!ino && dent.ino) {
|
||||
/* caller expected negative but there was a dirent */
|
||||
ret = -EEXIST;
|
||||
} else if (ino && !dent.ino) {
|
||||
/* caller expected positive but there was no dirent */
|
||||
ret = -ENOENT;
|
||||
} else if (ino != le64_to_cpu(dent.ino)) {
|
||||
/* name linked to different inode than caller's */
|
||||
ret = -ESTALE;
|
||||
} else {
|
||||
/* dirent ino matches dentry ino */
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
out:
|
||||
trace_scoutfs_validate_dentry(sb, dentry, dir_ino, ino, le64_to_cpu(dent.ino),
|
||||
lock->refresh_gen, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
{
|
||||
struct super_block *sb = dentry->d_sb;
|
||||
struct dentry_info *di = dentry->d_fsdata;
|
||||
struct dentry *parent = dget_parent(dentry);
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_dirent dent;
|
||||
bool is_covered = false;
|
||||
struct inode *dir;
|
||||
u64 dentry_ino;
|
||||
u64 dir_ino = dentry_parent_ino(dentry);
|
||||
int ret;
|
||||
|
||||
/* don't think this happens but we can find out */
|
||||
@@ -394,47 +348,7 @@ static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE(di == NULL)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
is_covered = scoutfs_lock_is_covered(sb, &di->lock_cov);
|
||||
if (is_covered) {
|
||||
scoutfs_inc_counter(sb, dentry_revalidate_locked);
|
||||
ret = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!parent || !parent->d_inode) {
|
||||
scoutfs_inc_counter(sb, dentry_revalidate_orphan);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
dir = parent->d_inode;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, dir, &lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = lookup_dirent(sb, scoutfs_ino(dir),
|
||||
dentry->d_name.name, dentry->d_name.len,
|
||||
dirent_name_hash(dentry->d_name.name,
|
||||
dentry->d_name.len),
|
||||
&dent, lock);
|
||||
if (ret == -ENOENT) {
|
||||
dent.ino = 0;
|
||||
dent.hash = 0;
|
||||
dent.pos = 0;
|
||||
} else if (ret < 0) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
dentry_ino = dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0;
|
||||
|
||||
if ((dentry_ino == le64_to_cpu(dent.ino))) {
|
||||
update_dentry_info(sb, dentry, le64_to_cpu(dent.hash),
|
||||
le64_to_cpu(dent.pos), lock);
|
||||
if (test_dentry_fsdata(dentry, scoutfs_lock_ino_refresh_gen(sb, dir_ino))) {
|
||||
scoutfs_inc_counter(sb, dentry_revalidate_valid);
|
||||
ret = 1;
|
||||
} else {
|
||||
@@ -443,10 +357,7 @@ static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
}
|
||||
|
||||
out:
|
||||
trace_scoutfs_d_revalidate(sb, dentry, flags, parent, is_covered, ret);
|
||||
|
||||
dput(parent);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
trace_scoutfs_d_revalidate(sb, dentry, flags, dir_ino, ret);
|
||||
|
||||
if (ret < 0 && ret != -ECHILD)
|
||||
scoutfs_inc_counter(sb, dentry_revalidate_error);
|
||||
@@ -483,10 +394,6 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = alloc_dentry_info(dentry);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, dir, &dir_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
@@ -500,8 +407,7 @@ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
ino = le64_to_cpu(dent.ino);
|
||||
}
|
||||
if (ret == 0)
|
||||
update_dentry_info(sb, dentry, le64_to_cpu(dent.hash),
|
||||
le64_to_cpu(dent.pos), dir_lock);
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
@@ -720,15 +626,11 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
|
||||
struct list_head *ind_locks)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode;
|
||||
struct inode *inode = NULL;
|
||||
u64 ind_seq;
|
||||
int ret = 0;
|
||||
u64 ino;
|
||||
|
||||
ret = alloc_dentry_info(dentry);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
ret = scoutfs_alloc_ino(sb, S_ISDIR(mode), &ino);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
@@ -765,11 +667,10 @@ retry:
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
inode = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode) ?:
|
||||
scoutfs_init_acl_locked(inode, dir, *inode_lock, *dir_lock, ind_locks);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_dirty_inode_item(dir, *dir_lock);
|
||||
out:
|
||||
@@ -787,6 +688,8 @@ out_unlock:
|
||||
*orph_lock = NULL;
|
||||
}
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
inode = ERR_PTR(ret);
|
||||
}
|
||||
|
||||
@@ -816,7 +719,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
return PTR_ERR(inode);
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
ret = verify_entry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@@ -829,7 +732,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
update_dentry_info(sb, dentry, hash, pos, dir_lock);
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
@@ -903,19 +806,15 @@ static int scoutfs_link(struct dentry *old_dentry,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
||||
if (inode->i_nlink >= SCOUTFS_LINK_MAX) {
|
||||
ret = -EMLINK;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = alloc_dentry_info(dentry);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
ret = verify_entry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
||||
dir_size = i_size_read(dir) + dentry->d_name.len;
|
||||
|
||||
if (inode->i_nlink == 0) {
|
||||
@@ -941,7 +840,7 @@ retry:
|
||||
goto out;
|
||||
|
||||
if (del_orphan) {
|
||||
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
|
||||
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
@@ -953,11 +852,11 @@ retry:
|
||||
scoutfs_ino(inode), inode->i_mode, dir_lock,
|
||||
inode_lock);
|
||||
if (ret) {
|
||||
err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
|
||||
err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock);
|
||||
WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
|
||||
goto out;
|
||||
}
|
||||
update_dentry_info(sb, dentry, hash, pos, dir_lock);
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, dir_size);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
@@ -1005,9 +904,11 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
struct scoutfs_lock *inode_lock = NULL;
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *dir_lock = NULL;
|
||||
struct scoutfs_dirent dent;
|
||||
LIST_HEAD(ind_locks);
|
||||
u64 ind_seq;
|
||||
int ret = 0;
|
||||
u64 hash;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE,
|
||||
@@ -1016,11 +917,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = alloc_dentry_info(dentry);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
ret = verify_entry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
|
||||
@@ -1029,6 +926,13 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
|
||||
|
||||
ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash,
|
||||
&dent, dir_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (should_orphan(inode)) {
|
||||
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
|
||||
&orph_lock);
|
||||
@@ -1047,21 +951,20 @@ retry:
|
||||
goto unlock;
|
||||
|
||||
if (should_orphan(inode)) {
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
|
||||
dentry_info_pos(dentry), scoutfs_ino(inode),
|
||||
dir_lock, inode_lock);
|
||||
ret = del_entry_items(sb, scoutfs_ino(dir), le64_to_cpu(dent.hash), le64_to_cpu(dent.pos),
|
||||
scoutfs_ino(inode), dir_lock, inode_lock);
|
||||
if (ret) {
|
||||
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
|
||||
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock);
|
||||
WARN_ON_ONCE(ret); /* should have been dirty */
|
||||
goto out;
|
||||
}
|
||||
|
||||
update_dentry_info(sb, dentry, 0, 0, dir_lock);
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
dir->i_ctime = ts;
|
||||
dir->i_mtime = ts;
|
||||
@@ -1242,10 +1145,11 @@ const struct inode_operations scoutfs_symlink_iops = {
|
||||
.put_link = scoutfs_put_link,
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
.setxattr = scoutfs_setxattr,
|
||||
.getxattr = scoutfs_getxattr,
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = scoutfs_removexattr,
|
||||
.removexattr = generic_removexattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -1273,17 +1177,13 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
name_len > PATH_MAX || name_len > SCOUTFS_SYMLINK_MAX_SIZE)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
ret = alloc_dentry_info(dentry);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
|
||||
&dir_lock, &inode_lock, NULL, &ind_locks);
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
ret = verify_entry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@@ -1301,7 +1201,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
update_dentry_info(sb, dentry, hash, pos, dir_lock);
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
@@ -1319,11 +1219,11 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
insert_inode_hash(inode);
|
||||
/* XXX need to set i_op/fop before here for sec callbacks */
|
||||
d_instantiate(dentry, inode);
|
||||
inode = NULL;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0) {
|
||||
/* XXX remove inode items */
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock,
|
||||
NULL, name_len);
|
||||
@@ -1334,6 +1234,9 @@ out:
|
||||
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1628,6 +1531,8 @@ static int scoutfs_rename_common(struct inode *old_dir,
|
||||
struct scoutfs_lock *old_inode_lock = NULL;
|
||||
struct scoutfs_lock *new_inode_lock = NULL;
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_dirent new_dent;
|
||||
struct scoutfs_dirent old_dent;
|
||||
struct timespec now;
|
||||
bool ins_new = false;
|
||||
bool del_new = false;
|
||||
@@ -1675,19 +1580,18 @@ static int scoutfs_rename_common(struct inode *old_dir,
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
/* make sure that the entries assumed by the argument still exist */
|
||||
ret = validate_dentry(sb, scoutfs_ino(old_dir), old_dentry, old_dir_lock) ?:
|
||||
validate_dentry(sb, scoutfs_ino(new_dir), new_dentry, new_dir_lock);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
/* test dir i_size now that it's refreshed */
|
||||
if (new_inode && S_ISDIR(new_inode->i_mode) && i_size_read(new_inode)) {
|
||||
ret = -ENOTEMPTY;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/* make sure that the entries assumed by the argument still exist */
|
||||
ret = alloc_dentry_info(old_dentry) ?:
|
||||
alloc_dentry_info(new_dentry) ?:
|
||||
verify_entry(sb, scoutfs_ino(old_dir), old_dentry, old_dir_lock) ?:
|
||||
verify_entry(sb, scoutfs_ino(new_dir), new_dentry, new_dir_lock);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
if ((flags & RENAME_NOREPLACE) && (new_inode != NULL)) {
|
||||
ret = -EEXIST;
|
||||
@@ -1730,10 +1634,12 @@ retry:
|
||||
|
||||
/* remove the new entry if it exists */
|
||||
if (new_inode) {
|
||||
ret = del_entry_items(sb, scoutfs_ino(new_dir),
|
||||
dentry_info_hash(new_dentry),
|
||||
dentry_info_pos(new_dentry),
|
||||
scoutfs_ino(new_inode),
|
||||
ret = lookup_dirent(sb, scoutfs_ino(new_dir), new_dentry->d_name.name,
|
||||
new_dentry->d_name.len, new_hash, &new_dent, new_dir_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
ret = del_entry_items(sb, scoutfs_ino(new_dir), le64_to_cpu(new_dent.hash),
|
||||
le64_to_cpu(new_dent.pos), scoutfs_ino(new_inode),
|
||||
new_dir_lock, new_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
@@ -1749,18 +1655,22 @@ retry:
|
||||
goto out;
|
||||
del_new = true;
|
||||
|
||||
ret = lookup_dirent(sb, scoutfs_ino(old_dir), old_dentry->d_name.name,
|
||||
old_dentry->d_name.len, old_hash, &old_dent, old_dir_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* remove the old entry */
|
||||
ret = del_entry_items(sb, scoutfs_ino(old_dir),
|
||||
dentry_info_hash(old_dentry),
|
||||
dentry_info_pos(old_dentry),
|
||||
scoutfs_ino(old_inode),
|
||||
ret = del_entry_items(sb, scoutfs_ino(old_dir), le64_to_cpu(old_dent.hash),
|
||||
le64_to_cpu(old_dent.pos), scoutfs_ino(old_inode),
|
||||
old_dir_lock, old_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
ins_old = true;
|
||||
|
||||
if (should_orphan(new_inode)) {
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock,
|
||||
new_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
@@ -1768,7 +1678,7 @@ retry:
|
||||
/* won't fail from here on out, update all the vfs structs */
|
||||
|
||||
/* the caller will use d_move to move the old_dentry into place */
|
||||
update_dentry_info(sb, old_dentry, new_hash, new_pos, new_dir_lock);
|
||||
set_dentry_fsdata(old_dentry, new_dir_lock);
|
||||
|
||||
i_size_write(old_dir, i_size_read(old_dir) - old_dentry->d_name.len);
|
||||
if (!new_inode)
|
||||
@@ -1833,8 +1743,8 @@ out:
|
||||
err = 0;
|
||||
if (ins_old)
|
||||
err = add_entry_items(sb, scoutfs_ino(old_dir),
|
||||
dentry_info_hash(old_dentry),
|
||||
dentry_info_pos(old_dentry),
|
||||
le64_to_cpu(old_dent.hash),
|
||||
le64_to_cpu(old_dent.pos),
|
||||
old_dentry->d_name.name,
|
||||
old_dentry->d_name.len,
|
||||
scoutfs_ino(old_inode),
|
||||
@@ -1850,8 +1760,8 @@ out:
|
||||
|
||||
if (ins_new && err == 0)
|
||||
err = add_entry_items(sb, scoutfs_ino(new_dir),
|
||||
dentry_info_hash(new_dentry),
|
||||
dentry_info_pos(new_dentry),
|
||||
le64_to_cpu(new_dent.hash),
|
||||
le64_to_cpu(new_dent.pos),
|
||||
new_dentry->d_name.name,
|
||||
new_dentry->d_name.len,
|
||||
scoutfs_ino(new_inode),
|
||||
@@ -1922,11 +1832,9 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
return PTR_ERR(inode);
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
|
||||
if (ret < 0) {
|
||||
iput(inode);
|
||||
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock);
|
||||
if (ret < 0)
|
||||
goto out; /* XXX returning error but items created */
|
||||
}
|
||||
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
si->crtime = inode->i_mtime;
|
||||
@@ -1939,7 +1847,6 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
|
||||
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
iput(inode);
|
||||
|
||||
out:
|
||||
scoutfs_release_trans(sb);
|
||||
@@ -1948,6 +1855,9 @@ out:
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
|
||||
if (!IS_ERR_OR_NULL(inode))
|
||||
iput(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1975,32 +1885,14 @@ const struct inode_operations_wrapper scoutfs_dir_iops = {
|
||||
.rename = scoutfs_rename,
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
.setxattr = scoutfs_setxattr,
|
||||
.getxattr = scoutfs_getxattr,
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = scoutfs_removexattr,
|
||||
.removexattr = generic_removexattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
.symlink = scoutfs_symlink,
|
||||
.permission = scoutfs_permission,
|
||||
},
|
||||
.tmpfile = scoutfs_tmpfile,
|
||||
.rename2 = scoutfs_rename2,
|
||||
};
|
||||
|
||||
void scoutfs_dir_exit(void)
|
||||
{
|
||||
if (dentry_info_cache) {
|
||||
kmem_cache_destroy(dentry_info_cache);
|
||||
dentry_info_cache = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_dir_init(void)
|
||||
{
|
||||
dentry_info_cache = kmem_cache_create("scoutfs_dentry_info",
|
||||
sizeof(struct dentry_info), 0,
|
||||
SLAB_RECLAIM_ACCOUNT, NULL);
|
||||
if (!dentry_info_cache)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,8 @@ extern const struct file_operations scoutfs_dir_fops;
|
||||
extern const struct inode_operations_wrapper scoutfs_dir_iops;
|
||||
extern const struct inode_operations scoutfs_symlink_iops;
|
||||
|
||||
extern const struct dentry_operations scoutfs_dentry_ops;
|
||||
|
||||
struct scoutfs_link_backref_entry {
|
||||
struct list_head head;
|
||||
u64 dir_ino;
|
||||
@@ -29,7 +31,4 @@ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
|
||||
int scoutfs_symlink_drop(struct super_block *sb, u64 ino,
|
||||
struct scoutfs_lock *lock, u64 i_size);
|
||||
|
||||
int scoutfs_dir_init(void);
|
||||
void scoutfs_dir_exit(void);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -395,12 +395,13 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
|
||||
int scoutfs_fence_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct fence_info *fi;
|
||||
int ret;
|
||||
|
||||
/* can only fence if we can be elected by quorum */
|
||||
if (opts->quorum_slot_nr == -1) {
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr == -1) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
662
kmod/src/inode.c
662
kmod/src/inode.c
File diff suppressed because it is too large
Load Diff
@@ -56,14 +56,16 @@ struct scoutfs_inode_info {
|
||||
|
||||
struct scoutfs_lock_coverage ino_lock_cov;
|
||||
|
||||
/* drop if i_count hits 0, allows drop while invalidate holds coverage */
|
||||
bool drop_invalidated;
|
||||
struct llist_node iput_llnode;
|
||||
atomic_t iput_count;
|
||||
struct list_head iput_head;
|
||||
unsigned long iput_count;
|
||||
unsigned long iput_flags;
|
||||
|
||||
struct inode inode;
|
||||
};
|
||||
|
||||
/* try to prune dcache aliases with queued iput */
|
||||
#define SI_IPUT_FLAG_PRUNE (1 << 0)
|
||||
|
||||
static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
|
||||
{
|
||||
return container_of(inode, struct scoutfs_inode_info, inode);
|
||||
@@ -78,11 +80,13 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
|
||||
void scoutfs_destroy_inode(struct inode *inode);
|
||||
int scoutfs_drop_inode(struct inode *inode);
|
||||
void scoutfs_evict_inode(struct inode *inode);
|
||||
void scoutfs_inode_queue_iput(struct inode *inode);
|
||||
void scoutfs_inode_queue_iput(struct inode *inode, unsigned long flags);
|
||||
|
||||
#define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
|
||||
struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
|
||||
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
|
||||
struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino);
|
||||
struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino);
|
||||
|
||||
|
||||
void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino);
|
||||
void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
|
||||
@@ -104,9 +108,8 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
|
||||
struct list_head *ind_locks);
|
||||
|
||||
int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
|
||||
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t rdev, u64 ino,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
|
||||
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret);
|
||||
|
||||
void scoutfs_inode_set_meta_seq(struct inode *inode);
|
||||
void scoutfs_inode_set_data_seq(struct inode *inode);
|
||||
@@ -124,8 +127,11 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
|
||||
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
|
||||
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
|
||||
struct scoutfs_lock *primary);
|
||||
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
|
||||
struct scoutfs_lock *primary);
|
||||
void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb);
|
||||
|
||||
void scoutfs_inode_queue_writeback(struct inode *inode);
|
||||
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
|
||||
|
||||
@@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
|
||||
if (sblock > eblock)
|
||||
return -EINVAL;
|
||||
|
||||
inode = scoutfs_ilookup(sb, args.ino);
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino);
|
||||
if (!inode) {
|
||||
ret = -ESTALE;
|
||||
goto out;
|
||||
|
||||
@@ -1676,6 +1676,14 @@ static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int optional_lock_mode_match(struct scoutfs_lock *lock, int mode)
|
||||
{
|
||||
if (WARN_ON_ONCE(lock && lock->mode != mode))
|
||||
return -EINVAL;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the cached item's value into the caller's value. The number of
|
||||
* bytes copied is returned. A null val returns 0.
|
||||
@@ -1832,12 +1840,19 @@ out:
|
||||
* also increase the seqs. It lets us limit the inputs of item merging
|
||||
* to the last stable seq and ensure that all the items in open
|
||||
* transactions and granted locks will have greater seqs.
|
||||
*
|
||||
* This is a little awkward for WRITE_ONLY locks which can have much
|
||||
* older versions than the version of locked primary data that they're
|
||||
* operating on behalf of. Callers can optionally provide that primary
|
||||
* lock to get the version from. This ensures that items created under
|
||||
* WRITE_ONLY locks can not have versions less than their primary data.
|
||||
*/
|
||||
static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock)
|
||||
static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct scoutfs_lock *primary)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
|
||||
return max(sbi->trans_seq, lock->write_seq);
|
||||
return max3(sbi->trans_seq, lock->write_seq, primary ? primary->write_seq : 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1872,7 +1887,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
|
||||
if (!item || item->deletion) {
|
||||
ret = -ENOENT;
|
||||
} else {
|
||||
item->seq = item_seq(sb, lock);
|
||||
item->seq = item_seq(sb, lock, NULL);
|
||||
mark_item_dirty(sb, cinf, pg, NULL, item);
|
||||
ret = 0;
|
||||
}
|
||||
@@ -1889,10 +1904,10 @@ out:
|
||||
*/
|
||||
static int item_create(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock,
|
||||
int mode, bool force)
|
||||
struct scoutfs_lock *primary, int mode, bool force)
|
||||
{
|
||||
DECLARE_ITEM_CACHE_INFO(sb, cinf);
|
||||
const u64 seq = item_seq(sb, lock);
|
||||
const u64 seq = item_seq(sb, lock, primary);
|
||||
struct cached_item *found;
|
||||
struct cached_item *item;
|
||||
struct cached_page *pg;
|
||||
@@ -1902,7 +1917,8 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key,
|
||||
|
||||
scoutfs_inc_counter(sb, item_create);
|
||||
|
||||
if ((ret = lock_safe(lock, key, mode)))
|
||||
if ((ret = lock_safe(lock, key, mode)) ||
|
||||
(ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE)))
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_forest_set_bloom_bits(sb, lock);
|
||||
@@ -1943,15 +1959,15 @@ out:
|
||||
int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock)
|
||||
{
|
||||
return item_create(sb, key, val, val_len, lock,
|
||||
return item_create(sb, key, val, val_len, lock, NULL,
|
||||
SCOUTFS_LOCK_READ, false);
|
||||
}
|
||||
|
||||
int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len,
|
||||
struct scoutfs_lock *lock)
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *primary)
|
||||
{
|
||||
return item_create(sb, key, val, val_len, lock,
|
||||
return item_create(sb, key, val, val_len, lock, primary,
|
||||
SCOUTFS_LOCK_WRITE_ONLY, true);
|
||||
}
|
||||
|
||||
@@ -1965,7 +1981,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock)
|
||||
{
|
||||
DECLARE_ITEM_CACHE_INFO(sb, cinf);
|
||||
const u64 seq = item_seq(sb, lock);
|
||||
const u64 seq = item_seq(sb, lock, NULL);
|
||||
struct cached_item *item;
|
||||
struct cached_item *found;
|
||||
struct cached_page *pg;
|
||||
@@ -2025,12 +2041,16 @@ out:
|
||||
* current items so the caller always writes with write only locks. If
|
||||
* combining the current delta item and the caller's item results in a
|
||||
* null we can just drop it, we don't have to emit a deletion item.
|
||||
*
|
||||
* Delta items don't have to worry about creating items with old
|
||||
* versions under write_only locks. The versions don't impact how we
|
||||
* merge two items.
|
||||
*/
|
||||
int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock)
|
||||
{
|
||||
DECLARE_ITEM_CACHE_INFO(sb, cinf);
|
||||
const u64 seq = item_seq(sb, lock);
|
||||
const u64 seq = item_seq(sb, lock, NULL);
|
||||
struct cached_item *item;
|
||||
struct cached_page *pg;
|
||||
struct rb_node **pnode;
|
||||
@@ -2099,10 +2119,11 @@ out:
|
||||
* deletion item if there isn't one already cached.
|
||||
*/
|
||||
static int item_delete(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock, int mode, bool force)
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *primary,
|
||||
int mode, bool force)
|
||||
{
|
||||
DECLARE_ITEM_CACHE_INFO(sb, cinf);
|
||||
const u64 seq = item_seq(sb, lock);
|
||||
const u64 seq = item_seq(sb, lock, primary);
|
||||
struct cached_item *item;
|
||||
struct cached_page *pg;
|
||||
struct rb_node **pnode;
|
||||
@@ -2111,7 +2132,8 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
|
||||
|
||||
scoutfs_inc_counter(sb, item_delete);
|
||||
|
||||
if ((ret = lock_safe(lock, key, mode)))
|
||||
if ((ret = lock_safe(lock, key, mode)) ||
|
||||
(ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE)))
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_forest_set_bloom_bits(sb, lock);
|
||||
@@ -2161,13 +2183,13 @@ out:
|
||||
int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE, false);
|
||||
return item_delete(sb, key, lock, NULL, SCOUTFS_LOCK_WRITE, false);
|
||||
}
|
||||
|
||||
int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock)
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *primary)
|
||||
{
|
||||
return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE_ONLY, true);
|
||||
return item_delete(sb, key, lock, primary, SCOUTFS_LOCK_WRITE_ONLY, true);
|
||||
}
|
||||
|
||||
u64 scoutfs_item_dirty_pages(struct super_block *sb)
|
||||
|
||||
@@ -15,16 +15,15 @@ int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock);
|
||||
int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len,
|
||||
struct scoutfs_lock *lock);
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *primary);
|
||||
int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock);
|
||||
int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key,
|
||||
void *val, int val_len, struct scoutfs_lock *lock);
|
||||
int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_item_delete_force(struct super_block *sb,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_lock *lock, struct scoutfs_lock *primary);
|
||||
|
||||
u64 scoutfs_item_dirty_pages(struct super_block *sb);
|
||||
int scoutfs_item_write_dirty(struct super_block *sb);
|
||||
|
||||
@@ -46,4 +46,10 @@ static inline int dir_emit_dots(struct file *file, void *dirent,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef KC_POSIX_ACL_VALID_USER_NS
|
||||
#define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(user_ns, acl)
|
||||
#else
|
||||
#define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(acl)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
101
kmod/src/lock.c
101
kmod/src/lock.c
@@ -18,6 +18,7 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/posix_acl.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "lock.h"
|
||||
@@ -129,20 +130,17 @@ static bool lock_modes_match(int granted, int requested)
|
||||
* allows deletions to be performed by unlink without having to wait for
|
||||
* remote cached inodes to be dropped.
|
||||
*
|
||||
* If the cached inode was already deferring final inode deletion then
|
||||
* we can't perform that inline in invalidation. The locking alone
|
||||
* deadlock, and it might also take multiple transactions to fully
|
||||
* delete an inode with significant metadata. We only perform the iput
|
||||
* inline if we know that possible eviction can't perform the final
|
||||
* deletion, otherwise we kick it off to async work.
|
||||
* We kick the d_prune and iput off to async work because they can end
|
||||
* up in final iput and inode eviction item deletion which would
|
||||
* deadlock. d_prune->dput can end up in iput on parents in different
|
||||
* locks entirely.
|
||||
*/
|
||||
static void invalidate_inode(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_LOCK_INFO(sb, linfo);
|
||||
struct scoutfs_inode_info *si;
|
||||
struct inode *inode;
|
||||
|
||||
inode = scoutfs_ilookup(sb, ino);
|
||||
inode = scoutfs_ilookup_nowait_nonewfree(sb, ino);
|
||||
if (inode) {
|
||||
si = SCOUTFS_I(inode);
|
||||
|
||||
@@ -152,17 +150,9 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
|
||||
scoutfs_data_wait_changed(inode);
|
||||
}
|
||||
|
||||
/* can't touch during unmount, dcache destroys w/o locks */
|
||||
if (!linfo->unmounting)
|
||||
d_prune_aliases(inode);
|
||||
forget_all_cached_acls(inode);
|
||||
|
||||
si->drop_invalidated = true;
|
||||
if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
|
||||
iput(inode);
|
||||
} else {
|
||||
/* defer iput to work context so we don't evict inodes from invalidation */
|
||||
scoutfs_inode_queue_iput(inode);
|
||||
}
|
||||
scoutfs_inode_queue_iput(inode, SI_IPUT_FLAG_PRUNE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,16 +188,6 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
/* have to invalidate if we're not in the only usable case */
|
||||
if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
|
||||
retry:
|
||||
/* invalidate inodes before removing coverage */
|
||||
if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
|
||||
ino = le64_to_cpu(lock->start.ski_ino);
|
||||
last = le64_to_cpu(lock->end.ski_ino);
|
||||
while (ino <= last) {
|
||||
invalidate_inode(sb, ino);
|
||||
ino++;
|
||||
}
|
||||
}
|
||||
|
||||
/* remove cov items to tell users that their cache is stale */
|
||||
spin_lock(&lock->cov_list_lock);
|
||||
list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
|
||||
@@ -223,6 +203,16 @@ retry:
|
||||
}
|
||||
spin_unlock(&lock->cov_list_lock);
|
||||
|
||||
/* invalidate inodes after removing coverage so drop/evict aren't covered */
|
||||
if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
|
||||
ino = le64_to_cpu(lock->start.ski_ino);
|
||||
last = le64_to_cpu(lock->end.ski_ino);
|
||||
while (ino <= last) {
|
||||
invalidate_inode(sb, ino);
|
||||
ino++;
|
||||
}
|
||||
}
|
||||
|
||||
scoutfs_item_invalidate(sb, &lock->start, &lock->end);
|
||||
}
|
||||
|
||||
@@ -255,7 +245,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
|
||||
BUG_ON(!list_empty(&lock->shrink_head));
|
||||
BUG_ON(!list_empty(&lock->cov_list));
|
||||
|
||||
scoutfs_omap_free_lock_data(lock->omap_data);
|
||||
kfree(lock->inode_deletion_data);
|
||||
kfree(lock);
|
||||
}
|
||||
|
||||
@@ -289,9 +279,9 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
|
||||
lock->sb = sb;
|
||||
init_waitqueue_head(&lock->waitq);
|
||||
lock->mode = SCOUTFS_LOCK_NULL;
|
||||
lock->invalidating_mode = SCOUTFS_LOCK_NULL;
|
||||
|
||||
atomic64_set(&lock->forest_bloom_nr, 0);
|
||||
spin_lock_init(&lock->omap_spinlock);
|
||||
|
||||
trace_scoutfs_lock_alloc(sb, lock);
|
||||
|
||||
@@ -667,7 +657,9 @@ struct inv_req {
|
||||
*
|
||||
* Before we start invalidating the lock we set the lock to the new
|
||||
* mode, preventing further incompatible users of the old mode from
|
||||
* using the lock while we're invalidating.
|
||||
* using the lock while we're invalidating. We record the previously
|
||||
* granted mode so that we can send lock recover responses with the old
|
||||
* granted mode during invalidation.
|
||||
*/
|
||||
static void lock_invalidate_worker(struct work_struct *work)
|
||||
{
|
||||
@@ -692,7 +684,8 @@ static void lock_invalidate_worker(struct work_struct *work)
|
||||
if (!lock_counts_match(nl->new_mode, lock->users))
|
||||
continue;
|
||||
|
||||
/* set the new mode, no incompatible users during inval */
|
||||
/* set the new mode, no incompatible users during inval, recov needs old */
|
||||
lock->invalidating_mode = lock->mode;
|
||||
lock->mode = nl->new_mode;
|
||||
|
||||
/* move everyone that's ready to our private list */
|
||||
@@ -735,6 +728,8 @@ static void lock_invalidate_worker(struct work_struct *work)
|
||||
list_del(&ireq->head);
|
||||
kfree(ireq);
|
||||
|
||||
lock->invalidating_mode = SCOUTFS_LOCK_NULL;
|
||||
|
||||
if (list_empty(&lock->inv_list)) {
|
||||
/* finish if another request didn't arrive */
|
||||
list_del_init(&lock->inv_head);
|
||||
@@ -825,6 +820,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
|
||||
{
|
||||
DECLARE_LOCK_INFO(sb, linfo);
|
||||
struct scoutfs_net_lock_recover *nlr;
|
||||
enum scoutfs_lock_mode mode;
|
||||
struct scoutfs_lock *lock;
|
||||
struct scoutfs_lock *next;
|
||||
struct rb_node *node;
|
||||
@@ -845,10 +841,15 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
|
||||
|
||||
for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {
|
||||
|
||||
if (lock->invalidating_mode != SCOUTFS_LOCK_NULL)
|
||||
mode = lock->invalidating_mode;
|
||||
else
|
||||
mode = lock->mode;
|
||||
|
||||
nlr->locks[i].key = lock->start;
|
||||
nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq);
|
||||
nlr->locks[i].old_mode = lock->mode;
|
||||
nlr->locks[i].new_mode = lock->mode;
|
||||
nlr->locks[i].old_mode = mode;
|
||||
nlr->locks[i].new_mode = mode;
|
||||
|
||||
node = rb_next(&lock->node);
|
||||
if (node)
|
||||
@@ -1514,6 +1515,38 @@ void scoutfs_lock_flush_invalidate(struct super_block *sb)
|
||||
flush_work(&linfo->inv_work);
|
||||
}
|
||||
|
||||
static u64 get_held_lock_refresh_gen(struct super_block *sb, struct scoutfs_key *start)
|
||||
{
|
||||
DECLARE_LOCK_INFO(sb, linfo);
|
||||
struct scoutfs_lock *lock;
|
||||
u64 refresh_gen = 0;
|
||||
|
||||
/* this can be called from all manner of places */
|
||||
if (!linfo)
|
||||
return 0;
|
||||
|
||||
spin_lock(&linfo->lock);
|
||||
lock = lock_lookup(sb, start, NULL);
|
||||
if (lock) {
|
||||
if (lock_mode_can_read(lock->mode))
|
||||
refresh_gen = lock->refresh_gen;
|
||||
}
|
||||
spin_unlock(&linfo->lock);
|
||||
|
||||
return refresh_gen;
|
||||
}
|
||||
|
||||
u64 scoutfs_lock_ino_refresh_gen(struct super_block *sb, u64 ino)
|
||||
{
|
||||
struct scoutfs_key start;
|
||||
|
||||
scoutfs_key_set_zeros(&start);
|
||||
start.sk_zone = SCOUTFS_FS_ZONE;
|
||||
start.ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
|
||||
|
||||
return get_held_lock_refresh_gen(sb, &start);
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller is going to be shutting down transactions and the client.
|
||||
* We need to make sure that locking won't call either after we return.
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID
|
||||
|
||||
struct scoutfs_omap_lock;
|
||||
struct inode_deletion_lock_data;
|
||||
|
||||
/*
|
||||
* A few fields (start, end, refresh_gen, write_seq, granted_mode)
|
||||
@@ -39,6 +39,7 @@ struct scoutfs_lock {
|
||||
struct list_head cov_list;
|
||||
|
||||
enum scoutfs_lock_mode mode;
|
||||
enum scoutfs_lock_mode invalidating_mode;
|
||||
unsigned int waiters[SCOUTFS_LOCK_NR_MODES];
|
||||
unsigned int users[SCOUTFS_LOCK_NR_MODES];
|
||||
|
||||
@@ -47,9 +48,8 @@ struct scoutfs_lock {
|
||||
/* the forest tracks which log tree last saw bloom bit updates */
|
||||
atomic64_t forest_bloom_nr;
|
||||
|
||||
/* open ino mapping has a valid map for a held write lock */
|
||||
spinlock_t omap_spinlock;
|
||||
struct scoutfs_omap_lock_data *omap_data;
|
||||
/* inode deletion tracks some state per lock */
|
||||
struct inode_deletion_lock_data *inode_deletion_data;
|
||||
};
|
||||
|
||||
struct scoutfs_lock_coverage {
|
||||
@@ -100,6 +100,8 @@ void scoutfs_lock_del_coverage(struct super_block *sb,
|
||||
bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
|
||||
enum scoutfs_lock_mode mode);
|
||||
|
||||
u64 scoutfs_lock_ino_refresh_gen(struct super_block *sb, u64 ino);
|
||||
|
||||
void scoutfs_free_unused_locks(struct super_block *sb);
|
||||
|
||||
int scoutfs_lock_setup(struct super_block *sb);
|
||||
|
||||
@@ -749,7 +749,7 @@ out:
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "lock server err %d during client rid %016llx farewell, shutting down",
|
||||
ret, rid);
|
||||
scoutfs_server_abort(sb);
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -355,6 +355,7 @@ static int submit_send(struct super_block *sb,
|
||||
}
|
||||
if (rid != 0) {
|
||||
spin_unlock(&conn->lock);
|
||||
kfree(msend);
|
||||
return -ENOTCONN;
|
||||
}
|
||||
}
|
||||
@@ -991,6 +992,8 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
acc_sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
/* inherit accepted request funcs from listening conn */
|
||||
acc_conn = scoutfs_net_alloc_conn(sb, conn->notify_up,
|
||||
conn->notify_down,
|
||||
@@ -1053,6 +1056,8 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
/* caller specified connect timeout */
|
||||
tv.tv_sec = conn->connect_timeout_ms / MSEC_PER_SEC;
|
||||
tv.tv_usec = (conn->connect_timeout_ms % MSEC_PER_SEC) * USEC_PER_MSEC;
|
||||
@@ -1292,7 +1297,7 @@ restart:
|
||||
if (ret) {
|
||||
scoutfs_err(sb, "client fence returned err %d, shutting down server",
|
||||
ret);
|
||||
scoutfs_server_abort(sb);
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
}
|
||||
destroy_conn(acc);
|
||||
@@ -1341,10 +1346,12 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
if (!conn)
|
||||
return NULL;
|
||||
|
||||
conn->info = kzalloc(info_size, GFP_NOFS);
|
||||
if (!conn->info) {
|
||||
kfree(conn);
|
||||
return NULL;
|
||||
if (info_size) {
|
||||
conn->info = kzalloc(info_size, GFP_NOFS);
|
||||
if (!conn->info) {
|
||||
kfree(conn);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
conn->workq = alloc_workqueue("scoutfs_net_%s",
|
||||
@@ -1450,6 +1457,8 @@ int scoutfs_net_bind(struct super_block *sb,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
optval = 1;
|
||||
ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
|
||||
(char *)&optval, sizeof(optval));
|
||||
|
||||
313
kmod/src/omap.c
313
kmod/src/omap.c
@@ -30,27 +30,22 @@
|
||||
/*
|
||||
* As a client removes an inode from its cache with an nlink of 0 it
|
||||
* needs to decide if it is the last client using the inode and should
|
||||
* fully delete all its items. It needs to know if other mounts still
|
||||
* have the inode in use.
|
||||
* fully delete all the inode's items. It needs to know if other mounts
|
||||
* still have the inode in use.
|
||||
*
|
||||
* We need a way to communicate between mounts that an inode is open.
|
||||
* We need a way to communicate between mounts that an inode is in use.
|
||||
* We don't want to pay the synchronous per-file locking round trip
|
||||
* costs associated with per-inode open locks that you'd typically see
|
||||
* in systems to solve this problem.
|
||||
* in systems to solve this problem. The first prototypes of this
|
||||
* tracked open file handles so this was coined the open map, though it
|
||||
* now tracks cached inodes.
|
||||
*
|
||||
* Instead clients maintain open bitmaps that cover groups of inodes.
|
||||
* As inodes enter the cache their bit is set, and as the inode is
|
||||
* evicted the bit is cleared. As an inode is evicted messages are sent
|
||||
* around the cluster to get the current bitmaps for that inode's group
|
||||
* from all active mounts. If the inode's bit is clear then it can be
|
||||
* deleted.
|
||||
*
|
||||
* We associate the open bitmaps with our cluster locking of inode
|
||||
* groups to cache these open bitmaps. As long as we have the lock then
|
||||
* nlink can't be changed on any remote mounts. Specifically, it can't
|
||||
* increase from 0 so no clear bits can gain references on remote
|
||||
* mounts. As long as we have the lock, all clear bits in the group for
|
||||
* inodes with 0 nlink can be deleted.
|
||||
* Clients maintain bitmaps that cover groups of inodes. As inodes
|
||||
* enter the cache their bit is set and as the inode is evicted the bit
|
||||
* is cleared. As deletion is attempted, either by scanning orphans or
|
||||
* evicting an inode with an nlink of 0, messages are sent around the
|
||||
* cluster to get the current bitmaps for that inode's group from all
|
||||
* active mounts. If the inode's bit is clear then it can be deleted.
|
||||
*
|
||||
* This layer maintains a list of client rids to send messages to. The
|
||||
* server calls us as clients enter and leave the cluster. We can't
|
||||
@@ -85,14 +80,12 @@ struct omap_info {
|
||||
struct omap_info *name = SCOUTFS_SB(sb)->omap_info
|
||||
|
||||
/*
|
||||
* The presence of an inode in the inode cache increases the count of
|
||||
* its inode number's position within its lock group. These structs
|
||||
* track the counts for all the inodes in a lock group and maintain a
|
||||
* bitmap whose bits are set for each non-zero count.
|
||||
* The presence of an inode in the inode cache sets its bit in the lock
|
||||
* group's bitmap.
|
||||
*
|
||||
* We don't want to add additional global synchronization of inode cache
|
||||
* maintenance so these are tracked in an rcu hash table. Once their
|
||||
* total count reaches zero they're removed from the hash and queued for
|
||||
* total reaches zero they're removed from the hash and queued for
|
||||
* freeing and readers should ignore them.
|
||||
*/
|
||||
struct omap_group {
|
||||
@@ -102,7 +95,6 @@ struct omap_group {
|
||||
u64 nr;
|
||||
spinlock_t lock;
|
||||
unsigned int total;
|
||||
unsigned int *counts;
|
||||
__le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S];
|
||||
};
|
||||
|
||||
@@ -111,8 +103,7 @@ do { \
|
||||
__typeof__(group) _grp = (group); \
|
||||
__typeof__(bit_nr) _nr = (bit_nr); \
|
||||
\
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr, \
|
||||
_nr < 0 ? -1 : _grp->counts[_nr]); \
|
||||
trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
@@ -134,18 +125,6 @@ struct omap_request {
|
||||
struct scoutfs_open_ino_map map;
|
||||
};
|
||||
|
||||
/*
|
||||
* In each inode group cluster lock we store data to track the open ino
|
||||
* map which tracks all the inodes that the cluster lock covers. When
|
||||
* the seq shows that the map is stale we send a request to update it.
|
||||
*/
|
||||
struct scoutfs_omap_lock_data {
|
||||
u64 seq;
|
||||
bool req_in_flight;
|
||||
wait_queue_head_t waitq;
|
||||
struct scoutfs_open_ino_map map;
|
||||
};
|
||||
|
||||
static inline void init_rid_list(struct omap_rid_list *list)
|
||||
{
|
||||
INIT_LIST_HEAD(&list->head);
|
||||
@@ -178,6 +157,15 @@ static int free_rid(struct omap_rid_list *list, struct omap_rid_entry *entry)
|
||||
return nr;
|
||||
}
|
||||
|
||||
static void free_rid_list(struct omap_rid_list *list)
|
||||
{
|
||||
struct omap_rid_entry *entry;
|
||||
struct omap_rid_entry *tmp;
|
||||
|
||||
list_for_each_entry_safe(entry, tmp, &list->head, head)
|
||||
free_rid(list, entry);
|
||||
}
|
||||
|
||||
static int copy_rids(struct omap_rid_list *to, struct omap_rid_list *from, spinlock_t *from_lock)
|
||||
{
|
||||
struct omap_rid_entry *entry;
|
||||
@@ -232,7 +220,7 @@ static void free_rids(struct omap_rid_list *list)
|
||||
}
|
||||
}
|
||||
|
||||
static void calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr)
|
||||
{
|
||||
*group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
|
||||
*bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
|
||||
@@ -242,21 +230,13 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
{
|
||||
struct omap_group *group;
|
||||
|
||||
BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE);
|
||||
|
||||
group = kzalloc(sizeof(struct omap_group), GFP_NOFS);
|
||||
if (group) {
|
||||
group->sb = sb;
|
||||
group->nr = group_nr;
|
||||
spin_lock_init(&group->lock);
|
||||
|
||||
group->counts = (void *)get_zeroed_page(GFP_NOFS);
|
||||
if (!group->counts) {
|
||||
kfree(group);
|
||||
group = NULL;
|
||||
} else {
|
||||
trace_group(sb, alloc, group, -1);
|
||||
}
|
||||
trace_group(sb, alloc, group, -1);
|
||||
}
|
||||
|
||||
return group;
|
||||
@@ -265,7 +245,6 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr)
|
||||
static void free_group(struct super_block *sb, struct omap_group *group)
|
||||
{
|
||||
trace_group(sb, free, group, -1);
|
||||
free_page((unsigned long)group->counts);
|
||||
kfree(group);
|
||||
}
|
||||
|
||||
@@ -283,13 +262,16 @@ static const struct rhashtable_params group_ht_params = {
|
||||
};
|
||||
|
||||
/*
|
||||
* Track a cached inode in its group. Our increment can be racing with
|
||||
* a final decrement that removes the group from the hash, sets total to
|
||||
* Track a cached inode in its group. Our set can be racing with a
|
||||
* final clear that removes the group from the hash, sets total to
|
||||
* UINT_MAX, and calls rcu free. We can retry until the dead group is
|
||||
* no longer visible in the hash table and we can insert a new allocated
|
||||
* group.
|
||||
*
|
||||
* The caller must ensure that the bit is clear, -EEXIST will be
|
||||
* returned otherwise.
|
||||
*/
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino)
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
@@ -298,7 +280,7 @@ int scoutfs_omap_inc(struct super_block *sb, u64 ino)
|
||||
bool found;
|
||||
int ret = 0;
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
retry:
|
||||
found = false;
|
||||
@@ -308,10 +290,10 @@ retry:
|
||||
spin_lock(&group->lock);
|
||||
if (group->total < UINT_MAX) {
|
||||
found = true;
|
||||
if (group->counts[bit_nr]++ == 0) {
|
||||
set_bit_le(bit_nr, group->bits);
|
||||
if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits)))
|
||||
ret = -EEXIST;
|
||||
else
|
||||
group->total++;
|
||||
}
|
||||
}
|
||||
trace_group(sb, inc, group, bit_nr);
|
||||
spin_unlock(&group->lock);
|
||||
@@ -342,29 +324,50 @@ retry:
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool scoutfs_omap_test(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
bool ret = false;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
rcu_read_lock();
|
||||
group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
|
||||
if (group) {
|
||||
spin_lock(&group->lock);
|
||||
ret = !!test_bit_le(bit_nr, group->bits);
|
||||
spin_unlock(&group->lock);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrement a previously incremented ino count. Not finding a count
|
||||
* implies imbalanced inc/dec or bugs freeing groups. We only free
|
||||
* groups here as the last dec drops the group's total count to 0.
|
||||
* Clear a previously set ino bit. Trying to clear a bit that's already
|
||||
* clear implies imbalanced set/clear or bugs freeing groups. We only
|
||||
* free groups here as the last clear drops the group's total to 0.
|
||||
*/
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino)
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino)
|
||||
{
|
||||
DECLARE_OMAP_INFO(sb, ominf);
|
||||
struct omap_group *group;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
rcu_read_lock();
|
||||
group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params);
|
||||
if (group) {
|
||||
spin_lock(&group->lock);
|
||||
WARN_ON_ONCE(group->counts[bit_nr] == 0);
|
||||
WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits));
|
||||
WARN_ON_ONCE(group->total == 0);
|
||||
WARN_ON_ONCE(group->total == UINT_MAX);
|
||||
if (--group->counts[bit_nr] == 0) {
|
||||
clear_bit_le(bit_nr, group->bits);
|
||||
if (test_and_clear_bit_le(bit_nr, group->bits)) {
|
||||
if (--group->total == 0) {
|
||||
group->total = UINT_MAX;
|
||||
rhashtable_remove_fast(&ominf->group_ht, &group->ht_head,
|
||||
@@ -664,8 +667,7 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id,
|
||||
|
||||
/*
|
||||
* The client is receiving a request from the server for its map for the
|
||||
* given group. Look up the group and copy the bits to the map for
|
||||
* non-zero open counts.
|
||||
* given group. Look up the group and copy the bits to the map.
|
||||
*
|
||||
* The mount originating the request for this bitmap has the inode group
|
||||
* write locked. We can't be adding links to any inodes in the group
|
||||
@@ -811,182 +813,13 @@ void scoutfs_omap_server_shutdown(struct super_block *sb)
|
||||
llist_for_each_entry_safe(req, tmp, requests, llnode)
|
||||
kfree(req);
|
||||
|
||||
spin_lock(&ominf->lock);
|
||||
free_rid_list(&ominf->rids);
|
||||
spin_unlock(&ominf->lock);
|
||||
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
bool in_flight;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
in_flight = ldata->req_in_flight;
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
return in_flight;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure the map covered by the cluster lock is current. The caller
|
||||
* holds the cluster lock so once we store lock_data on the cluster lock
|
||||
* it won't be freed and the write_seq in the cluster lock won't change.
|
||||
*
|
||||
* The omap_spinlock protects the omap_data in the cluster lock. We
|
||||
* have to drop it if we have to block to allocate lock_data, send a
|
||||
* request for a new map, or wait for a request in flight to finish.
|
||||
*/
|
||||
static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock,
|
||||
struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr)
|
||||
{
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
bool send_req;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
ldata = lock->omap_data;
|
||||
if (ldata == NULL) {
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS);
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
if (!ldata) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (lock->omap_data == NULL) {
|
||||
ldata->seq = lock->write_seq - 1; /* ensure refresh */
|
||||
init_waitqueue_head(&ldata->waitq);
|
||||
|
||||
lock->omap_data = ldata;
|
||||
} else {
|
||||
kfree(ldata);
|
||||
ldata = lock->omap_data;
|
||||
}
|
||||
}
|
||||
|
||||
while (ldata->seq != lock->write_seq) {
|
||||
/* only one waiter sends a request at a time */
|
||||
if (!ldata->req_in_flight) {
|
||||
ldata->req_in_flight = true;
|
||||
send_req = true;
|
||||
} else {
|
||||
send_req = false;
|
||||
}
|
||||
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
if (send_req)
|
||||
ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map);
|
||||
else
|
||||
wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata));
|
||||
spin_lock(&lock->omap_spinlock);
|
||||
|
||||
/* only sender can return error, other waiters retry */
|
||||
if (send_req) {
|
||||
ldata->req_in_flight = false;
|
||||
if (ret == 0)
|
||||
ldata->seq = lock->write_seq;
|
||||
wake_up(&ldata->waitq);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&lock->omap_spinlock);
|
||||
|
||||
if (ret == 0)
|
||||
*ldata_ret = ldata;
|
||||
else
|
||||
*ldata_ret = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return 1 and give the caller their locks when they should delete the
|
||||
* inode items. It's safe to delete the inode items when it is no
|
||||
* longer reachable and nothing is referencing it.
|
||||
*
|
||||
* The inode is unreachable when nlink hits zero. Cluster locks protect
|
||||
* modification and testing of nlink. We use the ino_lock_cov coverage
|
||||
* to short circuit the common case of having a locked inode that hasn't
|
||||
* been deleted. If it isn't locked, we have to acquire the lock to
|
||||
* refresh the inode to see its current nlink.
|
||||
*
|
||||
* Then we use an open inode bitmap that covers all the inodes in the
|
||||
* lock group to determine if the inode is present in any other mount's
|
||||
* caches. We refresh it by asking the server for all clients' maps and
|
||||
* then store it in the lock. As long as we hold the lock nothing can
|
||||
* increase nlink from zero and let people get a reference to the inode.
|
||||
*/
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
|
||||
{
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_omap_lock_data *ldata;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
/* lock group and omap constants are defined independently */
|
||||
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
|
||||
|
||||
if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (inode->i_nlink > 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
calc_group_nrs(ino, &group_nr, &bit_nr);
|
||||
|
||||
/* only one request to refresh the map at a time */
|
||||
ret = get_current_lock_data(sb, lock, &ldata, group_nr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* can delete caller's zero nlink inode if it's not cached in other mounts */
|
||||
ret = !test_bit_le(bit_nr, ldata->map.bits);
|
||||
out:
|
||||
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);
|
||||
|
||||
if (ret > 0) {
|
||||
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
|
||||
if (err < 0)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
if (ret <= 0) {
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
lock = NULL;
|
||||
}
|
||||
|
||||
*lock_ret = lock;
|
||||
*orph_lock_ret = orph_lock;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata)
|
||||
{
|
||||
if (ldata) {
|
||||
WARN_ON_ONCE(ldata->req_in_flight);
|
||||
WARN_ON_ONCE(waitqueue_active(&ldata->waitq));
|
||||
kfree(ldata);
|
||||
}
|
||||
}
|
||||
|
||||
int scoutfs_omap_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
@@ -1044,6 +877,10 @@ void scoutfs_omap_destroy(struct super_block *sb)
|
||||
rhashtable_walk_stop(&iter);
|
||||
rhashtable_walk_exit(&iter);
|
||||
|
||||
spin_lock(&ominf->lock);
|
||||
free_rid_list(&ominf->rids);
|
||||
spin_unlock(&ominf->lock);
|
||||
|
||||
rhashtable_destroy(&ominf->group_ht);
|
||||
rhashtable_destroy(&ominf->req_ht);
|
||||
kfree(ominf);
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
#ifndef _SCOUTFS_OMAP_H_
|
||||
#define _SCOUTFS_OMAP_H_
|
||||
|
||||
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
|
||||
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
|
||||
int scoutfs_omap_set(struct super_block *sb, u64 ino);
|
||||
bool scoutfs_omap_test(struct super_block *sb, u64 ino);
|
||||
void scoutfs_omap_clear(struct super_block *sb, u64 ino);
|
||||
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
|
||||
struct scoutfs_open_ino_map_args *args);
|
||||
void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr);
|
||||
|
||||
int scoutfs_omap_add_rid(struct super_block *sb, u64 rid);
|
||||
int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid);
|
||||
|
||||
@@ -26,22 +26,39 @@
|
||||
#include "msg.h"
|
||||
#include "options.h"
|
||||
#include "super.h"
|
||||
#include "inode.h"
|
||||
#include "alloc.h"
|
||||
|
||||
enum {
|
||||
Opt_acl,
|
||||
Opt_data_prealloc_blocks,
|
||||
Opt_data_prealloc_contig_only,
|
||||
Opt_metadev_path,
|
||||
Opt_noacl,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
static const match_table_t tokens = {
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_acl, "acl"},
|
||||
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
|
||||
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_noacl, "noacl"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
|
||||
struct options_sb_info {
|
||||
struct dentry *debugfs_dir;
|
||||
struct options_info {
|
||||
seqlock_t seqlock;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_sysfs_attrs sysfs_attrs;
|
||||
};
|
||||
|
||||
u32 scoutfs_option_u32(struct super_block *sb, int token)
|
||||
{
|
||||
WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
#define DECLARE_OPTIONS_INFO(sb, name) \
|
||||
struct options_info *name = SCOUTFS_SB(sb)->options_info
|
||||
|
||||
static int parse_bdev_path(struct super_block *sb, substring_t *substr,
|
||||
char **bdev_path_ret)
|
||||
@@ -89,58 +106,133 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed)
|
||||
static void free_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
kfree(opts->metadev_path);
|
||||
}
|
||||
|
||||
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
|
||||
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
#define MIN_DATA_PREALLOC_BLOCKS 1ULL
|
||||
#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
|
||||
|
||||
static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
memset(opts, 0, sizeof(*opts));
|
||||
|
||||
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
|
||||
opts->data_prealloc_contig_only = 1;
|
||||
opts->quorum_slot_nr = -1;
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse the option string into our options struct. This can allocate
|
||||
* memory in the struct. The caller is responsible for always calling
|
||||
* free_options() when the struct is destroyed, including when we return
|
||||
* an error.
|
||||
*/
|
||||
static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts)
|
||||
{
|
||||
substring_t args[MAX_OPT_ARGS];
|
||||
u64 nr64;
|
||||
int nr;
|
||||
int token;
|
||||
char *p;
|
||||
int ret;
|
||||
|
||||
/* Set defaults */
|
||||
memset(parsed, 0, sizeof(*parsed));
|
||||
parsed->quorum_slot_nr = -1;
|
||||
|
||||
while ((p = strsep(&options, ",")) != NULL) {
|
||||
if (!*p)
|
||||
continue;
|
||||
|
||||
token = match_token(p, tokens, args);
|
||||
switch (token) {
|
||||
case Opt_quorum_slot_nr:
|
||||
|
||||
if (parsed->quorum_slot_nr != -1) {
|
||||
case Opt_acl:
|
||||
sb->s_flags |= MS_POSIXACL;
|
||||
break;
|
||||
|
||||
case Opt_data_prealloc_blocks:
|
||||
ret = match_u64(args, &nr64);
|
||||
if (ret < 0 ||
|
||||
nr64 < MIN_DATA_PREALLOC_BLOCKS || nr64 > MAX_DATA_PREALLOC_BLOCKS) {
|
||||
scoutfs_err(sb, "invalid data_prealloc_blocks option, must be between %llu and %llu",
|
||||
MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->data_prealloc_blocks = nr64;
|
||||
break;
|
||||
|
||||
case Opt_data_prealloc_contig_only:
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 || nr < 0 || nr > 1) {
|
||||
scoutfs_err(sb, "invalid data_prealloc_contig_only option, bool must only be 0 or 1");
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->data_prealloc_contig_only = nr;
|
||||
break;
|
||||
|
||||
case Opt_metadev_path:
|
||||
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case Opt_noacl:
|
||||
sb->s_flags &= ~MS_POSIXACL;
|
||||
break;
|
||||
|
||||
case Opt_orphan_scan_delay_ms:
|
||||
if (opts->orphan_scan_delay_ms != -1) {
|
||||
scoutfs_err(sb, "multiple orphan_scan_delay_ms options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 ||
|
||||
nr < MIN_ORPHAN_SCAN_DELAY_MS || nr > MAX_ORPHAN_SCAN_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid orphan_scan_delay_ms option, must be between %lu and %lu",
|
||||
MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
opts->orphan_scan_delay_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_quorum_slot_nr:
|
||||
if (opts->quorum_slot_nr != -1) {
|
||||
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = match_int(args, &nr);
|
||||
if (ret < 0 || nr < 0 ||
|
||||
nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
if (ret < 0 || nr < 0 || nr >= SCOUTFS_QUORUM_MAX_SLOTS) {
|
||||
scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u",
|
||||
SCOUTFS_QUORUM_MAX_SLOTS - 1);
|
||||
if (ret == 0)
|
||||
ret = -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
parsed->quorum_slot_nr = nr;
|
||||
opts->quorum_slot_nr = nr;
|
||||
break;
|
||||
case Opt_metadev_path:
|
||||
|
||||
ret = parse_bdev_path(sb, &args[0],
|
||||
&parsed->metadev_path);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
break;
|
||||
default:
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
|
||||
p);
|
||||
break;
|
||||
scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
if (!parsed->metadev_path) {
|
||||
if (opts->orphan_scan_delay_ms == -1)
|
||||
opts->orphan_scan_delay_ms = DEFAULT_ORPHAN_SCAN_DELAY_MS;
|
||||
|
||||
if (!opts->metadev_path) {
|
||||
scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -148,40 +240,267 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
unsigned int seq;
|
||||
|
||||
if (WARN_ON_ONCE(optinf == NULL)) {
|
||||
/* trying to use options before early setup or after destroy */
|
||||
init_default_options(opts);
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&optinf->seqlock);
|
||||
memcpy(opts, &optinf->opts, sizeof(struct scoutfs_mount_options));
|
||||
} while (read_seqretry(&optinf->seqlock, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Early setup that parses and stores the options so that the rest of
|
||||
* setup can use them. Full options setup that relies on other
|
||||
* components will be done later.
|
||||
*/
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct options_sb_info *osi;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct options_info *optinf;
|
||||
int ret;
|
||||
|
||||
osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL);
|
||||
if (!osi)
|
||||
return -ENOMEM;
|
||||
init_default_options(&opts);
|
||||
|
||||
sbi->options = osi;
|
||||
ret = parse_options(sb, options, &opts);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root);
|
||||
if (!osi->debugfs_dir) {
|
||||
optinf = kzalloc(sizeof(struct options_info), GFP_KERNEL);
|
||||
if (!optinf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
seqlock_init(&optinf->seqlock);
|
||||
scoutfs_sysfs_init_attrs(sb, &optinf->sysfs_attrs);
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts = opts;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
sbi->options_info = optinf;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret)
|
||||
if (ret < 0)
|
||||
free_options(&opts);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct scoutfs_mount_options opts;
|
||||
const bool is_acl = !!(sb->s_flags & MS_POSIXACL);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
if (is_acl)
|
||||
seq_puts(seq, ",acl");
|
||||
seq_printf(seq, ",data_prealloc_blocks=%llu", opts.data_prealloc_blocks);
|
||||
seq_printf(seq, ",data_prealloc_contig_only=%u", opts.data_prealloc_contig_only);
|
||||
seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
|
||||
if (!is_acl)
|
||||
seq_puts(seq, ",noacl");
|
||||
seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms);
|
||||
if (opts.quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t data_prealloc_blocks_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%llu", opts.data_prealloc_blocks);
|
||||
}
|
||||
static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[30]; /* more than enough for octal -U64_MAX */
|
||||
u64 val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtoll(nullterm, 0, &val);
|
||||
if (ret < 0 || val < MIN_DATA_PREALLOC_BLOCKS || val > MAX_DATA_PREALLOC_BLOCKS) {
|
||||
scoutfs_err(sb, "invalid data_prealloc_blocks option, must be between %llu and %llu",
|
||||
MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.data_prealloc_blocks = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
return count;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(data_prealloc_blocks);
|
||||
|
||||
static ssize_t data_prealloc_contig_only_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.data_prealloc_contig_only);
|
||||
}
|
||||
static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[20]; /* more than enough for octal -U32_MAX */
|
||||
long val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtol(nullterm, 0, &val);
|
||||
if (ret < 0 || val < 0 || val > 1) {
|
||||
scoutfs_err(sb, "invalid data_prealloc_contig_only option, bool must be 0 or 1");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.data_prealloc_contig_only = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
return count;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts.metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t orphan_scan_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.orphan_scan_delay_ms);
|
||||
}
|
||||
static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[20]; /* more than enough for octal -U32_MAX */
|
||||
long val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtol(nullterm, 0, &val);
|
||||
if (ret < 0 || val < MIN_ORPHAN_SCAN_DELAY_MS || val > MAX_ORPHAN_SCAN_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid orphan_scan_delay_ms value written to options sysfs file, must be between %lu and %lu",
|
||||
MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.orphan_scan_delay_ms = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
|
||||
scoutfs_inode_schedule_orphan_dwork(sb);
|
||||
|
||||
return count;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
|
||||
|
||||
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts.quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
int scoutfs_options_setup(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &optinf->sysfs_attrs, options_attrs, "mount_options");
|
||||
if (ret < 0)
|
||||
scoutfs_options_destroy(sb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We remove the sysfs files early in unmount so that they can't try to call other subsystems
|
||||
* as they're being destroyed.
|
||||
*/
|
||||
void scoutfs_options_stop(struct super_block *sb)
|
||||
{
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
|
||||
if (optinf)
|
||||
scoutfs_sysfs_destroy_attrs(sb, &optinf->sysfs_attrs);
|
||||
}
|
||||
|
||||
void scoutfs_options_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct options_sb_info *osi = sbi->options;
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
|
||||
if (osi) {
|
||||
if (osi->debugfs_dir)
|
||||
debugfs_remove_recursive(osi->debugfs_dir);
|
||||
kfree(osi);
|
||||
sbi->options = NULL;
|
||||
scoutfs_options_stop(sb);
|
||||
|
||||
if (optinf) {
|
||||
free_options(&optinf->opts);
|
||||
kfree(optinf);
|
||||
sbi->options_info = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,23 +5,21 @@
|
||||
#include <linux/in.h>
|
||||
#include "format.h"
|
||||
|
||||
enum scoutfs_mount_options {
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_metadev_path,
|
||||
Opt_err,
|
||||
};
|
||||
|
||||
struct mount_options {
|
||||
int quorum_slot_nr;
|
||||
struct scoutfs_mount_options {
|
||||
u64 data_prealloc_blocks;
|
||||
bool data_prealloc_contig_only;
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
};
|
||||
|
||||
int scoutfs_parse_options(struct super_block *sb, char *options,
|
||||
struct mount_options *parsed);
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
int scoutfs_options_show(struct seq_file *seq, struct dentry *root);
|
||||
|
||||
int scoutfs_options_early_setup(struct super_block *sb, char *options);
|
||||
int scoutfs_options_setup(struct super_block *sb);
|
||||
void scoutfs_options_stop(struct super_block *sb);
|
||||
void scoutfs_options_destroy(struct super_block *sb);
|
||||
|
||||
u32 scoutfs_option_u32(struct super_block *sb, int token);
|
||||
#define scoutfs_option_bool scoutfs_option_u32
|
||||
|
||||
#endif /* _SCOUTFS_OPTIONS_H_ */
|
||||
|
||||
@@ -105,6 +105,8 @@ enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
|
||||
struct quorum_status {
|
||||
enum quorum_role role;
|
||||
u64 term;
|
||||
u64 server_start_term;
|
||||
int server_event;
|
||||
int vote_for;
|
||||
unsigned long vote_bits;
|
||||
ktime_t timeout;
|
||||
@@ -116,7 +118,7 @@ struct quorum_info {
|
||||
struct socket *sock;
|
||||
bool shutdown;
|
||||
|
||||
unsigned long flags;
|
||||
int our_quorum_slot_nr;
|
||||
int votes_needed;
|
||||
|
||||
spinlock_t show_lock;
|
||||
@@ -127,8 +129,6 @@ struct quorum_info {
|
||||
struct scoutfs_sysfs_attrs ssa;
|
||||
};
|
||||
|
||||
#define QINF_FLAG_SERVER 0
|
||||
|
||||
#define DECLARE_QUORUM_INFO(sb, name) \
|
||||
struct quorum_info *name = SCOUTFS_SB(sb)->quorum_info
|
||||
#define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
|
||||
@@ -160,9 +160,7 @@ static ktime_t heartbeat_timeout(void)
|
||||
static int create_socket(struct super_block *sb)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct socket *sock = NULL;
|
||||
struct sockaddr_in sin;
|
||||
int addrlen;
|
||||
@@ -176,7 +174,7 @@ static int create_socket(struct super_block *sb)
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin);
|
||||
scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);
|
||||
|
||||
addrlen = sizeof(sin);
|
||||
ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
|
||||
@@ -207,7 +205,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
int only)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
ktime_t now;
|
||||
int i;
|
||||
@@ -216,7 +213,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
.fsid = super->hdr.fsid,
|
||||
.term = cpu_to_le64(term),
|
||||
.type = type,
|
||||
.from = opts->quorum_slot_nr,
|
||||
.from = qinf->our_quorum_slot_nr,
|
||||
};
|
||||
struct kvec kv = {
|
||||
.iov_base = &qmes,
|
||||
@@ -238,7 +235,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i) ||
|
||||
(only >= 0 && i != only) || i == opts->quorum_slot_nr)
|
||||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
@@ -476,8 +473,8 @@ static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_
|
||||
*/
|
||||
static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
|
||||
{
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr;
|
||||
struct scoutfs_quorum_block blk;
|
||||
int ret;
|
||||
|
||||
@@ -496,16 +493,6 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has fenced previous leaders and reclaimed their
|
||||
* resources. We can now update our fence event with a greater term to
|
||||
* stop future leaders from doing the same.
|
||||
*/
|
||||
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
|
||||
{
|
||||
return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has been elected and has started running but can't
|
||||
* yet assume that it has exclusive access to the metadata device. We
|
||||
@@ -595,15 +582,9 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
|
||||
}
|
||||
|
||||
out:
|
||||
if (fence_started) {
|
||||
err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
} else {
|
||||
err = scoutfs_quorum_fence_complete(sb, term);
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
|
||||
if (ret < 0)
|
||||
scoutfs_inc_counter(sb, quorum_fence_error);
|
||||
@@ -611,21 +592,34 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The main quorum task maintains its private status. It seemed cleaner
|
||||
* to occasionally copy the status for showing in sysfs/debugfs files
|
||||
* than to have the two lock access to shared status. The show copy is
|
||||
* updated after being modified before the quorum task sleeps for a
|
||||
* significant amount of time, either waiting on timeouts or interacting
|
||||
* with the server.
|
||||
*/
|
||||
static void update_show_status(struct quorum_info *qinf, struct quorum_status *qst)
|
||||
{
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->show_status = *qst;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The quorum work always runs in the background of quorum member
|
||||
* mounts. It's responsible for starting and stopping the server if
|
||||
* it's elected leader, and the server can call back into it to let it
|
||||
* know that it has shut itself down (perhaps due to error) so that the
|
||||
* work should stop sending heartbeats.
|
||||
* it's elected leader. While it's leader it sends heartbeats to
|
||||
* suppress other quorum work from standing for election.
|
||||
*/
|
||||
static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
{
|
||||
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
|
||||
struct super_block *sb = qinf->sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
struct quorum_status qst;
|
||||
struct quorum_status qst = {0,};
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
@@ -634,9 +628,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
/* start out as a follower */
|
||||
qst.role = FOLLOWER;
|
||||
qst.term = 0;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
|
||||
/* read our starting term from greatest in all events in all slots */
|
||||
read_greatest_term(sb, &qst.term);
|
||||
@@ -654,6 +646,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
ret = recv_msg(sb, &msg, qst.timeout);
|
||||
if (ret < 0) {
|
||||
if (ret != -ETIMEDOUT && ret != -EAGAIN) {
|
||||
@@ -670,24 +664,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
msg.term < qst.term)
|
||||
msg.type = SCOUTFS_QUORUM_MSG_INVALID;
|
||||
|
||||
/* if the server has shutdown we become follower */
|
||||
if (!test_bit(QINF_FLAG_SERVER, &qinf->flags) &&
|
||||
qst.role == LEADER) {
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_server_shutdown);
|
||||
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.term);
|
||||
scoutfs_inc_counter(sb, quorum_send_resignation);
|
||||
}
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->show_status = qst;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
|
||||
trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
|
||||
qst.vote_bits,
|
||||
ktime_to_timespec64(qst.timeout));
|
||||
@@ -698,7 +674,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
if (qst.role == LEADER) {
|
||||
scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
|
||||
msg.type, msg.from, msg.term, qst.term);
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
qst.role = FOLLOWER;
|
||||
qst.term = msg.term;
|
||||
@@ -720,11 +695,18 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
/* followers and candidates start new election on timeout */
|
||||
if (qst.role != LEADER &&
|
||||
ktime_after(ktime_get(), qst.timeout)) {
|
||||
/* .. but only if their server has stopped */
|
||||
if (!scoutfs_server_is_down(sb)) {
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_candidate_server_stopping);
|
||||
continue;
|
||||
}
|
||||
|
||||
qst.role = CANDIDATE;
|
||||
qst.term++;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
set_bit(opts->quorum_slot_nr, &qst.vote_bits);
|
||||
set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE,
|
||||
qst.term);
|
||||
qst.timeout = election_timeout();
|
||||
@@ -761,29 +743,69 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
qst.term);
|
||||
qst.timeout = heartbeat_interval();
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
/* record that we've been elected before starting up server */
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* make very sure server is fully shut down */
|
||||
scoutfs_server_stop(sb);
|
||||
/* set server bit before server shutdown could clear */
|
||||
set_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
qst.server_start_term = qst.term;
|
||||
qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
|
||||
scoutfs_server_start(sb, qst.term);
|
||||
}
|
||||
|
||||
ret = scoutfs_server_start(sb, qst.term);
|
||||
if (ret < 0) {
|
||||
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
/* store our increased term */
|
||||
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
|
||||
true);
|
||||
if (err < 0) {
|
||||
ret = err;
|
||||
goto out;
|
||||
}
|
||||
ret = 0;
|
||||
continue;
|
||||
/*
|
||||
* This leader's server is up, having finished fencing
|
||||
* previous leaders. We update the fence event with the
|
||||
* current term to let future leaders know that previous
|
||||
* servers have been fenced.
|
||||
*/
|
||||
if (qst.role == LEADER && qst.server_event != SCOUTFS_QUORUM_EVENT_FENCE &&
|
||||
scoutfs_server_is_up(sb)) {
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, qst.term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
qst.server_event = SCOUTFS_QUORUM_EVENT_FENCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Stop a running server if we're no longer leader in
|
||||
* its term.
|
||||
*/
|
||||
if (!(qst.role == LEADER && qst.term == qst.server_start_term) &&
|
||||
scoutfs_server_is_running(sb)) {
|
||||
scoutfs_server_stop(sb);
|
||||
}
|
||||
|
||||
/*
|
||||
* A previously running server has stopped. The quorum
|
||||
* protocol might have shut it down by changing roles or
|
||||
* it might have stopped on its own, perhaps on errors.
|
||||
* If we're still a leader then we become a follower and
|
||||
* send resignations to encourage the next election.
|
||||
* Always update the _STOP event to stop connections and
|
||||
* fencing.
|
||||
*/
|
||||
if (qst.server_start_term > 0 && scoutfs_server_is_down(sb)) {
|
||||
if (qst.role == LEADER) {
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
qst.vote_bits = 0;
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_server_shutdown);
|
||||
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.server_start_term);
|
||||
scoutfs_inc_counter(sb, quorum_send_resignation);
|
||||
}
|
||||
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
|
||||
qst.server_start_term, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
qst.server_start_term = 0;
|
||||
}
|
||||
|
||||
/* leaders regularly send heartbeats to delay elections */
|
||||
@@ -820,12 +842,19 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
|
||||
/* always try to stop a running server as we stop */
|
||||
if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
|
||||
scoutfs_server_stop(sb);
|
||||
scoutfs_fence_stop(sb);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.term);
|
||||
if (scoutfs_server_is_running(sb)) {
|
||||
scoutfs_server_stop_wait(sb);
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION, qst.term);
|
||||
|
||||
if (qst.server_start_term > 0) {
|
||||
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
|
||||
qst.server_start_term, true);
|
||||
if (err < 0 && ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
}
|
||||
|
||||
/* record that this slot no longer has an active quorum */
|
||||
@@ -837,21 +866,6 @@ out:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling server has shutdown and is no longer using shared
|
||||
* resources. Clear the bit so that we stop sending heartbeats and
|
||||
* allow the next server to be elected. Update the stop event so that
|
||||
* it won't be considered available by clients or fenced by the next
|
||||
* leader.
|
||||
*/
|
||||
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
|
||||
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
|
||||
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clients read quorum blocks looking for the leader with a server whose
|
||||
* address it can try and connect to.
|
||||
@@ -954,7 +968,6 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
|
||||
struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts;
|
||||
struct quorum_status qst;
|
||||
struct last_msg last;
|
||||
struct timespec64 ts;
|
||||
@@ -971,9 +984,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
ret = 0;
|
||||
|
||||
snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n",
|
||||
opts->quorum_slot_nr);
|
||||
qinf->our_quorum_slot_nr);
|
||||
snprintf_ret(buf, size, &ret, "term %llu\n",
|
||||
qst.term);
|
||||
snprintf_ret(buf, size, &ret, "server_start_term %llu\n", qst.server_start_term);
|
||||
snprintf_ret(buf, size, &ret, "server_event %d\n", qst.server_event);
|
||||
snprintf_ret(buf, size, &ret, "role %d (%s)\n",
|
||||
qst.role, role_str(qst.role));
|
||||
snprintf_ret(buf, size, &ret, "vote_for %d\n",
|
||||
@@ -1048,7 +1063,6 @@ static inline bool valid_ipv4_port(__be16 port)
|
||||
static int verify_quorum_slots(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct sockaddr_in other;
|
||||
@@ -1099,7 +1113,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!quorum_slot_present(super, opts->quorum_slot_nr)) {
|
||||
if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
|
||||
char *str = slots;
|
||||
*str = '\0';
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
@@ -1114,7 +1128,7 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
}
|
||||
}
|
||||
scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s",
|
||||
opts->quorum_slot_nr, slots);
|
||||
qinf->our_quorum_slot_nr, slots);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -1137,11 +1151,12 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
int scoutfs_quorum_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct mount_options *opts = &sbi->opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct quorum_info *qinf;
|
||||
int ret;
|
||||
|
||||
if (opts->quorum_slot_nr < 0)
|
||||
scoutfs_options_read(sb, &opts);
|
||||
if (opts.quorum_slot_nr < 0)
|
||||
return 0;
|
||||
|
||||
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
|
||||
@@ -1153,6 +1168,8 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
spin_lock_init(&qinf->show_lock);
|
||||
INIT_WORK(&qinf->work, scoutfs_quorum_worker);
|
||||
scoutfs_sysfs_init_attrs(sb, &qinf->ssa);
|
||||
/* static for the lifetime of the mount */
|
||||
qinf->our_quorum_slot_nr = opts.quorum_slot_nr;
|
||||
|
||||
sbi->quorum_info = qinf;
|
||||
qinf->sb = sb;
|
||||
|
||||
@@ -2,14 +2,12 @@
|
||||
#define _SCOUTFS_QUORUM_H_
|
||||
|
||||
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
|
||||
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);
|
||||
|
||||
u8 scoutfs_quorum_votes_needed(struct super_block *sb);
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
|
||||
struct sockaddr_in *sin);
|
||||
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
|
||||
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);
|
||||
|
||||
int scoutfs_quorum_setup(struct super_block *sb);
|
||||
void scoutfs_quorum_shutdown(struct super_block *sb);
|
||||
|
||||
@@ -691,16 +691,16 @@ TRACE_EVENT(scoutfs_evict_inode,
|
||||
|
||||
TRACE_EVENT(scoutfs_drop_inode,
|
||||
TP_PROTO(struct super_block *sb, __u64 ino, unsigned int nlink,
|
||||
unsigned int unhashed, bool drop_invalidated),
|
||||
unsigned int unhashed, bool lock_covered),
|
||||
|
||||
TP_ARGS(sb, ino, nlink, unhashed, drop_invalidated),
|
||||
TP_ARGS(sb, ino, nlink, unhashed, lock_covered),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, ino)
|
||||
__field(unsigned int, nlink)
|
||||
__field(unsigned int, unhashed)
|
||||
__field(unsigned int, drop_invalidated)
|
||||
__field(unsigned int, lock_covered)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -708,12 +708,12 @@ TRACE_EVENT(scoutfs_drop_inode,
|
||||
__entry->ino = ino;
|
||||
__entry->nlink = nlink;
|
||||
__entry->unhashed = unhashed;
|
||||
__entry->drop_invalidated = !!drop_invalidated;
|
||||
__entry->lock_covered = !!lock_covered;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" ino %llu nlink %u unhashed %d drop_invalidated %u", SCSB_TRACE_ARGS,
|
||||
TP_printk(SCSBF" ino %llu nlink %u unhashed %d lock_covered %u", SCSB_TRACE_ARGS,
|
||||
__entry->ino, __entry->nlink, __entry->unhashed,
|
||||
__entry->drop_invalidated)
|
||||
__entry->lock_covered)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_inode_walk_writeback,
|
||||
@@ -1417,42 +1417,71 @@ TRACE_EVENT(scoutfs_rename,
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_d_revalidate,
|
||||
TP_PROTO(struct super_block *sb,
|
||||
struct dentry *dentry, int flags, struct dentry *parent,
|
||||
bool is_covered, int ret),
|
||||
TP_PROTO(struct super_block *sb, struct dentry *dentry, int flags, u64 dir_ino, int ret),
|
||||
|
||||
TP_ARGS(sb, dentry, flags, parent, is_covered, ret),
|
||||
TP_ARGS(sb, dentry, flags, dir_ino, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(void *, dentry)
|
||||
__string(name, dentry->d_name.name)
|
||||
__field(__u64, ino)
|
||||
__field(__u64, parent_ino)
|
||||
__field(__u64, dir_ino)
|
||||
__field(int, flags)
|
||||
__field(int, is_root)
|
||||
__field(int, is_covered)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->dentry = dentry;
|
||||
__assign_str(name, dentry->d_name.name)
|
||||
__entry->ino = dentry->d_inode ?
|
||||
scoutfs_ino(dentry->d_inode) : 0;
|
||||
__entry->parent_ino = parent->d_inode ?
|
||||
scoutfs_ino(parent->d_inode) : 0;
|
||||
__entry->ino = dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0;
|
||||
__entry->dir_ino = dir_ino;
|
||||
__entry->flags = flags;
|
||||
__entry->is_root = IS_ROOT(dentry);
|
||||
__entry->is_covered = is_covered;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" name %s ino %llu parent_ino %llu flags 0x%x s_root %u is_covered %u ret %d",
|
||||
SCSB_TRACE_ARGS, __get_str(name), __entry->ino,
|
||||
__entry->parent_ino, __entry->flags,
|
||||
__entry->is_root,
|
||||
__entry->is_covered,
|
||||
__entry->ret)
|
||||
TP_printk(SCSBF" dentry %p name %s ino %llu dir_ino %llu flags 0x%x s_root %u ret %d",
|
||||
SCSB_TRACE_ARGS, __entry->dentry, __get_str(name), __entry->ino, __entry->dir_ino,
|
||||
__entry->flags, __entry->is_root, __entry->ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_validate_dentry,
|
||||
TP_PROTO(struct super_block *sb, struct dentry *dentry, u64 dir_ino, u64 dentry_ino,
|
||||
u64 dent_ino, u64 refresh_gen, int ret),
|
||||
|
||||
TP_ARGS(sb, dentry, dir_ino, dentry_ino, dent_ino, refresh_gen, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(void *, dentry)
|
||||
__field(__u64, dir_ino)
|
||||
__string(name, dentry->d_name.name)
|
||||
__field(__u64, dentry_ino)
|
||||
__field(__u64, dent_ino)
|
||||
__field(__u64, fsdata_gen)
|
||||
__field(__u64, refresh_gen)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->dentry = dentry;
|
||||
__entry->dir_ino = dir_ino;
|
||||
__assign_str(name, dentry->d_name.name)
|
||||
__entry->dentry_ino = dentry_ino;
|
||||
__entry->dent_ino = dent_ino;
|
||||
__entry->fsdata_gen = (unsigned long long)dentry->d_fsdata;
|
||||
__entry->refresh_gen = refresh_gen;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" dentry %p dir %llu name %s dentry_ino %llu dent_ino %llu fsdata_gen %llu refresh_gen %llu ret %d",
|
||||
SCSB_TRACE_ARGS, __entry->dentry, __entry->dir_ino, __get_str(name),
|
||||
__entry->dentry_ino, __entry->dent_ino, __entry->fsdata_gen,
|
||||
__entry->refresh_gen, __entry->ret)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_super_lifecycle_class,
|
||||
@@ -1843,6 +1872,53 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
|
||||
TP_ARGS(sb, rid, nr_clients)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, holding)
|
||||
__field(int, applying)
|
||||
__field(int, nr_holders)
|
||||
__field(__u32, avail_before)
|
||||
__field(__u32, freed_before)
|
||||
__field(int, exceeded)
|
||||
),
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->holding = !!holding;
|
||||
__entry->applying = !!applying;
|
||||
__entry->nr_holders = nr_holders;
|
||||
__entry->avail_before = avail_before;
|
||||
__entry->freed_before = freed_before;
|
||||
__entry->exceeded = !!exceeded;
|
||||
),
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u exceeded %u",
|
||||
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
|
||||
__entry->avail_before, __entry->freed_before, __entry->exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
);
|
||||
|
||||
#define slt_symbolic(mode) \
|
||||
__print_symbolic(mode, \
|
||||
{ SLT_CLIENT, "client" }, \
|
||||
@@ -2620,9 +2696,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page,
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
int bit_nr),
|
||||
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
@@ -2630,7 +2706,6 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__field(__u64, group_nr)
|
||||
__field(unsigned int, group_total)
|
||||
__field(int, bit_nr)
|
||||
__field(int, bit_count)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -2639,43 +2714,42 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class,
|
||||
__entry->group_nr = group_nr;
|
||||
__entry->group_total = group_total;
|
||||
__entry->bit_nr = bit_nr;
|
||||
__entry->bit_count = bit_count;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d",
|
||||
TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d",
|
||||
SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total,
|
||||
__entry->bit_nr, __entry->bit_count)
|
||||
__entry->bit_nr)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy,
|
||||
TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total,
|
||||
int bit_nr, int bit_count),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count)
|
||||
int bit_nr),
|
||||
TP_ARGS(sb, grp, group_nr, group_total, bit_nr)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_omap_should_delete,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -64,8 +64,6 @@ int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
|
||||
struct scoutfs_net_lock *nl);
|
||||
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
|
||||
struct scoutfs_key *key);
|
||||
void scoutfs_server_hold_commit(struct super_block *sb);
|
||||
int scoutfs_server_apply_commit(struct super_block *sb, int err);
|
||||
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);
|
||||
|
||||
int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
|
||||
@@ -77,9 +75,12 @@ u64 scoutfs_server_seq(struct super_block *sb);
|
||||
u64 scoutfs_server_next_seq(struct super_block *sb);
|
||||
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
|
||||
|
||||
int scoutfs_server_start(struct super_block *sb, u64 term);
|
||||
void scoutfs_server_abort(struct super_block *sb);
|
||||
void scoutfs_server_start(struct super_block *sb, u64 term);
|
||||
void scoutfs_server_stop(struct super_block *sb);
|
||||
void scoutfs_server_stop_wait(struct super_block *sb);
|
||||
bool scoutfs_server_is_running(struct super_block *sb);
|
||||
bool scoutfs_server_is_up(struct super_block *sb);
|
||||
bool scoutfs_server_is_down(struct super_block *sb);
|
||||
|
||||
int scoutfs_server_setup(struct super_block *sb);
|
||||
void scoutfs_server_destroy(struct super_block *sb);
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include "omap.h"
|
||||
#include "volopt.h"
|
||||
#include "fence.h"
|
||||
#include "xattr.h"
|
||||
#include "scoutfs_trace.h"
|
||||
|
||||
static struct dentry *scoutfs_debugfs_root;
|
||||
@@ -132,44 +133,6 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
if (opts->quorum_slot_nr >= 0)
|
||||
seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr);
|
||||
seq_printf(seq, ",metadev_path=%s", opts->metadev_path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(metadev_path);
|
||||
|
||||
static ssize_t quorum_server_nr_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr);
|
||||
}
|
||||
SCOUTFS_ATTR_RO(quorum_server_nr);
|
||||
|
||||
static struct attribute *mount_options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(quorum_server_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int scoutfs_sync_fs(struct super_block *sb, int wait)
|
||||
{
|
||||
trace_scoutfs_sync_fs(sb, wait);
|
||||
@@ -246,13 +209,11 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
scoutfs_destroy_triggers(sb);
|
||||
scoutfs_fence_destroy(sb);
|
||||
scoutfs_options_destroy(sb);
|
||||
scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
|
||||
debugfs_remove(sbi->debug_root);
|
||||
scoutfs_destroy_counters(sb);
|
||||
scoutfs_destroy_sysfs(sb);
|
||||
scoutfs_metadev_close(sb);
|
||||
|
||||
kfree(sbi->opts.metadev_path);
|
||||
kfree(sbi);
|
||||
|
||||
sb->s_fs_info = NULL;
|
||||
@@ -282,7 +243,7 @@ static const struct super_operations scoutfs_super_ops = {
|
||||
.destroy_inode = scoutfs_destroy_inode,
|
||||
.sync_fs = scoutfs_sync_fs,
|
||||
.statfs = scoutfs_statfs,
|
||||
.show_options = scoutfs_show_options,
|
||||
.show_options = scoutfs_options_show,
|
||||
.put_super = scoutfs_put_super,
|
||||
.umount_begin = scoutfs_umount_begin,
|
||||
};
|
||||
@@ -511,9 +472,9 @@ out:
|
||||
|
||||
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi;
|
||||
struct mount_options opts;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct block_device *meta_bdev;
|
||||
struct scoutfs_sb_info *sbi;
|
||||
struct inode *inode;
|
||||
int ret;
|
||||
|
||||
@@ -522,8 +483,10 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sb->s_magic = SCOUTFS_SUPER_MAGIC;
|
||||
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
||||
sb->s_op = &scoutfs_super_ops;
|
||||
sb->s_d_op = &scoutfs_dentry_ops;
|
||||
sb->s_export_op = &scoutfs_export_ops;
|
||||
sb->s_flags |= MS_I_VERSION;
|
||||
sb->s_xattr = scoutfs_xattr_handlers;
|
||||
sb->s_flags |= MS_I_VERSION | MS_POSIXACL;
|
||||
|
||||
/* btree blocks use long lived bh->b_data refs */
|
||||
mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);
|
||||
@@ -536,18 +499,17 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
|
||||
ret = assign_random_id(sbi);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto out;
|
||||
|
||||
spin_lock_init(&sbi->next_ino_lock);
|
||||
spin_lock_init(&sbi->data_wait_root.lock);
|
||||
sbi->data_wait_root.root = RB_ROOT;
|
||||
scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa);
|
||||
|
||||
ret = scoutfs_parse_options(sb, data, &opts);
|
||||
if (ret)
|
||||
/* parse options early for use during setup */
|
||||
ret = scoutfs_options_early_setup(sb, data);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sbi->opts = opts;
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
|
||||
if (ret != SCOUTFS_BLOCK_SM_SIZE) {
|
||||
@@ -556,9 +518,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
goto out;
|
||||
}
|
||||
|
||||
meta_bdev =
|
||||
blkdev_get_by_path(sbi->opts.metadev_path,
|
||||
SCOUTFS_META_BDEV_MODE, sb);
|
||||
meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb);
|
||||
if (IS_ERR(meta_bdev)) {
|
||||
scoutfs_err(sb, "could not open metadev: error %ld",
|
||||
PTR_ERR(meta_bdev));
|
||||
@@ -578,8 +538,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
scoutfs_setup_sysfs(sb) ?:
|
||||
scoutfs_setup_counters(sb) ?:
|
||||
scoutfs_options_setup(sb) ?:
|
||||
scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
|
||||
mount_options_attrs, "mount_options") ?:
|
||||
scoutfs_setup_triggers(sb) ?:
|
||||
scoutfs_fence_setup(sb) ?:
|
||||
scoutfs_block_setup(sb) ?:
|
||||
@@ -652,6 +610,7 @@ static void scoutfs_kill_sb(struct super_block *sb)
|
||||
}
|
||||
|
||||
if (SCOUTFS_HAS_SBI(sb)) {
|
||||
scoutfs_options_stop(sb);
|
||||
scoutfs_inode_orphan_stop(sb);
|
||||
scoutfs_lock_unmount_begin(sb);
|
||||
}
|
||||
@@ -672,7 +631,6 @@ MODULE_ALIAS_FS("scoutfs");
|
||||
static void teardown_module(void)
|
||||
{
|
||||
debugfs_remove(scoutfs_debugfs_root);
|
||||
scoutfs_dir_exit();
|
||||
scoutfs_inode_exit();
|
||||
scoutfs_sysfs_exit();
|
||||
}
|
||||
@@ -710,7 +668,6 @@ static int __init scoutfs_module_init(void)
|
||||
goto out;
|
||||
}
|
||||
ret = scoutfs_inode_init() ?:
|
||||
scoutfs_dir_init() ?:
|
||||
register_filesystem(&scoutfs_fs_type);
|
||||
out:
|
||||
if (ret)
|
||||
|
||||
@@ -44,6 +44,7 @@ struct scoutfs_sb_info {
|
||||
|
||||
spinlock_t next_ino_lock;
|
||||
|
||||
struct options_info *options_info;
|
||||
struct data_info *data_info;
|
||||
struct inode_sb_info *inode_sb_info;
|
||||
struct btree_info *btree_info;
|
||||
@@ -74,10 +75,6 @@ struct scoutfs_sb_info {
|
||||
struct scoutfs_counters *counters;
|
||||
struct scoutfs_triggers *triggers;
|
||||
|
||||
struct mount_options opts;
|
||||
struct options_sb_info *options;
|
||||
struct scoutfs_sysfs_attrs mopts_ssa;
|
||||
|
||||
struct dentry *debug_root;
|
||||
|
||||
bool forced_unmount;
|
||||
|
||||
@@ -37,6 +37,15 @@ struct attr_funcs {
|
||||
#define ATTR_FUNCS_RO(_name) \
|
||||
static struct attr_funcs _name##_attr_funcs = __ATTR_RO(_name)
|
||||
|
||||
static ssize_t data_device_maj_min_show(struct kobject *kobj, struct attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = KOBJ_TO_SB(kobj, sb_id_kobj);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u:%u\n",
|
||||
MAJOR(sb->s_bdev->bd_dev), MINOR(sb->s_bdev->bd_dev));
|
||||
}
|
||||
ATTR_FUNCS_RO(data_device_maj_min);
|
||||
|
||||
static ssize_t format_version_show(struct kobject *kobj, struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
@@ -101,6 +110,7 @@ static ssize_t attr_funcs_show(struct kobject *kobj, struct attribute *attr,
|
||||
|
||||
|
||||
static struct attribute *sb_id_attrs[] = {
|
||||
&data_device_maj_min_attr_funcs.attr,
|
||||
&format_version_attr_funcs.attr,
|
||||
&fsid_attr_funcs.attr,
|
||||
&rid_attr_funcs.attr,
|
||||
@@ -258,7 +268,7 @@ int __init scoutfs_sysfs_init(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __exit scoutfs_sysfs_exit(void)
|
||||
void scoutfs_sysfs_exit(void)
|
||||
{
|
||||
if (scoutfs_kset)
|
||||
kset_unregister(scoutfs_kset);
|
||||
|
||||
@@ -53,6 +53,6 @@ int scoutfs_setup_sysfs(struct super_block *sb);
|
||||
void scoutfs_destroy_sysfs(struct super_block *sb);
|
||||
|
||||
int __init scoutfs_sysfs_init(void);
|
||||
void __exit scoutfs_sysfs_exit(void);
|
||||
void scoutfs_sysfs_exit(void);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -640,6 +640,7 @@ void scoutfs_shutdown_trans(struct super_block *sb)
|
||||
tri->write_workq = NULL;
|
||||
}
|
||||
|
||||
scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri);
|
||||
scoutfs_block_writer_forget_all(sb, &tri->wri);
|
||||
|
||||
kfree(tri);
|
||||
|
||||
590
kmod/src/xattr.c
590
kmod/src/xattr.c
@@ -15,6 +15,7 @@
|
||||
#include <linux/dcache.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/posix_acl.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "inode.h"
|
||||
@@ -26,6 +27,7 @@
|
||||
#include "xattr.h"
|
||||
#include "lock.h"
|
||||
#include "hash.h"
|
||||
#include "acl.h"
|
||||
#include "scoutfs_trace.h"
|
||||
|
||||
/*
|
||||
@@ -57,12 +59,6 @@ static u32 xattr_names_equal(const char *a_name, unsigned int a_len,
|
||||
return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
|
||||
}
|
||||
|
||||
static unsigned int xattr_full_bytes(struct scoutfs_xattr *xat)
|
||||
{
|
||||
return offsetof(struct scoutfs_xattr,
|
||||
name[xat->name_len + le16_to_cpu(xat->val_len)]);
|
||||
}
|
||||
|
||||
static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
|
||||
{
|
||||
return SCOUTFS_XATTR_NR_PARTS(xat->name_len,
|
||||
@@ -85,16 +81,6 @@ static void init_xattr_key(struct scoutfs_key *key, u64 ino, u32 name_hash,
|
||||
#define SCOUTFS_XATTR_PREFIX "scoutfs."
|
||||
#define SCOUTFS_XATTR_PREFIX_LEN (sizeof(SCOUTFS_XATTR_PREFIX) - 1)
|
||||
|
||||
static int unknown_prefix(const char *name)
|
||||
{
|
||||
return strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
|
||||
strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
|
||||
strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
|
||||
strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)&&
|
||||
strncmp(name, SCOUTFS_XATTR_PREFIX, SCOUTFS_XATTR_PREFIX_LEN);
|
||||
}
|
||||
|
||||
|
||||
#define HIDE_TAG "hide."
|
||||
#define SRCH_TAG "srch."
|
||||
#define TOTL_TAG "totl."
|
||||
@@ -137,12 +123,29 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next xattr and copy the key, xattr header, and as much of
|
||||
* the name and value into the callers buffer as we can. Returns the
|
||||
* number of bytes copied which include the header, name, and value and
|
||||
* can be limited by the xattr length or the callers buffer. The caller
|
||||
* is responsible for comparing their lengths, the header, and the
|
||||
* returned length before safely using the xattr.
|
||||
* xattrs are stored in multiple items. The first item is a
|
||||
* concatenation of an initial header, the name, and then as much of the
|
||||
* value as fits in the remainder of the first item. This returns the
|
||||
* size of the first item that'd store an xattr with the given name
|
||||
* length and value payload size.
|
||||
*/
|
||||
static int first_item_bytes(int name_len, size_t size)
|
||||
{
|
||||
if (WARN_ON_ONCE(name_len <= 0) ||
|
||||
WARN_ON_ONCE(name_len > SCOUTFS_XATTR_MAX_NAME_LEN))
|
||||
return 0;
|
||||
|
||||
return min_t(int, sizeof(struct scoutfs_xattr) + name_len + size,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next xattr, set the caller's key, and copy as much of the
|
||||
* first item into the caller's buffer as we can. Returns the number of
|
||||
* bytes copied which can include the header, name, and start of the
|
||||
* value from the first item. The caller is responsible for comparing
|
||||
* their lengths, the header, and the returned length before safely
|
||||
* using the buffer.
|
||||
*
|
||||
* If a name is provided then we'll iterate over items with a matching
|
||||
* name_hash until we find a matching name. If we don't find a matching
|
||||
@@ -154,20 +157,17 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
|
||||
* Returns -ENOENT if it didn't find a next item.
|
||||
*/
|
||||
static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
struct scoutfs_xattr *xat, unsigned int bytes,
|
||||
struct scoutfs_xattr *xat, unsigned int xat_bytes,
|
||||
const char *name, unsigned int name_len,
|
||||
u64 name_hash, u64 id, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key last;
|
||||
u8 last_part;
|
||||
int total;
|
||||
u8 part;
|
||||
int ret;
|
||||
|
||||
/* need to be able to see the name we're looking for */
|
||||
if (WARN_ON_ONCE(name_len > 0 && bytes < offsetof(struct scoutfs_xattr,
|
||||
name[name_len])))
|
||||
if (WARN_ON_ONCE(name_len > 0 &&
|
||||
xat_bytes < offsetof(struct scoutfs_xattr, name[name_len])))
|
||||
return -EINVAL;
|
||||
|
||||
if (name_len)
|
||||
@@ -176,26 +176,15 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
|
||||
init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
|
||||
|
||||
last_part = 0;
|
||||
part = 0;
|
||||
total = 0;
|
||||
|
||||
for (;;) {
|
||||
key->skx_part = part;
|
||||
ret = scoutfs_item_next(sb, key, &last,
|
||||
(void *)xat + total, bytes - total,
|
||||
lock);
|
||||
if (ret < 0) {
|
||||
/* XXX corruption, ran out of parts */
|
||||
if (ret == -ENOENT && part > 0)
|
||||
ret = -EIO;
|
||||
ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
trace_scoutfs_xattr_get_next_key(sb, key);
|
||||
|
||||
/* XXX corruption */
|
||||
if (key->skx_part != part) {
|
||||
if (key->skx_part != 0) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
@@ -205,8 +194,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
* the first part and if the next xattr name fits in our
|
||||
* buffer then the item must have included it.
|
||||
*/
|
||||
if (part == 0 &&
|
||||
(ret < sizeof(struct scoutfs_xattr) ||
|
||||
if ((ret < sizeof(struct scoutfs_xattr) ||
|
||||
(xat->name_len <= name_len &&
|
||||
ret < offsetof(struct scoutfs_xattr,
|
||||
name[xat->name_len])) ||
|
||||
@@ -216,7 +204,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
break;
|
||||
}
|
||||
|
||||
if (part == 0 && name_len) {
|
||||
if (name_len > 0) {
|
||||
/* ran out of names that could match */
|
||||
if (le64_to_cpu(key->skx_name_hash) != name_hash) {
|
||||
ret = -ENOENT;
|
||||
@@ -224,64 +212,126 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
|
||||
}
|
||||
|
||||
/* keep looking for our name */
|
||||
if (!xattr_names_equal(name, name_len,
|
||||
xat->name, xat->name_len)) {
|
||||
part = 0;
|
||||
if (!xattr_names_equal(name, name_len, xat->name, xat->name_len)) {
|
||||
le64_add_cpu(&key->skx_id, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* use the matching name we found */
|
||||
last_part = xattr_nr_parts(xat) - 1;
|
||||
}
|
||||
|
||||
total += ret;
|
||||
if (total == bytes || part == last_part) {
|
||||
/* copied as much as we could */
|
||||
ret = total;
|
||||
break;
|
||||
}
|
||||
part++;
|
||||
/* found next name */
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller has already read and verified the xattr's first item.
|
||||
* Copy the value from the tail of the first item and from any future
|
||||
* items into the destination buffer.
|
||||
*/
|
||||
static int copy_xattr_value(struct super_block *sb, struct scoutfs_key *xat_key,
|
||||
struct scoutfs_xattr *xat, int xat_bytes,
|
||||
char *buffer, size_t size,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_key key;
|
||||
size_t copied = 0;
|
||||
int val_tail;
|
||||
int bytes;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
/* must have first item up to value */
|
||||
if (WARN_ON_ONCE(xat_bytes < sizeof(struct scoutfs_xattr)) ||
|
||||
WARN_ON_ONCE(xat_bytes < offsetof(struct scoutfs_xattr, name[xat->name_len])))
|
||||
return -EINVAL;
|
||||
|
||||
/* only ever copy up to the full value */
|
||||
size = min_t(size_t, size, le16_to_cpu(xat->val_len));
|
||||
|
||||
/* must have full first item if caller needs value from second item */
|
||||
val_tail = SCOUTFS_XATTR_MAX_PART_SIZE -
|
||||
offsetof(struct scoutfs_xattr, name[xat->name_len]);
|
||||
if (WARN_ON_ONCE(size > val_tail && xat_bytes != SCOUTFS_XATTR_MAX_PART_SIZE))
|
||||
return -EINVAL;
|
||||
|
||||
/* copy from tail of first item */
|
||||
bytes = min_t(unsigned int, size, val_tail);
|
||||
if (bytes > 0) {
|
||||
memcpy(buffer, &xat->name[xat->name_len], bytes);
|
||||
copied += bytes;
|
||||
}
|
||||
|
||||
key = *xat_key;
|
||||
for (i = 1; copied < size; i++) {
|
||||
key.skx_part = i;
|
||||
bytes = min_t(unsigned int, size - copied, SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
ret = scoutfs_item_lookup(sb, &key, buffer + copied, bytes, lock);
|
||||
if (ret >= 0 && ret != bytes)
|
||||
ret = -EIO;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
copied += ret;
|
||||
}
|
||||
|
||||
return copied;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller is working with items that are either in the allocated
|
||||
* first compound item or further items that are offsets into a value
|
||||
* buffer. Give them a pointer and length of the start of the item.
|
||||
*/
|
||||
static void xattr_item_part_buffer(void **buf, int *len, int part,
|
||||
struct scoutfs_xattr *xat, unsigned int xat_bytes,
|
||||
const char *value, size_t size)
|
||||
{
|
||||
int off;
|
||||
|
||||
if (part == 0) {
|
||||
*buf = xat;
|
||||
*len = xat_bytes;
|
||||
} else {
|
||||
off = (part * SCOUTFS_XATTR_MAX_PART_SIZE) -
|
||||
offsetof(struct scoutfs_xattr, name[xat->name_len]);
|
||||
BUG_ON(off >= size); /* calls limited by number of parts */
|
||||
*buf = (void *)value + off;
|
||||
*len = min_t(size_t, size - off, SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Create all the items associated with the given xattr. If this
|
||||
* returns an error it will have already cleaned up any items it created
|
||||
* before seeing the error.
|
||||
*/
|
||||
static int create_xattr_items(struct inode *inode, u64 id,
|
||||
struct scoutfs_xattr *xat, unsigned int bytes,
|
||||
static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr *xat,
|
||||
int xat_bytes, const char *value, size_t size, u8 new_parts,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key key;
|
||||
unsigned int part_bytes;
|
||||
unsigned int total;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
void *buf;
|
||||
int len;
|
||||
int i;
|
||||
|
||||
init_xattr_key(&key, scoutfs_ino(inode),
|
||||
xattr_name_hash(xat->name, xat->name_len), id);
|
||||
|
||||
total = 0;
|
||||
ret = 0;
|
||||
while (total < bytes) {
|
||||
part_bytes = min_t(unsigned int, bytes - total,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
for (i = 0; i < new_parts; i++) {
|
||||
key.skx_part = i;
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
|
||||
ret = scoutfs_item_create(sb, &key,
|
||||
(void *)xat + total, part_bytes,
|
||||
lock);
|
||||
if (ret) {
|
||||
ret = scoutfs_item_create(sb, &key, buf, len, lock);
|
||||
if (ret < 0) {
|
||||
while (key.skx_part-- > 0)
|
||||
scoutfs_item_delete(sb, &key, lock);
|
||||
break;
|
||||
}
|
||||
|
||||
total += part_bytes;
|
||||
key.skx_part++;
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -329,20 +379,20 @@ out:
|
||||
* deleted items.
|
||||
*/
|
||||
static int change_xattr_items(struct inode *inode, u64 id,
|
||||
struct scoutfs_xattr *new_xat,
|
||||
unsigned int new_bytes, u8 new_parts,
|
||||
u8 old_parts, struct scoutfs_lock *lock)
|
||||
struct scoutfs_xattr *xat, int xat_bytes,
|
||||
const char *value, size_t size,
|
||||
u8 new_parts, u8 old_parts, struct scoutfs_lock *lock)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_key key;
|
||||
int last_created = -1;
|
||||
int bytes;
|
||||
int off;
|
||||
void *buf;
|
||||
int len;
|
||||
int i;
|
||||
int ret;
|
||||
|
||||
init_xattr_key(&key, scoutfs_ino(inode),
|
||||
xattr_name_hash(new_xat->name, new_xat->name_len), id);
|
||||
xattr_name_hash(xat->name, xat->name_len), id);
|
||||
|
||||
/* dirty existing old items */
|
||||
for (i = 0; i < old_parts; i++) {
|
||||
@@ -354,13 +404,10 @@ static int change_xattr_items(struct inode *inode, u64 id,
|
||||
|
||||
/* create any new items past the old */
|
||||
for (i = old_parts; i < new_parts; i++) {
|
||||
off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
|
||||
bytes = min_t(unsigned int, new_bytes - off,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
key.skx_part = i;
|
||||
ret = scoutfs_item_create(sb, &key, (void *)new_xat + off,
|
||||
bytes, lock);
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
|
||||
ret = scoutfs_item_create(sb, &key, buf, len, lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -369,13 +416,10 @@ static int change_xattr_items(struct inode *inode, u64 id,
|
||||
|
||||
/* update dirtied overlapping existing items, last partial first */
|
||||
for (i = min(old_parts, new_parts) - 1; i >= 0; i--) {
|
||||
off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
|
||||
bytes = min_t(unsigned int, new_bytes - off,
|
||||
SCOUTFS_XATTR_MAX_PART_SIZE);
|
||||
|
||||
key.skx_part = i;
|
||||
ret = scoutfs_item_update(sb, &key, (void *)new_xat + off,
|
||||
bytes, lock);
|
||||
xattr_item_part_buffer(&buf, &len, i, xat, xat_bytes, value, size);
|
||||
|
||||
ret = scoutfs_item_update(sb, &key, buf, len, lock);
|
||||
/* only last partial can fail, then we unwind created */
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
@@ -403,72 +447,69 @@ out:
|
||||
* Copy the value for the given xattr name into the caller's buffer, if it
|
||||
* fits. Return the bytes copied or -ERANGE if it doesn't fit.
|
||||
*/
|
||||
ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
|
||||
size_t size)
|
||||
int scoutfs_xattr_get_locked(struct inode *inode, const char *name, void *buffer, size_t size,
|
||||
struct scoutfs_lock *lck)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_xattr *xat = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
struct scoutfs_key key;
|
||||
unsigned int bytes;
|
||||
unsigned int xat_bytes;
|
||||
size_t name_len;
|
||||
int ret;
|
||||
|
||||
if (unknown_prefix(name))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
name_len = strlen(name);
|
||||
if (name_len > SCOUTFS_XATTR_MAX_NAME_LEN)
|
||||
return -ENODATA;
|
||||
|
||||
/* only need enough for caller's name and value sizes */
|
||||
bytes = sizeof(struct scoutfs_xattr) + name_len + size;
|
||||
xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
|
||||
xat_bytes = first_item_bytes(name_len, size);
|
||||
xat = kmalloc(xat_bytes, GFP_NOFS);
|
||||
if (!xat)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lck);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
down_read(&si->xattr_rwsem);
|
||||
|
||||
ret = get_next_xattr(inode, &key, xat, bytes,
|
||||
name, name_len, 0, 0, lck);
|
||||
|
||||
up_read(&si->xattr_rwsem);
|
||||
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_READ);
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes, name, name_len, 0, 0, lck);
|
||||
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = -ENODATA;
|
||||
goto out;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* the caller just wants to know the size */
|
||||
if (size == 0) {
|
||||
ret = le16_to_cpu(xat->val_len);
|
||||
goto out;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* the caller's buffer wasn't big enough */
|
||||
if (size < le16_to_cpu(xat->val_len)) {
|
||||
ret = -ERANGE;
|
||||
goto out;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* XXX corruption, the items didn't match the header */
|
||||
if (ret < xattr_full_bytes(xat)) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
ret = copy_xattr_value(sb, &key, xat, xat_bytes, buffer, size, lck);
|
||||
unlock:
|
||||
up_read(&si->xattr_rwsem);
|
||||
|
||||
kfree(xat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_xattr_get(struct dentry *dentry, const char *name, void *buffer, size_t size)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
|
||||
if (ret == 0) {
|
||||
ret = scoutfs_xattr_get_locked(inode, name, buffer, size, lock);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
}
|
||||
|
||||
ret = le16_to_cpu(xat->val_len);
|
||||
memcpy(buffer, &xat->name[xat->name_len], ret);
|
||||
out:
|
||||
vfree(xat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -576,29 +617,32 @@ int scoutfs_xattr_combine_totl(void *dst, int dst_len, void *src, int src_len)
|
||||
* cause creation to fail if the xattr already exists (_CREATE) or
|
||||
* doesn't already exist (_REPLACE). xattrs can have a zero length
|
||||
* value.
|
||||
*
|
||||
* The caller has acquired cluster locks, holds a transaction, and has
|
||||
* dirtied the inode item so that they can update it after we modify it.
|
||||
* The caller has to know the tags to acquire cluster locks before
|
||||
* holding the transaction so they pass in the parsed tags, or all 0s for
|
||||
* non scoutfs. prefixes.
|
||||
*/
|
||||
static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,
|
||||
const void *value, size_t size, int flags,
|
||||
const struct scoutfs_xattr_prefix_tags *tgs,
|
||||
struct scoutfs_lock *lck, struct scoutfs_lock *totl_lock,
|
||||
struct list_head *ind_locks)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
const u64 ino = scoutfs_ino(inode);
|
||||
struct scoutfs_xattr_totl_val tval = {0,};
|
||||
struct scoutfs_xattr_prefix_tags tgs;
|
||||
struct scoutfs_xattr *xat = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
struct scoutfs_lock *totl_lock = NULL;
|
||||
size_t name_len = strlen(name);
|
||||
struct scoutfs_key totl_key;
|
||||
struct scoutfs_key key;
|
||||
bool undo_srch = false;
|
||||
bool undo_totl = false;
|
||||
LIST_HEAD(ind_locks);
|
||||
u8 found_parts;
|
||||
unsigned int bytes;
|
||||
unsigned int xat_bytes_totl;
|
||||
unsigned int xat_bytes;
|
||||
unsigned int val_len;
|
||||
u64 ind_seq;
|
||||
u64 total;
|
||||
u64 hash = 0;
|
||||
u64 id = 0;
|
||||
@@ -607,6 +651,9 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
|
||||
trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
|
||||
|
||||
if (WARN_ON_ONCE(tgs->totl && !totl_lock))
|
||||
return -EINVAL;
|
||||
|
||||
/* mirror the syscall's errors for large names and values */
|
||||
if (name_len > SCOUTFS_XATTR_MAX_NAME_LEN)
|
||||
return -ERANGE;
|
||||
@@ -617,73 +664,61 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
(flags & ~(XATTR_CREATE | XATTR_REPLACE)))
|
||||
return -EINVAL;
|
||||
|
||||
if (unknown_prefix(name))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (scoutfs_xattr_parse_tags(name, name_len, &tgs) != 0)
|
||||
return -EINVAL;
|
||||
|
||||
if ((tgs.hide | tgs.srch | tgs.totl) && !capable(CAP_SYS_ADMIN))
|
||||
if ((tgs->hide | tgs->srch | tgs->totl) && !capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
if (tgs.totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0))
|
||||
if (tgs->totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0))
|
||||
return ret;
|
||||
|
||||
bytes = sizeof(struct scoutfs_xattr) + name_len + size;
|
||||
/* alloc enough to read old totl value */
|
||||
xat = __vmalloc(bytes + SCOUTFS_XATTR_MAX_TOTL_U64, GFP_NOFS, PAGE_KERNEL);
|
||||
if (!xat) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lck);
|
||||
if (ret)
|
||||
goto out;
|
||||
/* allocate enough to always read an existing xattr's totl */
|
||||
xat_bytes_totl = first_item_bytes(name_len,
|
||||
max_t(size_t, size, SCOUTFS_XATTR_MAX_TOTL_U64));
|
||||
/* but store partial first item that only includes the new xattr's value */
|
||||
xat_bytes = first_item_bytes(name_len, size);
|
||||
xat = kmalloc(xat_bytes_totl, GFP_NOFS);
|
||||
if (!xat)
|
||||
return -ENOMEM;
|
||||
|
||||
down_write(&si->xattr_rwsem);
|
||||
|
||||
/* find an existing xattr to delete, including possible totl value */
|
||||
ret = get_next_xattr(inode, &key, xat,
|
||||
sizeof(struct scoutfs_xattr) + name_len + SCOUTFS_XATTR_MAX_TOTL_U64,
|
||||
name, name_len, 0, 0, lck);
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes_totl, name, name_len, 0, 0, lck);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto unlock;
|
||||
goto out;
|
||||
|
||||
/* check existence constraint flags */
|
||||
if (ret == -ENOENT && (flags & XATTR_REPLACE)) {
|
||||
ret = -ENODATA;
|
||||
goto unlock;
|
||||
goto out;
|
||||
} else if (ret >= 0 && (flags & XATTR_CREATE)) {
|
||||
ret = -EEXIST;
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* not an error to delete something that doesn't exist */
|
||||
if (ret == -ENOENT && !value) {
|
||||
ret = 0;
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* s64 count delta if we create or delete */
|
||||
if (tgs.totl)
|
||||
if (tgs->totl)
|
||||
tval.count = cpu_to_le64((u64)!!(value) - (u64)!!(ret != -ENOENT));
|
||||
|
||||
/* found fields in key will also be used */
|
||||
found_parts = ret >= 0 ? xattr_nr_parts(xat) : 0;
|
||||
|
||||
if (found_parts && tgs.totl) {
|
||||
if (found_parts && tgs->totl) {
|
||||
/* parse old totl value before we clobber xat buf */
|
||||
val_len = ret - offsetof(struct scoutfs_xattr, name[xat->name_len]);
|
||||
ret = parse_totl_u64(&xat->name[xat->name_len], val_len, &total);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
goto out;
|
||||
|
||||
le64_add_cpu(&tval.total, -total);
|
||||
}
|
||||
|
||||
/* prepare our xattr */
|
||||
/* prepare the xattr header, name, and start of value in first item */
|
||||
if (value) {
|
||||
if (found_parts)
|
||||
id = le64_to_cpu(key.skx_id);
|
||||
@@ -693,17 +728,94 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
xat->val_len = cpu_to_le16(size);
|
||||
memset(xat->__pad, 0, sizeof(xat->__pad));
|
||||
memcpy(xat->name, name, name_len);
|
||||
memcpy(&xat->name[xat->name_len], value, size);
|
||||
memcpy(&xat->name[name_len], value,
|
||||
min(size, SCOUTFS_XATTR_MAX_PART_SIZE -
|
||||
offsetof(struct scoutfs_xattr, name[name_len])));
|
||||
|
||||
if (tgs.totl) {
|
||||
if (tgs->totl) {
|
||||
ret = parse_totl_u64(value, size, &total);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
le64_add_cpu(&tval.total, total);
|
||||
}
|
||||
|
||||
if (tgs->srch && !(found_parts && value)) {
|
||||
if (found_parts)
|
||||
id = le64_to_cpu(key.skx_id);
|
||||
hash = scoutfs_hash64(name, name_len);
|
||||
ret = scoutfs_forest_srch_add(sb, hash, ino, id);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
undo_srch = true;
|
||||
}
|
||||
|
||||
if (tgs->totl) {
|
||||
ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
undo_totl = true;
|
||||
}
|
||||
|
||||
if (found_parts && value)
|
||||
ret = change_xattr_items(inode, id, xat, xat_bytes, value, size,
|
||||
xattr_nr_parts(xat), found_parts, lck);
|
||||
else if (found_parts)
|
||||
ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
|
||||
le64_to_cpu(key.skx_id), found_parts,
|
||||
lck);
|
||||
else
|
||||
ret = create_xattr_items(inode, id, xat, xat_bytes, value, size,
|
||||
xattr_nr_parts(xat), lck);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* XXX do these want i_mutex or anything? */
|
||||
inode_inc_iversion(inode);
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
if (ret < 0 && undo_srch) {
|
||||
err = scoutfs_forest_srch_add(sb, hash, ino, id);
|
||||
BUG_ON(err);
|
||||
}
|
||||
if (ret < 0 && undo_totl) {
|
||||
/* _delta() on dirty items shouldn't fail */
|
||||
tval.total = cpu_to_le64(-le64_to_cpu(tval.total));
|
||||
tval.count = cpu_to_le64(-le64_to_cpu(tval.count));
|
||||
err = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
|
||||
BUG_ON(err);
|
||||
}
|
||||
|
||||
up_write(&si->xattr_rwsem);
|
||||
kfree(xat);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int scoutfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
|
||||
size_t size, int flags)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_xattr_prefix_tags tgs;
|
||||
struct scoutfs_lock *totl_lock = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
size_t name_len = strlen(name);
|
||||
LIST_HEAD(ind_locks);
|
||||
u64 ind_seq;
|
||||
int ret;
|
||||
|
||||
if (scoutfs_xattr_parse_tags(name, name_len, &tgs) != 0)
|
||||
return -EINVAL;
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lck);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
if (tgs.totl) {
|
||||
ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &totl_lock);
|
||||
if (ret)
|
||||
@@ -723,79 +835,98 @@ retry:
|
||||
if (ret < 0)
|
||||
goto release;
|
||||
|
||||
if (tgs.srch && !(found_parts && value)) {
|
||||
if (found_parts)
|
||||
id = le64_to_cpu(key.skx_id);
|
||||
hash = scoutfs_hash64(name, name_len);
|
||||
ret = scoutfs_forest_srch_add(sb, hash, ino, id);
|
||||
if (ret < 0)
|
||||
goto release;
|
||||
undo_srch = true;
|
||||
}
|
||||
|
||||
if (tgs.totl) {
|
||||
ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
|
||||
if (ret < 0)
|
||||
goto release;
|
||||
undo_totl = true;
|
||||
}
|
||||
|
||||
if (found_parts && value)
|
||||
ret = change_xattr_items(inode, id, xat, bytes,
|
||||
xattr_nr_parts(xat), found_parts, lck);
|
||||
else if (found_parts)
|
||||
ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
|
||||
le64_to_cpu(key.skx_id), found_parts,
|
||||
lck);
|
||||
else
|
||||
ret = create_xattr_items(inode, id, xat, bytes, lck);
|
||||
if (ret < 0)
|
||||
goto release;
|
||||
|
||||
/* XXX do these want i_mutex or anything? */
|
||||
inode_inc_iversion(inode);
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
scoutfs_update_inode_item(inode, lck, &ind_locks);
|
||||
ret = 0;
|
||||
ret = scoutfs_xattr_set_locked(dentry->d_inode, name, name_len, value, size, flags, &tgs,
|
||||
lck, totl_lock, &ind_locks);
|
||||
if (ret == 0)
|
||||
scoutfs_update_inode_item(inode, lck, &ind_locks);
|
||||
|
||||
release:
|
||||
if (ret < 0 && undo_srch) {
|
||||
err = scoutfs_forest_srch_add(sb, hash, ino, id);
|
||||
BUG_ON(err);
|
||||
}
|
||||
if (ret < 0 && undo_totl) {
|
||||
/* _delta() on dirty items shouldn't fail */
|
||||
tval.total = cpu_to_le64(-le64_to_cpu(tval.total));
|
||||
tval.count = cpu_to_le64(-le64_to_cpu(tval.count));
|
||||
err = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
|
||||
BUG_ON(err);
|
||||
}
|
||||
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
unlock:
|
||||
up_write(&si->xattr_rwsem);
|
||||
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
|
||||
scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY);
|
||||
out:
|
||||
vfree(xat);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
/*
 * Recover the full xattr name from the suffix that an xattr handler is
 * given.  Newer kernels provide xattr_full_name(handler) for this; here
 * we have no handler argument to work with, so the caller passes the
 * length of the skipped prefix in len and we simply step the name
 * pointer back over it.  This is meant to become a compat hook that
 * either calls the kernel helper or uses this prefix length hack.
 *
 * handler is unused.  This assumes name really does point len bytes
 * into a longer string that starts with the prefix; nothing here can
 * check that.
 */
static const char *full_name_hack(void *handler, const char *name, int len)
{
	return name - len;
}
|
||||
|
||||
/*
 * xattr handler get method.  The name we're given has had the handler's
 * prefix skipped; handler_flags carries the prefix length so that we can
 * rebuild the full name before calling scoutfs_xattr_get().
 */
static int scoutfs_xattr_get_handler(struct dentry *dentry, const char *name,
				     void *value, size_t size, int handler_flags)
{
	const char *full_name = full_name_hack(NULL, name, handler_flags);

	return scoutfs_xattr_get(dentry, full_name, value, size);
}
|
||||
|
||||
/*
 * xattr handler set method.  The name we're given has had the handler's
 * prefix skipped; handler_flags carries the prefix length so that we can
 * rebuild the full name before calling scoutfs_xattr_set().
 */
static int scoutfs_xattr_set_handler(struct dentry *dentry, const char *name,
				     const void *value, size_t size, int flags,
				     int handler_flags)
{
	const char *full_name = full_name_hack(NULL, name, handler_flags);

	return scoutfs_xattr_set(dentry, full_name, value, size, flags);
}
|
||||
|
||||
int scoutfs_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
return scoutfs_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
|
||||
}
|
||||
static const struct xattr_handler scoutfs_xattr_user_handler = {
|
||||
.prefix = XATTR_USER_PREFIX,
|
||||
.flags = XATTR_USER_PREFIX_LEN,
|
||||
.get = scoutfs_xattr_get_handler,
|
||||
.set = scoutfs_xattr_set_handler,
|
||||
};
|
||||
|
||||
static const struct xattr_handler scoutfs_xattr_scoutfs_handler = {
|
||||
.prefix = SCOUTFS_XATTR_PREFIX,
|
||||
.flags = SCOUTFS_XATTR_PREFIX_LEN,
|
||||
.get = scoutfs_xattr_get_handler,
|
||||
.set = scoutfs_xattr_set_handler,
|
||||
};
|
||||
|
||||
static const struct xattr_handler scoutfs_xattr_trusted_handler = {
|
||||
.prefix = XATTR_TRUSTED_PREFIX,
|
||||
.flags = XATTR_TRUSTED_PREFIX_LEN,
|
||||
.get = scoutfs_xattr_get_handler,
|
||||
.set = scoutfs_xattr_set_handler,
|
||||
};
|
||||
|
||||
static const struct xattr_handler scoutfs_xattr_security_handler = {
|
||||
.prefix = XATTR_SECURITY_PREFIX,
|
||||
.flags = XATTR_SECURITY_PREFIX_LEN,
|
||||
.get = scoutfs_xattr_get_handler,
|
||||
.set = scoutfs_xattr_set_handler,
|
||||
};
|
||||
|
||||
static const struct xattr_handler scoutfs_xattr_acl_access_handler = {
|
||||
.prefix = XATTR_NAME_POSIX_ACL_ACCESS,
|
||||
.flags = ACL_TYPE_ACCESS,
|
||||
.get = scoutfs_acl_get_xattr,
|
||||
.set = scoutfs_acl_set_xattr,
|
||||
};
|
||||
|
||||
static const struct xattr_handler scoutfs_xattr_acl_default_handler = {
|
||||
.prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
|
||||
.flags = ACL_TYPE_DEFAULT,
|
||||
.get = scoutfs_acl_get_xattr,
|
||||
.set = scoutfs_acl_set_xattr,
|
||||
};
|
||||
|
||||
const struct xattr_handler *scoutfs_xattr_handlers[] = {
|
||||
&scoutfs_xattr_user_handler,
|
||||
&scoutfs_xattr_scoutfs_handler,
|
||||
&scoutfs_xattr_trusted_handler,
|
||||
&scoutfs_xattr_security_handler,
|
||||
&scoutfs_xattr_acl_access_handler,
|
||||
&scoutfs_xattr_acl_default_handler,
|
||||
NULL
|
||||
};
|
||||
|
||||
ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
size_t size, __u32 *hash_pos, __u64 *id_pos,
|
||||
@@ -807,7 +938,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
struct scoutfs_xattr *xat = NULL;
|
||||
struct scoutfs_lock *lck = NULL;
|
||||
struct scoutfs_key key;
|
||||
unsigned int bytes;
|
||||
unsigned int xat_bytes;
|
||||
ssize_t total = 0;
|
||||
u32 name_hash = 0;
|
||||
bool is_hidden;
|
||||
@@ -820,8 +951,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
id = *id_pos;
|
||||
|
||||
/* need a buffer large enough for all possible names */
|
||||
bytes = sizeof(struct scoutfs_xattr) + SCOUTFS_XATTR_MAX_NAME_LEN;
|
||||
xat = kmalloc(bytes, GFP_NOFS);
|
||||
xat_bytes = first_item_bytes(SCOUTFS_XATTR_MAX_NAME_LEN, 0);
|
||||
xat = kmalloc(xat_bytes, GFP_NOFS);
|
||||
if (!xat) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
@@ -834,8 +965,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
down_read(&si->xattr_rwsem);
|
||||
|
||||
for (;;) {
|
||||
ret = get_next_xattr(inode, &key, xat, bytes,
|
||||
NULL, 0, name_hash, id, lck);
|
||||
ret = get_next_xattr(inode, &key, xat, xat_bytes, NULL, 0, name_hash, id, lck);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = total;
|
||||
|
||||
@@ -1,25 +1,29 @@
|
||||
#ifndef _SCOUTFS_XATTR_H_
|
||||
#define _SCOUTFS_XATTR_H_
|
||||
|
||||
ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
|
||||
size_t size);
|
||||
int scoutfs_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
int scoutfs_removexattr(struct dentry *dentry, const char *name);
|
||||
ssize_t scoutfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
|
||||
ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
size_t size, __u32 *hash_pos, __u64 *id_pos,
|
||||
bool e_range, bool show_hidden);
|
||||
|
||||
int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
struct scoutfs_xattr_prefix_tags {
|
||||
unsigned long hide:1,
|
||||
srch:1,
|
||||
totl:1;
|
||||
};
|
||||
|
||||
extern const struct xattr_handler *scoutfs_xattr_handlers[];
|
||||
|
||||
int scoutfs_xattr_get_locked(struct inode *inode, const char *name, void *buffer, size_t size,
|
||||
struct scoutfs_lock *lck);
|
||||
int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,
|
||||
const void *value, size_t size, int flags,
|
||||
const struct scoutfs_xattr_prefix_tags *tgs,
|
||||
struct scoutfs_lock *lck, struct scoutfs_lock *totl_lock,
|
||||
struct list_head *ind_locks);
|
||||
|
||||
ssize_t scoutfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
|
||||
ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
|
||||
size_t size, __u32 *hash_pos, __u64 *id_pos,
|
||||
bool e_range, bool show_hidden);
|
||||
int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
|
||||
struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
|
||||
struct scoutfs_xattr_prefix_tags *tgs);
|
||||
|
||||
|
||||
1
tests/.gitignore
vendored
1
tests/.gitignore
vendored
@@ -3,6 +3,7 @@ src/createmany
|
||||
src/dumb_renameat2
|
||||
src/dumb_setxattr
|
||||
src/handle_cat
|
||||
src/handle_fsetxattr
|
||||
src/bulk_create_paths
|
||||
src/find_xattrs
|
||||
src/stage_tmpfile
|
||||
|
||||
@@ -6,10 +6,12 @@ BIN := src/createmany \
|
||||
src/dumb_renameat2 \
|
||||
src/dumb_setxattr \
|
||||
src/handle_cat \
|
||||
src/handle_fsetxattr \
|
||||
src/bulk_create_paths \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
src/create_xattr_loop
|
||||
src/create_xattr_loop \
|
||||
src/fragmented_data_extents
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
|
||||
@@ -1,5 +1,18 @@
|
||||
#!/usr/bin/bash
|
||||
|
||||
#
|
||||
# This fencing script is used for testing clusters of multiple mounts on
|
||||
# a single host. It finds mounts to fence by looking for their rids and
|
||||
# only knows how to "fence" by using forced unmount.
|
||||
#
|
||||
|
||||
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
|
||||
|
||||
log() {
|
||||
echo "$@" > /dev/stderr
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo_fail() {
|
||||
echo "$@" > /dev/stderr
|
||||
exit 1
|
||||
@@ -7,29 +20,24 @@ echo_fail() {
|
||||
|
||||
rid="$SCOUTFS_FENCED_REQ_RID"
|
||||
|
||||
#
|
||||
# Look for a local mount with the rid to fence. Typically we'll at
|
||||
# least find the mount with the server that requested the fence that
|
||||
# we're processing. But it's possible that mounts are unmounted
|
||||
# before, or while, we're running.
|
||||
#
|
||||
mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
|
||||
echo_fail "findmnt -t scoutfs failed" > /dev/stderr
|
||||
for fs in /sys/fs/scoutfs/*; do
|
||||
[ ! -d "$fs" ] && continue
|
||||
|
||||
for mnt in $mnts; do
|
||||
mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
|
||||
echo_fail "scoutfs statfs $mnt failed"
|
||||
|
||||
if [ "$mnt_rid" == "$rid" ]; then
|
||||
umount -f "$mnt" || \
|
||||
echo_fail "umout -f $mnt"
|
||||
|
||||
exit 0
|
||||
fs_rid="$(cat $fs/rid)" || \
|
||||
echo_fail "failed to get rid in $fs"
|
||||
if [ "$fs_rid" != "$rid" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
nr="$(cat $fs/data_device_maj_min)" || \
|
||||
echo_fail "failed to get data device major:minor in $fs"
|
||||
|
||||
mnts=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
|
||||
echo_fail "findmnt -t scoutfs -S $nr failed"
|
||||
for mnt in $mnts; do
|
||||
umount -f "$mnt" || \
|
||||
echo_fail "umout -f $mnt failed"
|
||||
done
|
||||
done
|
||||
|
||||
#
|
||||
# If the mount doesn't exist on this host then it can't access the
|
||||
# devices by definition and can be considered fenced.
|
||||
#
|
||||
exit 0
|
||||
|
||||
@@ -56,8 +56,11 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .*: all clients recovered"
|
||||
re="$re|scoutfs .* error: client rid.*lock recovery timed out"
|
||||
|
||||
# some tests mount w/o options
|
||||
# we test bad devices and options
|
||||
re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
|
||||
re="$re|scoutfs .* error: meta_super META flag not set"
|
||||
re="$re|scoutfs .* error: could not open metadev:.*"
|
||||
re="$re|scoutfs .* error: Unknown or malformed option,.*"
|
||||
|
||||
# in debugging kernels we can slow things down a bit
|
||||
re="$re|hrtimer: interrupt took .*"
|
||||
|
||||
@@ -75,6 +75,20 @@ t_fs_nrs()
|
||||
seq 0 $((T_NR_MOUNTS - 1))
|
||||
}
|
||||
|
||||
#
# outputs "1" if the fs number given as the first argument has "1" in
# its quorum/is_leader file.  All other cases output 0, including the fs
# nr being a client which won't have a quorum/ dir.
#
t_fs_is_leader()
{
	# use our own argument rather than whatever "i" the caller happens
	# to have in scope
	local nr="$1"
	local leader_file

	leader_file="$(t_sysfs_path $nr)/quorum/is_leader"

	if [ "$(cat "$leader_file" 2>/dev/null)" == "1" ]; then
		echo "1"
	else
		echo "0"
	fi
}
|
||||
|
||||
#
|
||||
# Output the mount nr of the current server. This takes no steps to
|
||||
# ensure that the server doesn't shut down and have some other mount
|
||||
@@ -83,7 +97,7 @@ t_fs_nrs()
|
||||
t_server_nr()
|
||||
{
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
|
||||
if [ "$(t_fs_is_leader $i)" == "1" ]; then
|
||||
echo $i
|
||||
return
|
||||
fi
|
||||
@@ -101,7 +115,7 @@ t_server_nr()
|
||||
t_first_client_nr()
|
||||
{
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
|
||||
if [ "$(t_fs_is_leader $i)" == "0" ]; then
|
||||
echo $i
|
||||
return
|
||||
fi
|
||||
@@ -362,3 +376,57 @@ t_wait_for_leader() {
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
#
# Output the current contents of the named mount option's sysfs file
# for the given fs number.
#
t_get_sysfs_mount_option() {
	local nr="$1"
	local name="$2"
	local dir

	dir="$(t_sysfs_path $nr)"
	cat "$dir/mount_options/$name"
}
|
||||
|
||||
#
# Write the given value to the named mount option's sysfs file for the
# given fs number.
#
t_set_sysfs_mount_option() {
	local nr="$1"
	local name="$2"
	local val="$3"
	local dir

	dir="$(t_sysfs_path $nr)"
	echo "$val" > "$dir/mount_options/$name"
}
|
||||
|
||||
#
# Set the named mount option to the same value on every mount.
#
t_set_all_sysfs_mount_options() {
	local name="$1"
	local val="$2"
	local i

	for i in $(t_fs_nrs); do
		# quote so that an empty value or one with whitespace is
		# still passed through as the single third argument
		t_set_sysfs_mount_option "$i" "$name" "$val"
	done
}
|
||||
|
||||
declare -A _saved_opts
|
||||
#
# Record the current value of the named mount option on every mount in
# the _saved_opts array, indexed by "<name>_<fs nr>", which is where
# t_restore_all_sysfs_mount_options looks for it.
#
t_save_all_sysfs_mount_options() {
	local name="$1"
	local ind
	local i

	for i in $(t_fs_nrs); do
		ind="${name}_${i}"

		# read through the shared helper so the option path is only
		# built in one place
		_saved_opts[$ind]="$(t_get_sysfs_mount_option "$i" "$name")"
	done
}
|
||||
|
||||
#
# Write the values recorded in _saved_opts under "<name>_<fs nr>" back
# to the named mount option on every mount.
#
t_restore_all_sysfs_mount_options() {
	local name="$1"
	local ind
	local i

	for i in $(t_fs_nrs)
	do
		ind="${name}_${i}"
		t_set_sysfs_mount_option $i $name "${_saved_opts[$ind]}"
	done
}
|
||||
|
||||
6
tests/golden/basic-bad-mounts
Normal file
6
tests/golden/basic-bad-mounts
Normal file
@@ -0,0 +1,6 @@
|
||||
== prepare devices, mount point, and logs
|
||||
== bad devices, bad options
|
||||
== swapped devices
|
||||
== both meta devices
|
||||
== both data devices
|
||||
== good volume, bad option and good options
|
||||
6
tests/golden/basic-truncate
Normal file
6
tests/golden/basic-truncate
Normal file
@@ -0,0 +1,6 @@
|
||||
== truncate writes zeroed partial end of file block
|
||||
0000000 0a79 0a79 0a79 0a79 0a79 0a79 0a79 0a79
|
||||
*
|
||||
0006144 0000 0000 0000 0000 0000 0000 0000 0000
|
||||
*
|
||||
0012288
|
||||
26
tests/golden/data-prealloc
Normal file
26
tests/golden/data-prealloc
Normal file
@@ -0,0 +1,26 @@
|
||||
== initial writes smaller than prealloc grow to prealloc size
|
||||
/mnt/test/test/data-prealloc/file-1: 7 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 7 extents found
|
||||
== larger files get full prealloc extents
|
||||
/mnt/test/test/data-prealloc/file-1: 9 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 9 extents found
|
||||
== non-streaming writes with contig have per-block extents
|
||||
/mnt/test/test/data-prealloc/file-1: 32 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 32 extents found
|
||||
== any writes to region prealloc get full extents
|
||||
/mnt/test/test/data-prealloc/file-1: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-1: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 4 extents found
|
||||
== streaming offline writes get full extents either way
|
||||
/mnt/test/test/data-prealloc/file-1: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-1: 4 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 4 extents found
|
||||
== goofy preallocation amounts work
|
||||
/mnt/test/test/data-prealloc/file-1: 5 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 5 extents found
|
||||
/mnt/test/test/data-prealloc/file-1: 5 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 5 extents found
|
||||
/mnt/test/test/data-prealloc/file-1: 3 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 3 extents found
|
||||
3
tests/golden/fallocate
Normal file
3
tests/golden/fallocate
Normal file
@@ -0,0 +1,3 @@
|
||||
== creating reasonably large per-mount files
|
||||
== 10s of racing cold reads and fallocate nop
|
||||
== cleaning up files
|
||||
3
tests/golden/large-fragmented-free
Normal file
3
tests/golden/large-fragmented-free
Normal file
@@ -0,0 +1,3 @@
|
||||
== creating fragmented extents
|
||||
== unlink file with moved extents to free extents per block
|
||||
== cleanup
|
||||
3
tests/golden/lock-recover-invalidate
Normal file
3
tests/golden/lock-recover-invalidate
Normal file
@@ -0,0 +1,3 @@
|
||||
== starting background invalidating read/write load
|
||||
== 60s of lock recovery during invalidating load
|
||||
== stopping background load
|
||||
0
tests/golden/lock-rever-invalidate
Normal file
0
tests/golden/lock-rever-invalidate
Normal file
@@ -2,3 +2,4 @@
|
||||
== unlinked and opened inodes still exist
|
||||
== orphan from failed evict deletion is picked up
|
||||
== orphaned inos in all mounts all deleted
|
||||
== 30s of racing evict deletion, orphan scanning, and open by handle
|
||||
|
||||
@@ -7,3 +7,4 @@ found second
|
||||
== changing metadata must increase meta seq
|
||||
== changing contents must increase data seq
|
||||
== make sure dirtying doesn't livelock walk
|
||||
== concurrent update attempts maintain single entries
|
||||
|
||||
@@ -40,6 +40,7 @@ generic/092
|
||||
generic/098
|
||||
generic/101
|
||||
generic/104
|
||||
generic/105
|
||||
generic/106
|
||||
generic/107
|
||||
generic/117
|
||||
@@ -51,6 +52,7 @@ generic/184
|
||||
generic/221
|
||||
generic/228
|
||||
generic/236
|
||||
generic/237
|
||||
generic/245
|
||||
generic/249
|
||||
generic/257
|
||||
@@ -63,6 +65,7 @@ generic/308
|
||||
generic/309
|
||||
generic/313
|
||||
generic/315
|
||||
generic/319
|
||||
generic/322
|
||||
generic/335
|
||||
generic/336
|
||||
@@ -72,6 +75,7 @@ generic/342
|
||||
generic/343
|
||||
generic/348
|
||||
generic/360
|
||||
generic/375
|
||||
generic/376
|
||||
generic/377
|
||||
Not
|
||||
@@ -282,4 +286,4 @@ shared/004
|
||||
shared/032
|
||||
shared/051
|
||||
shared/289
|
||||
Passed all 75 tests
|
||||
Passed all 79 tests
|
||||
|
||||
@@ -58,6 +58,7 @@ $(basename $0) options:
|
||||
-m | Run mkfs on the device before mounting and running
|
||||
| tests. Implies unmounting existing mounts first.
|
||||
-n <nr> | The number of devices and mounts to test.
|
||||
-o <opts> | Add option string to all mounts during all tests.
|
||||
-P | Enable trace_printk.
|
||||
-p | Exit script after preparing mounts only, don't run tests.
|
||||
-q <nr> | The first <nr> mounts will be quorum members. Must be
|
||||
@@ -68,6 +69,7 @@ $(basename $0) options:
|
||||
-s | Skip git repo checkouts.
|
||||
-t | Enabled trace events that match the given glob argument.
|
||||
| Multiple options enable multiple globbed events.
|
||||
-T <nr> | Multiply the original trace buffer size by nr during the run.
|
||||
-X | xfstests git repo. Used by tests/xfstests.sh.
|
||||
-x | xfstests git branch to checkout and track.
|
||||
-y | xfstests ./check additional args
|
||||
@@ -136,6 +138,12 @@ while true; do
|
||||
T_NR_MOUNTS="$2"
|
||||
shift
|
||||
;;
|
||||
-o)
|
||||
test -n "$2" || die "-o must have option string argument"
|
||||
# always appending to existing options
|
||||
T_MNT_OPTIONS+=",$2"
|
||||
shift
|
||||
;;
|
||||
-P)
|
||||
T_TRACE_PRINTK="1"
|
||||
;;
|
||||
@@ -160,6 +168,11 @@ while true; do
|
||||
T_TRACE_GLOB+=("$2")
|
||||
shift
|
||||
;;
|
||||
-T)
|
||||
test -n "$2" || die "-T must have trace buffer size multiplier argument"
|
||||
T_TRACE_MULT="$2"
|
||||
shift
|
||||
;;
|
||||
-X)
|
||||
test -n "$2" || die "-X requires xfstests git repo dir argument"
|
||||
T_XFSTESTS_REPO="$2"
|
||||
@@ -345,6 +358,13 @@ if [ -n "$T_INSMOD" ]; then
|
||||
cmd insmod "$T_KMOD/src/scoutfs.ko"
|
||||
fi
|
||||
|
||||
if [ -n "$T_TRACE_MULT" ]; then
|
||||
orig_trace_size=$(cat /sys/kernel/debug/tracing/buffer_size_kb)
|
||||
mult_trace_size=$((orig_trace_size * T_TRACE_MULT))
|
||||
msg "increasing trace buffer size from $orig_trace_size KiB to $mult_trace_size KiB"
|
||||
echo $mult_trace_size > /sys/kernel/debug/tracing/buffer_size_kb
|
||||
fi
|
||||
|
||||
nr_globs=${#T_TRACE_GLOB[@]}
|
||||
if [ $nr_globs -gt 0 ]; then
|
||||
echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
|
||||
@@ -374,19 +394,21 @@ fi
|
||||
# always describe tracing in the logs
|
||||
cmd cat /sys/kernel/debug/tracing/set_event
|
||||
cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
|
||||
/sys/kernel/debug/tracing/buffer_size_kb \
|
||||
/proc/sys/kernel/ftrace_dump_on_oops
|
||||
|
||||
#
|
||||
# Build a fenced config that runs scripts out of the repository rather
|
||||
# than the default system directory
|
||||
#
|
||||
conf="$T_RESULTS/scoutfs-fencd.conf"
|
||||
conf="$T_RESULTS/scoutfs-fenced.conf"
|
||||
cat > $conf << EOF
|
||||
SCOUTFS_FENCED_DELAY=1
|
||||
SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh
|
||||
SCOUTFS_FENCED_RUN_ARGS=""
|
||||
SCOUTFS_FENCED_RUN_ARGS="ignored run args"
|
||||
EOF
|
||||
export SCOUTFS_FENCED_CONFIG_FILE="$conf"
|
||||
T_FENCED_LOG="$T_RESULTS/fenced.log"
|
||||
|
||||
#
|
||||
# Run the agent in the background, log its output, and kill it if we
|
||||
@@ -394,7 +416,7 @@ export SCOUTFS_FENCED_CONFIG_FILE="$conf"
|
||||
#
|
||||
fenced_log()
|
||||
{
|
||||
echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
|
||||
echo "[$(timestamp)] $*" >> "$T_FENCED_LOG"
|
||||
}
|
||||
fenced_pid=""
|
||||
kill_fenced()
|
||||
@@ -405,7 +427,7 @@ kill_fenced()
|
||||
fi
|
||||
}
|
||||
trap kill_fenced EXIT
|
||||
$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &
|
||||
$T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
|
||||
fenced_pid=$!
|
||||
fenced_log "started fenced pid $fenced_pid in the background"
|
||||
|
||||
@@ -429,6 +451,7 @@ for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
|
||||
if [ "$i" -lt "$T_QUORUM" ]; then
|
||||
opts="$opts,quorum_slot_nr=$i"
|
||||
fi
|
||||
opts="${opts}${T_MNT_OPTIONS}"
|
||||
|
||||
msg "mounting $meta_dev|$data_dev on $dir"
|
||||
cmd mount -t scoutfs $opts "$data_dev" "$dir" &
|
||||
@@ -603,6 +626,9 @@ if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then
|
||||
echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
|
||||
echo 0 > /sys/kernel/debug/tracing/options/trace_printk
|
||||
cat /sys/kernel/debug/tracing/trace > "$T_RESULTS/traces"
|
||||
if [ -n "$orig_trace_size" ]; then
|
||||
echo $orig_trace_size > /sys/kernel/debug/tracing/buffer_size_kb
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$skipped" == 0 -a "$failed" == 0 ]; then
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
export-get-name-parent.sh
|
||||
basic-block-counts.sh
|
||||
basic-bad-mounts.sh
|
||||
inode-items-updated.sh
|
||||
simple-inode-index.sh
|
||||
simple-staging.sh
|
||||
simple-release-extents.sh
|
||||
fallocate.sh
|
||||
basic-truncate.sh
|
||||
data-prealloc.sh
|
||||
setattr_more.sh
|
||||
offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
large-fragmented-free.sh
|
||||
enospc.sh
|
||||
srch-basic-functionality.sh
|
||||
simple-xattr-unit.sh
|
||||
@@ -15,6 +20,7 @@ lock-refleak.sh
|
||||
lock-shrink-consistency.sh
|
||||
lock-pr-cw-conflict.sh
|
||||
lock-revoke-getcwd.sh
|
||||
lock-recover-invalidate.sh
|
||||
export-lookup-evict-race.sh
|
||||
createmany-parallel.sh
|
||||
createmany-large-names.sh
|
||||
|
||||
113
tests/src/fragmented_data_extents.c
Normal file
113
tests/src/fragmented_data_extents.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This creates fragmented data extents.
|
||||
*
|
||||
* A file is created that has alternating free and allocated extents.
|
||||
* This also results in the global allocator having the matching
|
||||
* fragmented free extent pattern. While that file is being created,
|
||||
* occasionally an allocated extent is moved to another file. This
|
||||
* results in a file that has fragmented extents at a given stride that
|
||||
* can be deleted to create free data extents with a given stride.
|
||||
*
|
||||
* We don't have hole punching so to do this quickly we use a goofy
|
||||
* combination of fallocate, truncate, and our move_blocks ioctl.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <linux/types.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "ioctl.h"
|
||||
|
||||
#define BLOCK_SIZE 4096
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scoutfs_ioctl_move_blocks mb = {0,};
|
||||
unsigned long long freed_extents;
|
||||
unsigned long long move_stride;
|
||||
unsigned long long i;
|
||||
int alloc_fd;
|
||||
int trunc_fd;
|
||||
off_t off;
|
||||
int ret;
|
||||
|
||||
if (argc != 5) {
|
||||
printf("%s <freed_extents> <move_stride> <alloc_file> <trunc_file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
freed_extents = strtoull(argv[1], NULL, 0);
|
||||
move_stride = strtoull(argv[2], NULL, 0);
|
||||
|
||||
alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (alloc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (trunc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {
|
||||
|
||||
ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((i % move_stride) == 0) {
|
||||
mb.from_fd = alloc_fd;
|
||||
mb.from_off = off;
|
||||
mb.len = BLOCK_SIZE;
|
||||
mb.to_off = i * BLOCK_SIZE;
|
||||
|
||||
ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "move from off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off,
|
||||
errno, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (alloc_fd > -1)
|
||||
close(alloc_fd);
|
||||
if (trunc_fd > -1)
|
||||
close(trunc_fd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
189
tests/src/handle_fsetxattr.c
Normal file
189
tests/src/handle_fsetxattr.c
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <inttypes.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <endian.h>
|
||||
#include <time.h>
|
||||
#include <linux/types.h>
|
||||
#include <sys/xattr.h>
|
||||
|
||||
#define FILEID_SCOUTFS 0x81
|
||||
#define FILEID_SCOUTFS_WITH_PARENT 0x82
|
||||
|
||||
/*
 * Buffer passed to open_by_handle_at(): the generic struct file_handle
 * header followed by the scoutfs-specific payload.
 */
struct our_handle {
|
||||
struct file_handle handle;
|
||||
/*
|
||||
* scoutfs file handle can be ino or ino/parent. The
|
||||
* handle_type field of struct file_handle denotes which
|
||||
* version is in use. We only use the ino variant here.
|
||||
*/
|
||||
__le64 scoutfs_ino;
|
||||
};
|
||||
|
||||
#define DEFAULT_NAME "user.handle_fsetxattr"
|
||||
#define DEFAULT_VALUE "value"
|
||||
|
||||
static void exit_usage(void)
|
||||
{
|
||||
printf(" -h/-? output this usage message and exit\n"
|
||||
" -e keep trying on enoent, consider success an error\n"
|
||||
" -i <num> 64bit inode number for handle open, can be multiple\n"
|
||||
" -m <string> scoutfs mount path string for ioctl fd\n"
|
||||
" -n <string> optional xattr name string, defaults to \""DEFAULT_NAME"\"\n"
|
||||
" -s <num> loop for num seconds, defaults to 0 for one iteration"
|
||||
" -v <string> optional xattr value string, defaults to \""DEFAULT_VALUE"\"\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct our_handle handle;
|
||||
struct timespec ts;
|
||||
bool enoent_success_err = false;
|
||||
uint64_t seconds = 0;
|
||||
char *value = NULL;
|
||||
char *name = NULL;
|
||||
char *mnt = NULL;
|
||||
int nr_inos = 0;
|
||||
uint64_t *inos;
|
||||
uint64_t i;
|
||||
int *fds;
|
||||
int mntfd;
|
||||
int fd;
|
||||
int ret;
|
||||
char c;
|
||||
int j;
|
||||
|
||||
/* can't have more inos than args */
|
||||
inos = calloc(argc, sizeof(inos[0]));
|
||||
fds = calloc(argc, sizeof(fds[0]));
|
||||
if (!inos || !fds) {
|
||||
perror("calloc");
|
||||
exit(1);
|
||||
}
|
||||
for (i = 0; i < argc; i++)
|
||||
fds[i] = -1;
|
||||
|
||||
while ((c = getopt(argc, argv, "+ei:m:n:s:v:")) != -1) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
enoent_success_err = true;
|
||||
break;
|
||||
case 'i':
|
||||
inos[nr_inos] = strtoll(optarg, NULL, 0);
|
||||
nr_inos++;
|
||||
break;
|
||||
case 'm':
|
||||
mnt = strdup(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
name = strdup(optarg);
|
||||
break;
|
||||
case 's':
|
||||
seconds = strtoll(optarg, NULL, 0);
|
||||
break;
|
||||
case 'v':
|
||||
value = strdup(optarg);
|
||||
break;
|
||||
case '?':
|
||||
printf("unknown argument: %c\n", optind);
|
||||
case 'h':
|
||||
exit_usage();
|
||||
}
|
||||
}
|
||||
|
||||
if (nr_inos == 0) {
|
||||
printf("specify non-zero inode number with -i\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!mnt) {
|
||||
printf("specify scoutfs mount path for ioctl with -p\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (name == NULL)
|
||||
name = DEFAULT_NAME;
|
||||
if (value == NULL)
|
||||
value = DEFAULT_VALUE;
|
||||
|
||||
mntfd = open(mnt, O_RDONLY);
|
||||
if (mntfd == -1) {
|
||||
perror("opening mountpoint");
|
||||
return 1;
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
seconds += ts.tv_sec;
|
||||
|
||||
for (i = 0; ; i++) {
|
||||
for (j = 0; j < nr_inos; j++) {
|
||||
fd = fds[j];
|
||||
|
||||
if (fd < 0) {
|
||||
handle.handle.handle_bytes = sizeof(struct our_handle);
|
||||
handle.handle.handle_type = FILEID_SCOUTFS;
|
||||
handle.scoutfs_ino = htole64(inos[j]);
|
||||
|
||||
fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR);
|
||||
if (fd == -1) {
|
||||
if (!enoent_success_err || errno != ENOENT) {
|
||||
perror("open_by_handle_at");
|
||||
return 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fds[j] = fd;
|
||||
}
|
||||
|
||||
ret = fsetxattr(fd, name, value, strlen(value), 0);
|
||||
if (ret < 0) {
|
||||
perror("fsetxattr");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ((i % 10) == 0) {
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
if (ts.tv_sec >= seconds)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (enoent_success_err) {
|
||||
bool able = false;
|
||||
for (i = 0; i < nr_inos; i++) {
|
||||
if (fds[i] >= 0) {
|
||||
printf("was able to open ino %"PRIu64"\n", inos[i]);
|
||||
able = true;
|
||||
}
|
||||
}
|
||||
if (able)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* not bothering to close or free */
|
||||
return 0;
|
||||
}
|
||||
36
tests/tests/basic-bad-mounts.sh
Normal file
36
tests/tests/basic-bad-mounts.sh
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
# Attempt a scoutfs mount that is expected to fail.  All arguments are
# handed to mount(8) and the final argument is taken as the mount point.
# If the mount unexpectedly succeeds it is unmounted and the test fails.
mount_fail()
{
	local mnt=${!#}

	echo "mounting $@" >> $T_TMP.mount.out
	if mount -t scoutfs "$@" >> $T_TMP.mount.out 2>&1; then
		umount "$mnt" || t_fail "couldn't unmount"
		t_fail "bad mount succeeded"
	fi
}
|
||||
|
||||
echo "== prepare devices, mount point, and logs"
|
||||
SCR="/mnt/scoutfs.extra"
|
||||
mkdir -p "$SCR"
|
||||
> $T_TMP.mount.out
|
||||
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|
||||
|| t_fail "mkfs failed"
|
||||
|
||||
echo "== bad devices, bad options"
|
||||
mount_fail -o _bad /dev/null /dev/null "$SCR"
|
||||
|
||||
echo "== swapped devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both meta devices"
|
||||
mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_META_DEV" "$SCR"
|
||||
|
||||
echo "== both data devices"
|
||||
mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
echo "== good volume, bad option and good options"
|
||||
mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
t_pass
|
||||
21
tests/tests/basic-truncate.sh
Normal file
21
tests/tests/basic-truncate.sh
Normal file
@@ -0,0 +1,21 @@
|
||||
#
|
||||
# Test basic correctness of truncate.
|
||||
#
|
||||
|
||||
t_require_commands yes dd od truncate
|
||||
|
||||
FILE="$T_D0/file"
|
||||
|
||||
#
|
||||
# We forgot to write a dirty block that zeroed the tail of a partial
|
||||
# final block as we truncated past it.
|
||||
#
|
||||
echo "== truncate writes zeroed partial end of file block"
|
||||
yes | dd of="$FILE" bs=8K count=1 status=none
|
||||
sync
|
||||
truncate -s 6K "$FILE"
|
||||
truncate -s 12K "$FILE"
|
||||
echo 3 > /proc/sys/vm/drop_caches
|
||||
od -Ad -x "$FILE"
|
||||
|
||||
t_pass
|
||||
136
tests/tests/data-prealloc.sh
Normal file
136
tests/tests/data-prealloc.sh
Normal file
@@ -0,0 +1,136 @@
|
||||
#
|
||||
# test that the data prealloc options behave as expected. We write to
|
||||
# two files a block at a time so that a single file doesn't naturally
|
||||
# merge adjacent consecutive allocations. (we don't have multiple
|
||||
# allocation cursors)
|
||||
#
|
||||
t_require_commands scoutfs stat filefrag dd touch truncate
|
||||
|
||||
# Truncate "$prefix"-1 and "$prefix"-2 to empty and then write $nr 4KiB
# blocks to them in ascending block order, alternating between the two
# files for every block.
write_forwards()
{
	local prefix="$1"
	local nr="$2"
	local blk

	touch "$prefix"-{1,2}
	truncate -s 0 "$prefix"-{1,2}

	for ((blk = 0; blk < nr; blk++)); do
		dd if=/dev/zero of="$prefix"-1 bs=4096 seek=$blk count=1 conv=notrunc status=none
		dd if=/dev/zero of="$prefix"-2 bs=4096 seek=$blk count=1 conv=notrunc status=none
	done
}
|
||||
|
||||
# Truncate "$prefix"-1 and "$prefix"-2 to empty and then write $nr 4KiB
# blocks to them in descending block order, alternating between the two
# files for every block.
write_backwards()
{
	local prefix="$1"
	local nr="$2"
	local blk

	touch "$prefix"-{1,2}
	truncate -s 0 "$prefix"-{1,2}

	for ((blk = nr - 1; blk >= 0; blk--)); do
		dd if=/dev/zero of="$prefix"-1 bs=4096 seek=$blk count=1 conv=notrunc status=none
		dd if=/dev/zero of="$prefix"-2 bs=4096 seek=$blk count=1 conv=notrunc status=none
	done
}
|
||||
|
||||
# Release the whole contents of every file whose path starts with
# $prefix.  The length released is each file's current size from stat,
# so the block count callers pass as the second argument is accepted
# but not needed.
release_files() {
	local prefix="$1"
	local size
	local vers
	local f

	for f in "$prefix"*; do
		size=$(stat -c "%s" "$f")
		vers=$(scoutfs stat -s data_version "$f")
		scoutfs release "$f" -V "$vers" -o 0 -l "$size"
	done
}
|
||||
|
||||
# Stage zeros into every file whose path starts with $prefix, one 4KiB
# block at a time for $nr blocks.  The files are interleaved per block
# and the data version is re-read before every stage call.
stage_files() {
	local prefix="$1"
	local nr="$2"
	local vers
	local blk	# keep the loop index out of the test's global scope
	local f

	for blk in $(seq 0 1 $((nr - 1))); do
		for f in "$prefix"*; do
			vers=$(scoutfs stat -s data_version "$f")
			scoutfs stage /dev/zero "$f" -V "$vers" -o $((blk * 4096)) -l 4096
		done
	done
}
|
||||
|
||||
# Print filefrag's "extents found" summary line for every file whose
# path starts with $prefix, passed through t_filter_fs.
print_extents_found()
{
	local prefix="$1"

	filefrag "$prefix"* 2>&1 |
		grep "extent.*found" |
		t_filter_fs
}
|
||||
|
||||
t_save_all_sysfs_mount_options data_prealloc_blocks
|
||||
t_save_all_sysfs_mount_options data_prealloc_contig_only
|
||||
restore_options()
|
||||
{
|
||||
t_restore_all_sysfs_mount_options data_prealloc_blocks
|
||||
t_restore_all_sysfs_mount_options data_prealloc_contig_only
|
||||
}
|
||||
trap restore_options EXIT
|
||||
|
||||
prefix="$T_D0/file"
|
||||
|
||||
echo "== initial writes smaller than prealloc grow to prealloc size"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 32
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
write_forwards $prefix 64
|
||||
print_extents_found $prefix
|
||||
|
||||
echo "== larger files get full prealloc extents"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 32
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
write_forwards $prefix 128
|
||||
print_extents_found $prefix
|
||||
|
||||
echo "== non-streaming writes with contig have per-block extents"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 32
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
write_backwards $prefix 32
|
||||
print_extents_found $prefix
|
||||
|
||||
echo "== any writes to region prealloc get full extents"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 16
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
write_forwards $prefix 64
|
||||
print_extents_found $prefix
|
||||
write_backwards $prefix 64
|
||||
print_extents_found $prefix
|
||||
|
||||
echo "== streaming offline writes get full extents either way"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 16
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
write_forwards $prefix 64
|
||||
release_files $prefix 64
|
||||
stage_files $prefix 64
|
||||
print_extents_found $prefix
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
release_files $prefix 64
|
||||
stage_files $prefix 64
|
||||
print_extents_found $prefix
|
||||
|
||||
echo "== goofy preallocation amounts work"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 7
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
write_forwards $prefix 14
|
||||
print_extents_found $prefix
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 13
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
write_forwards $prefix 53
|
||||
print_extents_found $prefix
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 1
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
write_forwards $prefix 3
|
||||
print_extents_found $prefix
|
||||
|
||||
t_pass
|
||||
38
tests/tests/fallocate.sh
Normal file
38
tests/tests/fallocate.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
t_require_commands fallocate cat
|
||||
|
||||
echo "== creating reasonably large per-mount files"
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/file-$n"
|
||||
|
||||
LC_ALL=C fallocate -l 128MiB "$path" || \
|
||||
t_fail "initial creating fallocate failed"
|
||||
done
|
||||
|
||||
#
|
||||
# we had lock inversions between read and fallocate, dropping
|
||||
# the cache each time forces waiting for IO during the calls
|
||||
# with the inverted locks held so we have a better chance
|
||||
# of the deadlock happening.
|
||||
#
|
||||
DURATION=10
echo "== ${DURATION}s of racing cold reads and fallocate nop"
END=$((SECONDS + DURATION))
while [ $SECONDS -le $END ]; do

	echo 3 > /proc/sys/vm/drop_caches

	pids=""
	for n in $(t_fs_nrs); do
		eval path="\$T_D${n}/file-$n"

		LC_ALL=C fallocate -o 0 -l 4KiB "$path" &
		pids="$pids $!"
		cat "$path" > /dev/null &
		pids="$pids $!"
	done

	# a bare "wait" always returns 0, each pid has to be waited on
	# individually to see the exit status of its job
	for pid in $pids; do
		wait $pid || t_fail "fallocate or cat failed"
	done
done
|
||||
|
||||
echo "== cleaning up files"
|
||||
rm -f "$T_D0"/file-*
|
||||
|
||||
t_pass
|
||||
@@ -45,6 +45,18 @@ check_read_write()
|
||||
fi
|
||||
}
|
||||
|
||||
# verify that fenced ran our testing fence script
|
||||
# Fail the test unless the fenced log at $T_FENCED_LOG contains a
# "running rid" line with the expected run args for every rid argument.
verify_fenced_run()
{
	local rids="$@"
	local rid

	for rid in $rids; do
		if ! grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG"; then
			t_fail "fenced didn't execute RUN script for rid $rid"
		fi
	done
}
|
||||
|
||||
echo "== make sure all mounts can see each other"
|
||||
check_read_write
|
||||
|
||||
@@ -62,12 +74,14 @@ done
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rid
|
||||
t_mount $cl
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount all non-server, connection timeout, fence nop, mount"
|
||||
sv=$(t_server_nr)
|
||||
pattern="nonsense"
|
||||
rids=""
|
||||
sync
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
@@ -75,6 +89,7 @@ for cl in $(t_fs_nrs); do
|
||||
fi
|
||||
|
||||
rid=$(t_mount_rid $cl)
|
||||
rids="$rids $rid"
|
||||
pattern="$pattern|$rid"
|
||||
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
|
||||
|
||||
@@ -89,6 +104,7 @@ done
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rids
|
||||
# remount all the clients
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
@@ -109,11 +125,17 @@ t_wait_for_leader
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rid
|
||||
t_mount $sv
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount everything, new server fences all previous"
|
||||
sync
|
||||
rids=""
|
||||
# get rids before forced unmount breaks scoutfs statfs
|
||||
for nr in $(t_fs_nrs); do
|
||||
rids="$rids $(t_mount_rid $nr)"
|
||||
done
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_force_umount $nr
|
||||
done
|
||||
@@ -122,6 +144,7 @@ t_mount_all
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
verify_fenced_run $rids
|
||||
check_read_write
|
||||
|
||||
t_pass
|
||||
|
||||
22
tests/tests/large-fragmented-free.sh
Normal file
22
tests/tests/large-fragmented-free.sh
Normal file
@@ -0,0 +1,22 @@
|
||||
#
|
||||
# Make sure the server can handle a transaction with a data_freed whose
|
||||
# blocks all hit different btree blocks in the main free list. It
|
||||
# probably has to be merged in multiple commits.
|
||||
#
|
||||
|
||||
t_require_commands fragmented_data_extents
|
||||
|
||||
EXTENTS_PER_BTREE_BLOCK=600
|
||||
EXTENTS_PER_LIST_BLOCK=8192
|
||||
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
|
||||
|
||||
echo "== creating fragmented extents"
|
||||
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
|
||||
|
||||
echo "== unlink file with moved extents to free extents per block"
|
||||
rm -f "$T_D0/move"
|
||||
|
||||
echo "== cleanup"
|
||||
rm -f "$T_D0/alloc"
|
||||
|
||||
t_pass
|
||||
43
tests/tests/lock-recover-invalidate.sh
Normal file
43
tests/tests/lock-recover-invalidate.sh
Normal file
@@ -0,0 +1,43 @@
|
||||
#
|
||||
# trigger server failover and lock recovery during heavy invalidating
|
||||
# load on multiple mounts
|
||||
#
|
||||
|
||||
majority_nr=$(t_majority_count)
|
||||
quorum_nr=$T_QUORUM
|
||||
|
||||
test "$quorum_nr" == "$majority_nr" && \
|
||||
t_skip "need remaining majority when leader unmounted"
|
||||
|
||||
test "$T_NR_MOUNTS" -lt "$((quorum_nr + 2))" && \
|
||||
t_skip "need at least 2 non-quorum load mounts"
|
||||
|
||||
echo "== starting background invalidating read/write load"
|
||||
touch "$T_D0/file"
|
||||
load_pids=""
|
||||
for i in $(t_fs_nrs); do
|
||||
if [ "$i" -ge "$quorum_nr" ]; then
|
||||
eval path="\$T_D${i}/file"
|
||||
|
||||
(while true; do touch $path > /dev/null 2>&1; done) &
|
||||
load_pids="$load_pids $!"
|
||||
(while true; do stat $path > /dev/null 2>&1; done) &
|
||||
load_pids="$load_pids $!"
|
||||
fi
|
||||
done
|
||||
|
||||
# had it reproduce in ~40s on wimpy debug kernel guests
|
||||
LENGTH=60
|
||||
echo "== ${LENGTH}s of lock recovery during invalidating load"
|
||||
END=$((SECONDS + LENGTH))
|
||||
while [ "$SECONDS" -lt "$END" ]; do
|
||||
sv=$(t_server_nr)
|
||||
t_umount $sv
|
||||
t_mount $sv
|
||||
# new server had to process greeting for mount to finish
|
||||
done
|
||||
|
||||
echo "== stopping background load"
|
||||
kill $load_pids
|
||||
|
||||
t_pass
|
||||
@@ -30,6 +30,13 @@ inode_exists()
|
||||
test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino"
|
||||
}
|
||||
|
||||
t_save_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
restore_delays()
|
||||
{
|
||||
t_restore_all_sysfs_mount_options orphan_scan_delay_ms
|
||||
}
|
||||
trap restore_delays EXIT
|
||||
|
||||
echo "== test our inode existance function"
|
||||
path="$T_D0/file"
|
||||
touch "$path"
|
||||
@@ -38,6 +45,7 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
|
||||
echo "== unlinked and opened inodes still exist"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pid="$!"
|
||||
rm -f "$path"
|
||||
inode_exists $ino || echo "$ino didn't exist"
|
||||
@@ -45,7 +53,8 @@ inode_exists $ino || echo "$ino didn't exist"
|
||||
echo "== orphan from failed evict deletion is picked up"
|
||||
# pending kill signal stops evict from getting locks and deleting
|
||||
silent_kill $pid
|
||||
sleep 55
|
||||
t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000
|
||||
sleep 5
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
|
||||
echo "== orphaned inos in all mounts all deleted"
|
||||
@@ -56,6 +65,7 @@ for nr in $(t_fs_nrs); do
|
||||
touch "$path"
|
||||
inos="$inos $(stat -c %i $path)"
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for background sleep to run and open stdin
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
@@ -70,9 +80,63 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
# wait for orphan scans to run
|
||||
sleep 55
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
|
||||
# also have to wait for delayed log merge work from mount
|
||||
sleep 15
|
||||
for ino in $inos; do
|
||||
inode_exists $ino && echo "$ino still exists"
|
||||
done
|
||||
|
||||
RUNTIME=30
|
||||
echo "== ${RUNTIME}s of racing evict deletion, orphan scanning, and open by handle"
|
||||
|
||||
# exclude last client mount
|
||||
last=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
last=$nr
|
||||
done
|
||||
|
||||
END=$((SECONDS + RUNTIME))
|
||||
while [ $SECONDS -lt $END ]; do
|
||||
# hold open per-mount unlinked files
|
||||
pids=""
|
||||
ino_args=""
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_D${nr}/racing-$nr"
|
||||
touch "$path"
|
||||
ino_args="$ino_args -i $(stat -c %i $path)"
|
||||
|
||||
sleep 1000000 < "$path" &
|
||||
sleep .1 # wait for sleep to start and open input :/
|
||||
pids="$pids $!"
|
||||
rm -f "$path"
|
||||
done
|
||||
|
||||
# remount excluded last client to force log merging and make orphan visible
|
||||
sync
|
||||
t_umount $last
|
||||
t_mount $last
|
||||
|
||||
# get all mounts scanning orphans at high frequency
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms 100
|
||||
|
||||
# spin having tasks in each mount trying to open/fsetxattr all inos
|
||||
for nr in $(t_fs_nrs); do
|
||||
test $nr == $last && continue
|
||||
|
||||
eval path="\$T_M${nr}"
|
||||
handle_fsetxattr -e $ino_args -m "$path" -s 2 &
|
||||
done
|
||||
|
||||
# trigger eviction deletion of each file in each mount
|
||||
silent_kill $pids
|
||||
|
||||
wait || t_fail "handle_fsetxattr failed"
|
||||
|
||||
# slow down orphan scanning for the next iteration
|
||||
t_set_all_sysfs_mount_options orphan_scan_delay_ms $(((RUNTIME * 2) * 1000))
|
||||
done
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -103,4 +103,34 @@ while [ "$nr" -lt 100 ]; do
|
||||
((nr++))
|
||||
done
|
||||
|
||||
#
|
||||
# make sure rapid concurrent metadata updates don't create multiple
|
||||
# meta_seq entries
|
||||
#
|
||||
# we had a bug where deletion items created under concurrent_write locks
|
||||
# could get versions older than the items they're deleting which were
|
||||
# protected by read/write locks.
|
||||
#
|
||||
echo "== concurrent update attempts maintain single entries"
|
||||
FILES=4
|
||||
nr=1
|
||||
while [ "$nr" -lt 10 ]; do
|
||||
# touch a bunch of files in parallel from all mounts
|
||||
for i in $(t_fs_nrs); do
|
||||
eval path="\$T_D${i}"
|
||||
seq -f "$path/file-%.0f" 1 $FILES | xargs touch &
|
||||
done
|
||||
wait || t_fail "concurrent file updates failed"
|
||||
|
||||
# make sure no inodes have duplicate entries
|
||||
sync
|
||||
scoutfs walk-inodes -p "$T_D0" meta_seq -- 0 -1 | \
|
||||
grep -v "minor" | \
|
||||
awk '{print $4}' | \
|
||||
sort -n | uniq -c | \
|
||||
awk '($1 != 1)' | \
|
||||
sort -n
|
||||
((nr++))
|
||||
done
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -36,7 +36,8 @@ test_xattr_lengths() {
|
||||
else
|
||||
echo "$name=\"$val\"" > "$T_TMP.good"
|
||||
fi
|
||||
cmp "$T_TMP.good" "$T_TMP.got" || exit 1
|
||||
cmp "$T_TMP.good" "$T_TMP.got" || \
|
||||
t_fail "cmp failed name len $name_len val len $val_len"
|
||||
|
||||
setfattr -x $name "$FILE"
|
||||
}
|
||||
|
||||
@@ -65,7 +65,6 @@ generic/030 # mmap missing
|
||||
generic/075 # file content mismatch failures (fds, etc)
|
||||
generic/080 # mmap missing
|
||||
generic/103 # enospc causes trans commit failures
|
||||
generic/105 # needs trigage: something about acls
|
||||
generic/108 # mount fails on failing device?
|
||||
generic/112 # file content mismatch failures (fds, etc)
|
||||
generic/120 # (can't exec 'cause no mmap)
|
||||
@@ -73,17 +72,14 @@ generic/126 # (can't exec 'cause no mmap)
|
||||
generic/141 # mmap missing
|
||||
generic/213 # enospc causes trans commit failures
|
||||
generic/215 # mmap missing
|
||||
generic/237 # wrong error return from failing setfacl?
|
||||
generic/246 # mmap missing
|
||||
generic/247 # mmap missing
|
||||
generic/248 # mmap missing
|
||||
generic/319 # utils output change? update branch?
|
||||
generic/321 # requires selinux enabled for '+' in ls?
|
||||
generic/325 # mmap missing
|
||||
generic/338 # BUG_ON update inode error handling
|
||||
generic/346 # mmap missing
|
||||
generic/347 # _dmthin_mount doesn't work?
|
||||
generic/375 # utils output change? update branch?
|
||||
EOF
|
||||
|
||||
t_restore_output
|
||||
|
||||
@@ -55,9 +55,21 @@ test -x "$SCOUTFS_FENCED_RUN" || \
|
||||
error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"
|
||||
|
||||
#
|
||||
# main loop watching for fence request across all filesystems
|
||||
# Main loop watching for fence request across all filesystems. The
|
||||
# server can shut down without waiting for pending fence requests to
|
||||
# finish. All of the interaction with the fence directory and files can
|
||||
# fail at any moment. We will generate log messages when the dir or
|
||||
# files disappear.
|
||||
#
|
||||
|
||||
# generate failure messages to stderr while still echoing 0 for the caller
|
||||
# cat the given files; if cat fails its own error message still goes to
# stderr and "0" is echoed on stdout so callers comparing the output
# always get a value.
careful_cat()
{
	cat "$@" || echo 0
}
|
||||
|
||||
while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
for fence in /sys/fs/scoutfs/*/fence/*; do
|
||||
# catches unmatched regex when no dirs
|
||||
@@ -66,7 +78,8 @@ while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
fi
|
||||
|
||||
# skip requests that have been handled
|
||||
if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then
|
||||
if [ "$(careful_cat $fence/fenced)" == 1 -o \
|
||||
"$(careful_cat $fence/error)" == 1 ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -81,10 +94,10 @@ while sleep $SCOUTFS_FENCED_DELAY; do
|
||||
export SCOUTFS_FENCED_REQ_RID="$rid"
|
||||
export SCOUTFS_FENCED_REQ_IP="$ip"
|
||||
|
||||
$run $SCOUTFS_FENCED_RUN_ARGS
|
||||
$SCOUTFS_FENCED_RUN $SCOUTFS_FENCED_RUN_ARGS
|
||||
rc=$?
|
||||
if [ "$rc" != 0 ]; then
|
||||
log_message "server $srv fencing rid $rid saw error status $rc from $run"
|
||||
log_message "server $srv fencing rid $rid saw error status $rc"
|
||||
echo 1 > "$fence/error"
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -15,12 +15,76 @@ general mount options described in the
|
||||
.BR mount (8)
|
||||
manual page.
|
||||
.TP
|
||||
.B acl
|
||||
The acl mount option enables support for POSIX Access Control Lists
|
||||
as detailed in
|
||||
.BR acl (5) .
|
||||
Support for POSIX ACLs is the default.
|
||||
.TP
|
||||
.B data_prealloc_blocks=<blocks>
|
||||
Set the size of preallocation regions of data files, in 4KiB blocks.
|
||||
Writes to these regions that contain no extents will attempt to
|
||||
preallocate the size of the full region. This can waste a lot of space
|
||||
with small files, files with sparse regions, and files whose final
|
||||
length isn't a multiple of the preallocation size. The following
|
||||
data_prealloc_contig_only option, which is the default, restricts this
|
||||
behaviour to waste less space.
|
||||
.sp
|
||||
All the preallocation options can be changed in an active mount by
|
||||
writing to their respective files in the options directory in the
|
||||
mount's sysfs directory.
|
||||
.sp
|
||||
It is worth noting that it is always more efficient in every way to use
|
||||
.BR fallocate (2)
|
||||
to precisely allocate large extents for the resulting size of the file.
|
||||
Always attempt to enable it in software that supports it.
|
||||
.TP
|
||||
.B data_prealloc_contig_only=<0|1>
|
||||
This option, currently the default, limits file data preallocation in
|
||||
two ways. First, it will only preallocate when extending a fully
|
||||
allocated file. Second, it will limit the size of preallocation to the
|
||||
existing length of the file. These limits reduce the amount of
|
||||
preallocation wasted per file at the cost of multiple initial extents in
|
||||
all files. It only supports simple streaming writes, any other write
|
||||
pattern will not be recognized and could result in many fragmented
|
||||
extent allocations.
|
||||
.sp
|
||||
This option can be disabled to encourage large allocated extents
|
||||
regardless of write patterns. This can be helpful if files are written
|
||||
with initial sparse regions (perhaps by multiple threads writing to
|
||||
different regions) and wasted space isn't an issue (perhaps because the
|
||||
file population contains few small files).
|
||||
.TP
|
||||
.B metadev_path=<device>
|
||||
The metadev_path option specifies the path to the block device that
|
||||
contains the filesystem's metadata.
|
||||
.sp
|
||||
This option is required.
|
||||
.TP
|
||||
.B noacl
|
||||
The noacl mount option disables the default support for POSIX Access
|
||||
Control Lists. Any existing system.posix_acl_default and
|
||||
system.posix_acl_access extended attributes remain in inodes. They
|
||||
will appear in listings from
|
||||
.BR listxattr (2)
|
||||
but specific retrieval or removal operations will fail. They will be
|
||||
used for enforcement again if ACL support is later enabled.
|
||||
.TP
|
||||
.B orphan_scan_delay_ms=<number>
|
||||
This option sets the average expected delay, in milliseconds, between
|
||||
each mount's scan of the global orphaned inode list. Jitter is added to
|
||||
avoid contention so each individual delay between scans is a random
|
||||
value up to 20% less than or greater than this average expected delay.
|
||||
.sp
|
||||
The minimum value for this option is 100ms which is very short and is
|
||||
only reasonable for testing or experiments. The default is 10000ms (10
|
||||
seconds) and the maximum is 60000ms (1 minute).
|
||||
.sp
|
||||
This option can be changed in an active mount by writing to its file in
|
||||
the options directory in the mount's sysfs directory. Writing a new
|
||||
value will cause the next pending orphan scan to be rescheduled
|
||||
with the newly written delay time.
|
||||
.TP
|
||||
.B quorum_slot_nr=<number>
|
||||
The quorum_slot_nr option assigns a quorum member slot to the mount.
|
||||
The mount will use the slot assignment to claim exclusive ownership of
|
||||
|
||||
@@ -15,7 +15,7 @@ environment variable. If that variable is also absent the current working
|
||||
directory will be used.
|
||||
|
||||
.TP
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.BI "change-format-version [-V, --format-version VERS] [-F|--offline] META-DEVICE DATA-DEVICE"
|
||||
.sp
|
||||
Change the format version of an existing file system. The maximum
|
||||
supported version is used by default. A specific version in the range
|
||||
@@ -25,7 +25,7 @@ output of --help.
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-F, --offline META-DEVICE DATA-DEVICE"
|
||||
.B "-F, --offline"
|
||||
Change the format version by writing directly to the metadata and data
|
||||
devices. Like mkfs, this writes directly to the devices without
|
||||
protection and must only be used on completely unmounted devices. The
|
||||
@@ -43,7 +43,7 @@ the super blocks on both devices.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "change-quorum-config {-Q|--quorum-slot} NR,ADDR,PORT [-F|--offline META-DEVICE DATA-DEVICE]"
|
||||
.BI "change-quorum-config {-Q|--quorum-slot NR,ADDR,PORT} [-F|--offline] META-DEVICE"
|
||||
.sp
|
||||
Change the quorum configuration for an existing file system. The new
|
||||
configuration completely replaces the old configuration. Any slots
|
||||
@@ -61,7 +61,7 @@ multiple arguments as described in the
|
||||
.B mkfs
|
||||
command.
|
||||
.TP
|
||||
.B "-F, --offline META-DEVICE"
|
||||
.B "-F, --offline"
|
||||
Perform the change offline by updating the superblock in the metadata
|
||||
device. The command will read the super block and refuse to make the
|
||||
change if it sees any evidence that the metadata device is currently in
|
||||
@@ -597,7 +597,7 @@ format.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "print META-DEVICE"
|
||||
.BI "print [-S|--skip-likely-huge] META-DEVICE"
|
||||
.sp
|
||||
Prints out all of the metadata in the file system. This makes no effort
|
||||
to ensure that the structures are consistent as they're traversed and
|
||||
@@ -607,6 +607,20 @@ output.
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-S, --skip-likely-huge"
|
||||
Skip printing structures that are likely to be very large. The
|
||||
structures that are skipped tend to be global and whose size tends to be
|
||||
related to the size of the volume. Examples of skipped structures include
|
||||
the global fs items, srch files, and metadata and data
|
||||
allocators. Similar structures that are not skipped are related to the
|
||||
number of mounts and are maintained at a relatively reasonable size.
|
||||
These include per-mount log trees, srch files, allocators, and the
|
||||
metadata allocators used by server commits.
|
||||
.sp
|
||||
Skipping the larger structures limits the print output to a relatively
|
||||
constant size rather than being a large multiple of the used metadata
|
||||
space of the volume making the output much more useful for inspection.
|
||||
.TP
|
||||
.B "META-DEVICE"
|
||||
The path to the metadata device for the filesystem whose metadata will be
|
||||
printed. Since this command reads via the host's buffer cache, it may not
|
||||
|
||||
@@ -222,7 +222,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"",
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"Change format version of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ static struct argp_option options[] = {
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"",
|
||||
"META-DEVICE",
|
||||
"Change quorum slots and addresses of an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <ctype.h>
|
||||
#include <uuid/uuid.h>
|
||||
#include <sys/socket.h>
|
||||
@@ -989,9 +990,10 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
|
||||
struct print_args {
|
||||
char *meta_device;
|
||||
bool skip_likely_huge;
|
||||
};
|
||||
|
||||
static int print_volume(int fd)
|
||||
static int print_volume(int fd, struct print_args *args)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct print_recursion_args pa;
|
||||
@@ -1041,23 +1043,26 @@ static int print_volume(int fd)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
for (i = 0; i < array_size(super->meta_alloc); i++) {
|
||||
snprintf(str, sizeof(str), "meta_alloc[%u]", i);
|
||||
err = print_btree(fd, super, str, &super->meta_alloc[i].root,
|
||||
if (!args->skip_likely_huge) {
|
||||
for (i = 0; i < array_size(super->meta_alloc); i++) {
|
||||
snprintf(str, sizeof(str), "meta_alloc[%u]", i);
|
||||
err = print_btree(fd, super, str, &super->meta_alloc[i].root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
err = print_btree(fd, super, "data_alloc", &super->data_alloc.root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
err = print_btree(fd, super, "data_alloc", &super->data_alloc.root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "srch_root", &super->srch_root,
|
||||
print_srch_root_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "logs_root", &super->logs_root,
|
||||
print_log_trees_item, NULL);
|
||||
if (err && !ret)
|
||||
@@ -1065,19 +1070,23 @@ static int print_volume(int fd)
|
||||
|
||||
pa.super = super;
|
||||
pa.fd = fd;
|
||||
err = print_btree_leaf_items(fd, super, &super->srch_root.ref,
|
||||
print_srch_root_files, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
if (!args->skip_likely_huge) {
|
||||
err = print_btree_leaf_items(fd, super, &super->srch_root.ref,
|
||||
print_srch_root_files, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
err = print_btree_leaf_items(fd, super, &super->logs_root.ref,
|
||||
print_log_trees_roots, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "fs_root", &super->fs_root,
|
||||
print_fs_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
if (!args->skip_likely_huge) {
|
||||
err = print_btree(fd, super, "fs_root", &super->fs_root,
|
||||
print_fs_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
out:
|
||||
free(super);
|
||||
@@ -1098,7 +1107,7 @@ static int do_print(struct print_args *args)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = print_volume(fd);
|
||||
ret = print_volume(fd, args);
|
||||
close(fd);
|
||||
return ret;
|
||||
};
|
||||
@@ -1108,6 +1117,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
struct print_args *args = state->input;
|
||||
|
||||
switch (key) {
|
||||
case 'S':
|
||||
args->skip_likely_huge = true;
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
@@ -1125,8 +1137,13 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "skip-likely-huge", 'S', NULL, 0, "Skip large structures to minimize output size"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
NULL,
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEV",
|
||||
"Print metadata structures"
|
||||
|
||||
Reference in New Issue
Block a user