Mirror of https://github.com/versity/scoutfs.git (synced 2026-01-11 14:10:26 +00:00)

Compare commits (96 commits)
- ba819be8f9
- 1b103184ca
- c3890abd7b
- 5ab38bfa48
- e9ad61b444
- 91bbf90f71
- b5630f540d
- 90a4c82363
- f654fa0fda
- 50168a2d2a
- 3c0616524a
- 8d3e6883c6
- 8747dae61c
- fffcf4a9bb
- b552406427
- d812599e6b
- 03ab5cedb6
- 2b94cd6468
- 5507ee5351
- 1600a121d9
- 6daf24ff37
- cd5d9ff3e0
- d94e49eb63
- 1dbe408539
- bf21699ad7
- c7c67a173d
- 0d10189f58
- 6b88f3268e
- 4b2afa61b8
- 222ba2cede
- c7e97eeb1f
- 21c070b42d
- 77fbf92968
- d5c699c3b4
- b56b8e502c
- 5ff372561d
- bdecee5e5d
- a9281b75fa
- 707e1b2d59
- 006f429f72
- d71583bcf5
- bb835b948d
- bcdc4f5423
- 7ceb215c91
- d4d2b0850b
- cf05aefe50
- 9f06065ce7
- d2c2fece2a
- 0e1e55d25b
- 293cee9554
- a7704e0b56
- 819df4be60
- 592e3d471f
- 29160b0bc6
- 11c041d2ea
- 46e8dfe884
- a9beeaf5da
- 205d8ebd4a
- e580f33f82
- d480243c11
- bafecbc604
- 65be4682e3
- e88845d185
- ec50e66fff
- 0e91f9a277
- 69068ae2c0
- 016dac39bf
- e69cf3dec8
- d6c143a639
- 09ae100254
- 50f5077863
- cca4fcb788
- 1d150da3f0
- 28f03d3558
- 4275f6e6e5
- 70a5b6ffe2
- b89ecd47b4
- 4293816764
- f0de59a9a3
- 1f0a08eacb
- dac3f056a5
- af868aad9b
- cf4df0ef9f
- 81aa58253e
- c683ded0e6
- f27431b3ae
- 28c3cee995
- 430960ef3c
- 7006a84d96
- eafb8621da
- 006555d42a
- 8e458f9230
- 32c0dbce09
- 9c9ba651bd
- 14eddb6420
- 597208324d
@@ -1,6 +1,39 @@
Versity ScoutFS Release Notes
=============================

---
v1.19
\
*Jan 30, 2024*

Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.

Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.

---
v1.18
\
*Nov 7, 2023*

Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running
background compaction will resume, bringing the cost of searches back
down.

---
v1.17
\
*Oct 23, 2023*

Add support for EL8 generation kernels.

---
v1.16
\

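The v1.19 entry above describes a new mount option. For orientation only, here is a minimal sketch of how a millisecond timeout option like `log_merge_wait_timeout_ms` is commonly parsed in a Linux filesystem with the kernel's match_token() helpers; the token table, function name, and error handling below are illustrative assumptions and are not taken from the scoutfs sources.

```c
#include <linux/errno.h>
#include <linux/parser.h>
#include <linux/string.h>

/* Illustrative only: not the scoutfs option table. */
enum { Opt_log_merge_wait_timeout_ms, Opt_err };

static const match_table_t demo_tokens = {
	{ Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%u" },
	{ Opt_err, NULL },
};

/* Parse a comma-separated option string into a timeout value. */
static int demo_parse_options(char *options, unsigned int *timeout_ms)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int val;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, demo_tokens, args)) {
		case Opt_log_merge_wait_timeout_ms:
			if (match_int(&args[0], &val) || val < 0)
				return -EINVAL;
			*timeout_ms = val;
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
```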
@@ -31,12 +31,12 @@ TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
all: module

module:
	make $(SCOUTFS_ARGS)
	$(SP) make C=2 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)
	$(MAKE) $(SCOUTFS_ARGS)
	$(SP) $(MAKE) C=2 CF="-D__CHECK_ENDIAN__" $(SCOUTFS_ARGS)


modules_install:
	make $(SCOUTFS_ARGS) modules_install
	$(MAKE) $(SCOUTFS_ARGS) modules_install


%.spec: %.spec.in .FORCE
@@ -50,4 +50,4 @@ dist: scoutfs-kmod.spec
	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-kmod-$(RPM_VERSION)/\1@" scoutfs-kmod.spec

clean:
	make $(SCOUTFS_ARGS) clean
	$(MAKE) $(SCOUTFS_ARGS) clean

@@ -3,16 +3,28 @@
%define kmod_git_hash @@GITHASH@@
%define pkg_date %(date +%%Y%%m%%d)

# Disable the building of the debug package(s).
%define debug_package %{nil}

# take kernel version or default to uname -r
%{!?kversion: %global kversion %(uname -r)}
%global kernel_version %{kversion}

%if 0%{?el7}
%global kernel_source() /usr/src/kernels/%{kernel_version}.$(arch)
%global kernel_release() %{kversion}
%endif
%if 0%{?el8}
%global kernel_source() /usr/src/kernels/%{kernel_version}
%endif

%{!?_release: %global _release 0.%{pkg_date}git%{kmod_git_hash}}

%if 0%{?el7}
Name: %{kmod_name}
%endif
%if 0%{?el8}
Name: kmod-%{kmod_name}
%endif
Summary: %{kmod_name} kernel module
Version: %{kmod_version}
Release: %{_release}%{?dist}
@@ -20,24 +32,30 @@ License: GPLv2
Group: System/Kernel
URL: http://scoutfs.org/

%if 0%{?el7}
BuildRequires: %{kernel_module_package_buildreqs}
BuildRequires: git
%endif
%if 0%{?el8}
BuildRequires: elfutils-libelf-devel
%endif
BuildRequires: kernel-devel-uname-r = %{kernel_version}
BuildRequires: git
BuildRequires: module-init-tools

ExclusiveArch: x86_64

Source: %{kmod_name}-kmod-%{kmod_version}.tar

%if 0%{?el7}
# Build only for standard kernel variant(s); for debug packages, append "debug"
# after "default" (separated by space)
%kernel_module_package default
%endif

# Disable the building of the debug package(s).
%define debug_package %{nil}

%global install_mod_dir extra/%{name}

%global install_mod_dir extra/%{kmod_name}
%if 0%{?el8}
%global flavors_to_build x86_64
%endif

%description
%{kmod_name} - kernel module
@@ -66,7 +84,7 @@ export INSTALL_MOD_DIR=%{install_mod_dir}
mkdir -p %{install_mod_dir}
for flavor in %{flavors_to_build}; do
	export KSRC=%{kernel_source $flavor}
	export KVERSION=%{kernel_release $KSRC}
	export KVERSION=%{kversion}
	install -d $INSTALL_MOD_PATH/lib/modules/$KVERSION/%{install_mod_dir}
	cp $PWD/obj/$flavor/src/scoutfs.ko $INSTALL_MOD_PATH/lib/modules/$KVERSION/%{install_mod_dir}/
done
@@ -74,6 +92,14 @@ done
# mark modules executable so that strip-to-file can strip them
find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;

%if 0%{?el8}
%files
/lib/modules

%post
weak-modules --add-kernel --no-initramfs
depmod -a
%endif

%clean
rm -rf %{buildroot}

@@ -25,6 +25,7 @@ scoutfs-y += \
	inode.o \
	ioctl.o \
	item.o \
	kernelcompat.o \
	lock.o \
	lock_server.o \
	msg.o \

@@ -26,6 +26,16 @@ ifneq (,$(shell grep 'dir_emit_dots' include/linux/fs.h))
ccflags-y += -DKC_DIR_EMIT_DOTS
endif

#
# v3.18-rc2-19-gb5ae6b15bd73
#
# Folds d_materialise_unique into d_splice_alias. Note reversal
# of arguments (Also note Documentation/filesystems/porting.rst)
#
ifneq (,$(shell grep 'd_materialise_unique' include/linux/dcache.h))
ccflags-y += -DKC_D_MATERIALISE_UNIQUE=1
endif

#
# RHEL extended the fop struct so to use it we have to set
# a flag to indicate that the struct is large enough and
@@ -40,6 +50,211 @@ endif
#
# Added user_ns argument to posix_acl_valid
#
ifneq (,$(shell grep 'posix_acl_valid.*user_ns,' include/linux/posix_acl.h))
ifneq (,$(shell grep 'posix_acl_valid.*user_namespace' include/linux/posix_acl.h))
ccflags-y += -DKC_POSIX_ACL_VALID_USER_NS
endif

#
# v5.3-12296-g6d2052d188d9
#
# The RBCOMPUTE function is now passed an extra flag, and should return a bool
# to indicate whether the propagated callback should stop or not.
#
ifneq (,$(shell grep 'static inline bool RBNAME.*_compute_max' include/linux/rbtree_augmented.h))
ccflags-y += -DKC_RB_TREE_AUGMENTED_COMPUTE_MAX
endif

#
# v3.13-25-g37bc15392a23
#
# Renames posix_acl_create to __posix_acl_create and provide some
# new interfaces for creating ACLs
#
ifneq (,$(shell grep '__posix_acl_create' include/linux/posix_acl.h))
ccflags-y += -DKC___POSIX_ACL_CREATE
endif

#
# v4.8-rc1-29-g31051c85b5e2
#
# inode_change_ok() removed - replace with setattr_prepare()
#
ifneq (,$(shell grep 'extern int setattr_prepare' include/linux/fs.h))
ccflags-y += -DKC_SETATTR_PREPARE
endif

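Each check above probes a kernel header with grep and turns the result into a KC_* compile-time flag. As a hedged illustration of how such a flag might be consumed by a compatibility shim, here is one possible wrapper for the setattr_prepare()/inode_change_ok() split detected by KC_SETATTR_PREPARE; the kc_setattr_prepare() name is an assumption for illustration, not necessarily how scoutfs spells it.

```c
/* hypothetical compat shim keyed off the KC_SETATTR_PREPARE flag above */
#ifdef KC_SETATTR_PREPARE
#define kc_setattr_prepare(dentry, attr)	setattr_prepare(dentry, attr)
#else
/* older kernels validated attribute changes with inode_change_ok() */
#define kc_setattr_prepare(dentry, attr)	inode_change_ok((dentry)->d_inode, attr)
#endif
```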
#
# v4.15-rc3-4-gae5e165d855d
#
# linux/iversion.h needs to manually be included for code that
# manipulates this field.
#
ifneq (,$(shell grep -s 'define _LINUX_IVERSION_H' include/linux/iversion.h))
ccflags-y += -DKC_NEED_LINUX_IVERSION_H=1
endif

# v4.11-12447-g104b4e5139fe
#
# Renamed __percpu_counter_add to percpu_counter_add_batch to clarify
# that the __ wasn't less safe, just took an extra parameter.
#
ifneq (,$(shell grep 'percpu_counter_add_batch' include/linux/percpu_counter.h))
ccflags-y += -DKC_PERCPU_COUNTER_ADD_BATCH
endif

#
# v4.11-4550-g7dea19f9ee63
#
# Introduced memalloc_nofs_{save,restore} preferred instead of _noio_.
#
ifneq (,$(shell grep 'memalloc_nofs_save' include/linux/sched/mm.h))
ccflags-y += -DKC_MEMALLOC_NOFS_SAVE
endif

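The block.c hunks later in this comparison switch memalloc_noio_save()/restore() to the nofs variants. As a sketch only, a flag like KC_MEMALLOC_NOFS_SAVE could also back a fallback wrapper on kernels that predate the nofs scope API; note that the diff below substitutes the calls directly rather than going through a wrapper, so the names here are purely illustrative.

```c
/* hypothetical fallback to the older noio allocation scope */
#ifdef KC_MEMALLOC_NOFS_SAVE
#define kc_memalloc_nofs_save()		memalloc_nofs_save()
#define kc_memalloc_nofs_restore(f)	memalloc_nofs_restore(f)
#else
#define kc_memalloc_nofs_save()		memalloc_noio_save()
#define kc_memalloc_nofs_restore(f)	memalloc_noio_restore(f)
#endif
```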
#
# v4.7-12414-g1eff9d322a44
#
# Renamed bi_rw to bi_opf to force old code to catch up. We use it as a
# single switch between old and new bio structures.
#
ifneq (,$(shell grep 'bi_opf' include/linux/blk_types.h))
ccflags-y += -DKC_BIO_BI_OPF
endif

#
# v4.12-rc2-201-g4e4cbee93d56
#
# Moves to bi_status BLK_STS_ API instead of having a mix of error
# end_io args or bi_error.
#
ifneq (,$(shell grep 'bi_status' include/linux/blk_types.h))
ccflags-y += -DKC_BIO_BI_STATUS
endif

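The block.c changes further down rely on helpers such as kc_bio_set_opf() and kc_submit_bio() whose definitions are not part of this diff. One plausible shape for them, switching on the KC_BIO_BI_OPF flag detected above, is sketched here; it is an assumption, not the actual scoutfs kernelcompat code, and error retrieval in the end_io path would switch on KC_BIO_BI_STATUS in the same spirit.

```c
/* hypothetical kc_bio_* helpers keyed off KC_BIO_BI_OPF */
#ifdef KC_BIO_BI_OPF
static inline void kc_bio_set_opf(struct bio *bio, unsigned int opf)
{
	bio->bi_opf = opf;		/* modern single op+flags field */
}
static inline void kc_submit_bio(struct bio *bio)
{
	submit_bio(bio);		/* the op travels in the bio itself */
}
#else
static inline void kc_bio_set_opf(struct bio *bio, unsigned int opf)
{
	bio->bi_rw = opf;		/* old kernels kept flags in bi_rw */
}
static inline void kc_submit_bio(struct bio *bio)
{
	submit_bio(bio->bi_rw, bio);	/* old submit_bio() took rw */
}
#endif
```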
#
# v3.11-8765-ga0b02131c5fc
#
# Remove the old ->shrink() API, ->{scan,count}_objects is preferred.
#
ifneq (,$(shell grep '(*shrink)' include/linux/shrinker.h))
ccflags-y += -DKC_SHRINKER_SHRINK
endif

#
# v3.19-4777-g6bec00352861
#
# backing_dev_info is removed from address_space. Instead we need to use
# inode_to_bdi() inline from <backing-dev.h>.
#
ifneq (,$(shell grep 'struct backing_dev_info.*backing_dev_info' include/linux/fs.h))
ccflags-y += -DKC_LINUX_BACKING_DEV_INFO=1
endif

#
# v4.3-9290-ge409de992e3e
#
# xattr handlers are now passed a struct that contains `flags`
#
ifneq (,$(shell grep 'int...get..const struct xattr_handler.*struct dentry.*dentry,' include/linux/xattr.h))
ccflags-y += -DKC_XATTR_STRUCT_XATTR_HANDLER=1
endif

#
# v4.16-rc1-1-g9b2c45d479d0
#
# kernel_getsockname() and kernel_getpeername dropped addrlen arg
#
ifneq (,$(shell grep 'kernel_getsockname.*,$$' include/linux/net.h))
ccflags-y += -DKC_KERNEL_GETSOCKNAME_ADDRLEN=1
endif

#
# v4.1-rc1-410-geeb1bd5c40ed
#
# Adds a struct net parameter to sock_create_kern
#
ifneq (,$(shell grep 'sock_create_kern.*struct net' include/linux/net.h))
ccflags-y += -DKC_SOCK_CREATE_KERN_NET=1
endif

#
# v3.18-rc6-1619-gc0371da6047a
#
# iov_iter is now part of struct msghdr
#
ifneq (,$(shell grep 'struct iov_iter.*msg_iter' include/linux/socket.h))
ccflags-y += -DKC_MSGHDR_STRUCT_IOV_ITER=1
endif

#
# v4.17-rc6-7-g95582b008388
#
# Kernel has current_time(inode) to uniformly retreive timespec in the right unit
#
ifneq (,$(shell grep 'extern struct timespec64 current_time' include/linux/fs.h))
ccflags-y += -DKC_CURRENT_TIME_INODE=1
endif

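Later hunks in acl.c and data.c replace CURRENT_TIME with current_time(inode) and use a kc_timespec type. A hedged sketch of the kind of fallback the KC_CURRENT_TIME_INODE flag above could drive on kernels without current_time(); the type alias and helper below are assumptions for illustration, not the actual scoutfs shim.

```c
/* hypothetical fallback when the kernel lacks current_time(inode) */
#ifdef KC_CURRENT_TIME_INODE
#define kc_timespec timespec64
#else
#define kc_timespec timespec
static inline struct timespec current_time(struct inode *inode)
{
	return CURRENT_TIME;	/* the old global timestamp macro */
}
#endif
```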
#
# v4.9-12228-g530e9b76ae8f
#
# register_cpu_notifier and family were all removed and to be
# replaced with cpuhp_* API calls.
#
ifneq (,$(shell grep 'define register_hotcpu_notifier' include/linux/cpu.h))
ccflags-y += -DKC_CPU_NOTIFIER
endif

#
# v3.14-rc8-130-gccad2365668f
#
# generic_file_buffered_write is removed, backport it
#
ifneq (,$(shell grep 'extern ssize_t generic_file_buffered_write' include/linux/fs.h))
ccflags-y += -DKC_GENERIC_FILE_BUFFERED_WRITE=1
endif

#
# v5.7-438-g8151b4c8bee4
#
# struct address_space_operations switches away from .readpages to .readahead
#
# RHEL has backported this feature all the way to RHEL8, as part of RHEL_KABI,
# which means we need to detect this very precisely
#
ifneq (,$(shell grep 'readahead.*struct readahead_control' include/linux/fs.h))
ccflags-y += -DKC_FILE_AOPS_READAHEAD
endif

#
# v4.0-rc7-1743-g8436318205b9
#
# .aio_read and .aio_write no longer exist. All reads and writes now use the
# .read_iter and .write_iter methods, or must implement .read and .write (which
# we don't).
#
ifneq (,$(shell grep 'ssize_t.*aio_read' include/linux/fs.h))
ccflags-y += -DKC_LINUX_HAVE_FOP_AIO_READ=1
endif

#
# rhel7 has a custom inode_operations_wrapper struct that is discarded
# entirely in favor of upstream structure since rhel8.
#
ifneq (,$(shell grep 'void.*follow_link.*struct dentry' include/linux/fs.h))
ccflags-y += -DKC_LINUX_HAVE_RHEL_IOPS_WRAPPER=1
endif

ifneq (,$(shell grep 'size_t.*ki_left;' include/linux/aio.h))
ccflags-y += -DKC_LINUX_AIO_KI_LEFT=1
endif

#
# v4.4-rc4-4-g98e9cb5711c6
#
# Introduces a new xattr_handler .name member that can be used to match the
# entire field, instead of just a prefix. For these kernels, we must use
# the new .name field instead.
ifneq (,$(shell grep 'static inline const char .xattr_prefix' include/linux/xattr.h))
ccflags-y += -DKC_XATTR_HANDLER_NAME=1
endif

@@ -69,12 +69,14 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
	char *name;
	int ret;

#ifndef KC___POSIX_ACL_CREATE
	if (!IS_POSIXACL(inode))
		return NULL;

	acl = get_cached_acl(inode, type);
	if (acl != ACL_NOT_CACHED)
		return acl;
#endif

	ret = acl_xattr_name_len(type, &name, NULL);
	if (ret < 0)
@@ -96,9 +98,11 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
		acl = ERR_PTR(ret);
	}

#ifndef KC___POSIX_ACL_CREATE
	/* can set null negative cache */
	if (!IS_ERR(acl))
		set_cached_acl(inode, type, acl);
#endif

	kfree(value);

@@ -112,8 +116,10 @@ struct posix_acl *scoutfs_get_acl(struct inode *inode, int type)
	struct posix_acl *acl;
	int ret;

#ifndef KC___POSIX_ACL_CREATE
	if (!IS_POSIXACL(inode))
		return NULL;
#endif

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
	if (ret < 0) {
@@ -183,13 +189,15 @@ int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
		if (!value) {
			/* can be setting an acl that only affects mode, didn't need xattr */
			inode_inc_iversion(inode);
			inode->i_ctime = CURRENT_TIME;
			inode->i_ctime = current_time(inode);
		}
	}

out:
#ifndef KC___POSIX_ACL_CREATE
	if (!ret)
		set_cached_acl(inode, type, acl);
#endif

	kfree(value);

@@ -218,10 +226,17 @@ int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	return ret;
}

#ifdef KC_XATTR_STRUCT_XATTR_HANDLER
int scoutfs_acl_get_xattr(const struct xattr_handler *handler, struct dentry *dentry,
			  struct inode *inode, const char *name, void *value,
			  size_t size)
{
	int type = handler->flags;
#else
int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value, size_t size,
			  int type)
{
#endif
	struct posix_acl *acl;
	int ret = 0;

@@ -240,9 +255,17 @@ int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value,
	return ret;
}

#ifdef KC_XATTR_STRUCT_XATTR_HANDLER
int scoutfs_acl_set_xattr(const struct xattr_handler *handler, struct dentry *dentry,
			  struct inode *inode, const char *name, const void *value,
			  size_t size, int flags)
{
	int type = handler->flags;
#else
int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *value, size_t size,
			  int flags, int type)
{
#endif
	struct posix_acl *acl = NULL;
	int ret;

@@ -301,7 +324,7 @@ int scoutfs_init_acl_locked(struct inode *inode, struct inode *dir,
		if (ret)
			goto out;
	}
	ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
	ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
	if (ret < 0)
		return ret;
	if (ret > 0)
@@ -345,7 +368,7 @@ int scoutfs_acl_chmod_locked(struct inode *inode, struct iattr *attr,
	if (IS_ERR_OR_NULL(acl))
		return PTR_ERR(acl);

	ret = posix_acl_chmod(&acl, GFP_KERNEL, attr->ia_mode);
	ret = __posix_acl_chmod(&acl, GFP_KERNEL, attr->ia_mode);
	if (ret)
		return ret;


@@ -6,10 +6,19 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
			   struct scoutfs_lock *lock, struct list_head *ind_locks);
#ifdef KC_XATTR_STRUCT_XATTR_HANDLER
int scoutfs_acl_get_xattr(const struct xattr_handler *, struct dentry *dentry,
			  struct inode *inode, const char *name, void *value,
			  size_t size);
int scoutfs_acl_set_xattr(const struct xattr_handler *, struct dentry *dentry,
			  struct inode *inode, const char *name, const void *value,
			  size_t size, int flags);
#else
int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value, size_t size,
			  int type);
int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *value, size_t size,
			  int flags, int type);
#endif
int scoutfs_acl_chmod_locked(struct inode *inode, struct iattr *attr,
			     struct scoutfs_lock *lock, struct list_head *ind_locks);
int scoutfs_init_acl_locked(struct inode *inode, struct inode *dir,

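The two families of prototypes above exist because the VFS calls xattr handlers with different signatures depending on KC_XATTR_STRUCT_XATTR_HANDLER. As a hedged illustration, a handler table typically carries the ACL type in .flags so the struct-based variant can recover it through handler->flags; the table below is illustrative only and is not copied from the scoutfs sources.

```c
/* illustrative only: wiring the prototypes above into a handler table */
static const struct xattr_handler demo_acl_access_handler = {
#ifdef KC_XATTR_HANDLER_NAME
	.name	= XATTR_NAME_POSIX_ACL_ACCESS,	/* full-name match */
#else
	.prefix	= XATTR_NAME_POSIX_ACL_ACCESS,	/* prefix match on old kernels */
#endif
	.flags	= ACL_TYPE_ACCESS,		/* recovered as handler->flags */
	.get	= scoutfs_acl_get_xattr,
	.set	= scoutfs_acl_set_xattr,
};
```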
kmod/src/block.c (104 changed lines)

@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#include "format.h"
#include "super.h"
@@ -30,6 +31,7 @@
#include "scoutfs_trace.h"
#include "alloc.h"
#include "triggers.h"
#include "util.h"

/*
 * The scoutfs block cache manages metadata blocks that can be larger
@@ -57,7 +59,7 @@ struct block_info {
	atomic64_t access_counter;
	struct rhashtable ht;
	wait_queue_head_t waitq;
	struct shrinker shrinker;
	KC_DEFINE_SHRINKER(shrinker);
	struct work_struct free_work;
	struct llist_head free_llist;
};
@@ -128,7 +130,7 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
{
	struct block_private *bp;
	unsigned int noio_flags;
	unsigned int nofs_flags;

	/*
	 * If we had multiple blocks per page we'd need to be a little
@@ -156,9 +158,9 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
	 * spurious reclaim-on dependencies and warnings.
	 */
	lockdep_off();
	noio_flags = memalloc_noio_save();
	nofs_flags = memalloc_nofs_save();
	bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
	memalloc_noio_restore(noio_flags);
	memalloc_nofs_restore(nofs_flags);
	lockdep_on();

	if (!bp->virt) {
@@ -436,11 +438,10 @@ static void block_remove_all(struct super_block *sb)
 * possible. Final freeing, verifying checksums, and unlinking errored
 * blocks are all done by future users of the blocks.
 */
static void block_end_io(struct super_block *sb, int rw,
static void block_end_io(struct super_block *sb, unsigned int opf,
			 struct block_private *bp, int err)
{
	DECLARE_BLOCK_INFO(sb, binf);
	bool is_read = !(rw & WRITE);

	if (err) {
		scoutfs_inc_counter(sb, block_cache_end_io_error);
@@ -450,7 +451,7 @@ static void block_end_io(struct super_block *sb, int rw,
	if (!atomic_dec_and_test(&bp->io_count))
		return;

	if (is_read && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
	if (!op_is_write(opf) && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
		set_bit(BLOCK_BIT_UPTODATE, &bp->bits);

	clear_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
@@ -463,13 +464,13 @@
	wake_up(&binf->waitq);
}

static void block_bio_end_io(struct bio *bio, int err)
static void KC_DECLARE_BIO_END_IO(block_bio_end_io, struct bio *bio)
{
	struct block_private *bp = bio->bi_private;
	struct super_block *sb = bp->sb;

	TRACE_BLOCK(end_io, bp);
	block_end_io(sb, bio->bi_rw, bp, err);
	block_end_io(sb, kc_bio_get_opf(bio), bp, kc_bio_get_errno(bio));
	bio_put(bio);
}

@@ -477,7 +478,7 @@ static void block_bio_end_io(struct bio *bio, int err)
 * Kick off IO for a single block.
 */
static int block_submit_bio(struct super_block *sb, struct block_private *bp,
			    int rw)
			    unsigned int opf)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct bio *bio = NULL;
@@ -510,8 +511,9 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
			break;
		}

		bio->bi_sector = sector + (off >> 9);
		bio->bi_bdev = sbi->meta_bdev;
		kc_bio_set_opf(bio, opf);
		kc_bio_set_sector(bio, sector + (off >> 9));
		bio_set_dev(bio, sbi->meta_bdev);
		bio->bi_end_io = block_bio_end_io;
		bio->bi_private = bp;

@@ -528,18 +530,18 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
			BUG();

		if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
			submit_bio(rw, bio);
			kc_submit_bio(bio);
			bio = NULL;
		}
	}

	if (bio)
		submit_bio(rw, bio);
		kc_submit_bio(bio);

	blk_finish_plug(&plug);

	/* let racing end_io know we're done */
	block_end_io(sb, rw, bp, ret);
	block_end_io(sb, opf, bp, ret);

	return ret;
}
@@ -640,7 +642,7 @@ static struct block_private *block_read(struct super_block *sb, u64 blkno)

	if (!test_bit(BLOCK_BIT_UPTODATE, &bp->bits) &&
	    test_and_clear_bit(BLOCK_BIT_NEW, &bp->bits)) {
		ret = block_submit_bio(sb, bp, READ);
		ret = block_submit_bio(sb, bp, REQ_OP_READ);
		if (ret < 0)
			goto out;
	}
@@ -969,7 +971,7 @@ int scoutfs_block_writer_write(struct super_block *sb,
		/* retry previous write errors */
		clear_bit(BLOCK_BIT_ERROR, &bp->bits);

		ret = block_submit_bio(sb, bp, WRITE);
		ret = block_submit_bio(sb, bp, REQ_OP_WRITE);
		if (ret < 0)
			break;
	}
@@ -1069,6 +1071,16 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
}

static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_control *sc)
{
	struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
	struct super_block *sb = binf->sb;

	scoutfs_inc_counter(sb, block_cache_count_objects);

	return shrinker_min_long(atomic_read(&binf->total_inserted));
}

/*
 * Remove a number of cached blocks that haven't been used recently.
 *
@@ -1089,24 +1101,18 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
 * atomically remove blocks when the only references are ours and the
 * hash table.
 */
static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
	struct block_info *binf = container_of(shrink, struct block_info,
					       shrinker);
	struct block_info *binf = KC_SHRINKER_CONTAINER_OF(shrink, struct block_info);
	struct super_block *sb = binf->sb;
	struct rhashtable_iter iter;
	struct block_private *bp;
	bool stop = false;
	unsigned long nr;
	unsigned long freed = 0;
	unsigned long nr = sc->nr_to_scan;
	u64 recently;

	nr = sc->nr_to_scan;
	if (nr == 0)
		goto out;

	scoutfs_inc_counter(sb, block_cache_shrink);

	nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);
	scoutfs_inc_counter(sb, block_cache_scan_objects);

	recently = accessed_recently(binf);
	rhashtable_walk_enter(&binf->ht, &iter);
@@ -1151,6 +1157,7 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
		if (block_remove_solo(sb, bp)) {
			scoutfs_inc_counter(sb, block_cache_shrink_remove);
			TRACE_BLOCK(shrink, bp);
			freed++;
			nr--;
		}
		block_put(sb, bp);
@@ -1159,12 +1166,11 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
out:

	if (stop)
		return -1;
		return SHRINK_STOP;
	else
		return min_t(u64, INT_MAX,
			     (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER);
	return freed;
}

struct sm_block_completion {
@@ -1172,11 +1178,11 @@ struct sm_block_completion {
	int err;
};

static void sm_block_bio_end_io(struct bio *bio, int err)
static void KC_DECLARE_BIO_END_IO(sm_block_bio_end_io, struct bio *bio)
{
	struct sm_block_completion *sbc = bio->bi_private;

	sbc->err = err;
	sbc->err = kc_bio_get_errno(bio);
	complete(&sbc->comp);
	bio_put(bio);
}
@@ -1191,9 +1197,8 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
		       struct scoutfs_block_header *hdr, size_t len,
		       __le32 *blk_crc)
static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsigned int opf,
		       u64 blkno, struct scoutfs_block_header *hdr, size_t len, __le32 *blk_crc)
{
	struct scoutfs_block_header *pg_hdr;
	struct sm_block_completion sbc;
@@ -1207,7 +1212,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
		return -EIO;

	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
	    WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
		return -EINVAL;

	page = alloc_page(GFP_NOFS);
@@ -1216,7 +1221,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw

	pg_hdr = page_address(page);

	if (rw & WRITE) {
	if (op_is_write(opf)) {
		memcpy(pg_hdr, hdr, len);
		if (len < SCOUTFS_BLOCK_SM_SIZE)
			memset((char *)pg_hdr + len, 0,
@@ -1230,8 +1235,9 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
		goto out;
	}

	bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
	bio->bi_bdev = bdev;
	kc_bio_set_opf(bio, opf | REQ_SYNC);
	kc_bio_set_sector(bio, blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9));
	bio_set_dev(bio, bdev);
	bio->bi_end_io = sm_block_bio_end_io;
	bio->bi_private = &sbc;
	bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
@@ -1239,12 +1245,12 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
	init_completion(&sbc.comp);
	sbc.err = 0;

	submit_bio((rw & WRITE) ? WRITE_SYNC : READ_SYNC, bio);
	kc_submit_bio(bio);

	wait_for_completion(&sbc.comp);
	ret = sbc.err;

	if (ret == 0 && !(rw & WRITE)) {
	if (ret == 0 && !op_is_write(opf)) {
		memcpy(hdr, pg_hdr, len);
		*blk_crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
	}
@@ -1258,14 +1264,14 @@ int scoutfs_block_read_sm(struct super_block *sb,
			  struct scoutfs_block_header *hdr, size_t len,
			  __le32 *blk_crc)
{
	return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
	return sm_block_io(sb, bdev, REQ_OP_READ, blkno, hdr, len, blk_crc);
}

int scoutfs_block_write_sm(struct super_block *sb,
			   struct block_device *bdev, u64 blkno,
			   struct scoutfs_block_header *hdr, size_t len)
{
	return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
	return sm_block_io(sb, bdev, REQ_OP_WRITE, blkno, hdr, len, NULL);
}

int scoutfs_block_setup(struct super_block *sb)
@@ -1290,9 +1296,9 @@ int scoutfs_block_setup(struct super_block *sb)
	atomic_set(&binf->total_inserted, 0);
	atomic64_set(&binf->access_counter, 0);
	init_waitqueue_head(&binf->waitq);
	binf->shrinker.shrink = block_shrink;
	binf->shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&binf->shrinker);
	KC_INIT_SHRINKER_FUNCS(&binf->shrinker, block_count_objects,
			       block_scan_objects);
	KC_REGISTER_SHRINKER(&binf->shrinker);
	INIT_WORK(&binf->free_work, block_free_work);
	init_llist_head(&binf->free_llist);

@@ -1312,7 +1318,7 @@ void scoutfs_block_destroy(struct super_block *sb)
	struct block_info *binf = SCOUTFS_SB(sb)->block_info;

	if (binf) {
		unregister_shrinker(&binf->shrinker);
		KC_UNREGISTER_SHRINKER(&binf->shrinker);
		block_remove_all(sb);
		flush_work(&binf->free_work);
		rhashtable_destroy(&binf->ht);

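The block cache above now registers separate count and scan callbacks through KC_* shrinker macros whose definitions are not shown in this comparison. For orientation, here is one plausible expansion on kernels that already have the count/scan API; this is a sketch only, the KC_SHRINKER_SHRINK case would instead adapt these callbacks to the old single ->shrink() hook, and newer kernels also add a name argument to register_shrinker().

```c
/* hypothetical expansions of the KC_ shrinker helpers used above */
#define KC_DEFINE_SHRINKER(name)	struct shrinker name

#define KC_INIT_SHRINKER_FUNCS(shrink, count_fn, scan_fn) do {	\
	(shrink)->count_objects = (count_fn);			\
	(shrink)->scan_objects  = (scan_fn);			\
	(shrink)->seeks = DEFAULT_SEEKS;			\
} while (0)

#define KC_REGISTER_SHRINKER(shrink)	register_shrinker(shrink)
#define KC_UNREGISTER_SHRINKER(shrink)	unregister_shrinker(shrink)

#define KC_SHRINKER_CONTAINER_OF(ptr, type) \
	container_of(ptr, type, shrinker)
```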
kmod/src/btree.c (439 changed lines)
@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
|
||||
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
struct merge_pos {
|
||||
struct merged_range {
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
struct rb_root root;
|
||||
int size;
|
||||
};
|
||||
|
||||
struct merged_item {
|
||||
struct rb_node node;
|
||||
struct scoutfs_btree_root *root;
|
||||
struct scoutfs_block *bl;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *avl;
|
||||
struct scoutfs_key *key;
|
||||
struct scoutfs_key key;
|
||||
u64 seq;
|
||||
u8 flags;
|
||||
unsigned int val_len;
|
||||
u8 *val;
|
||||
u8 val[0];
|
||||
};
|
||||
|
||||
static struct merge_pos *first_mpos(struct rb_root *root)
|
||||
static inline struct merged_item *mitem_container(struct rb_node *node)
|
||||
{
|
||||
struct rb_node *node = rb_first(root);
|
||||
if (node)
|
||||
return container_of(node, struct merge_pos, node);
|
||||
return node ? container_of(node, struct merged_item, node) : NULL;
|
||||
}
|
||||
|
||||
static inline struct merged_item *first_mitem(struct rb_root *root)
|
||||
{
|
||||
return mitem_container(rb_first(root));
|
||||
}
|
||||
|
||||
static inline struct merged_item *last_mitem(struct rb_root *root)
|
||||
{
|
||||
return mitem_container(rb_last(root));
|
||||
}
|
||||
|
||||
static inline struct merged_item *next_mitem(struct merged_item *mitem)
|
||||
{
|
||||
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
|
||||
}
|
||||
|
||||
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
|
||||
{
|
||||
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
|
||||
}
|
||||
|
||||
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
|
||||
struct rb_node **parent_ret, struct rb_node ***link_ret)
|
||||
{
|
||||
struct rb_node **node = &root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct merged_item *mitem;
|
||||
int cmp;
|
||||
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
mitem = container_of(*node, struct merged_item, node);
|
||||
|
||||
cmp = scoutfs_key_compare(key, &mitem->key);
|
||||
|
||||
if (cmp < 0) {
|
||||
node = &(*node)->rb_left;
|
||||
} else if (cmp > 0) {
|
||||
node = &(*node)->rb_right;
|
||||
} else {
|
||||
*parent_ret = NULL;
|
||||
*link_ret = NULL;
|
||||
return mitem;
|
||||
}
|
||||
}
|
||||
|
||||
*parent_ret = parent;
|
||||
*link_ret = node;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct merge_pos *next_mpos(struct merge_pos *mpos)
|
||||
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
|
||||
struct rb_node *parent, struct rb_node **link)
|
||||
{
|
||||
struct rb_node *node;
|
||||
|
||||
if (mpos && (node = rb_next(&mpos->node)))
|
||||
return container_of(node, struct merge_pos, node);
|
||||
else
|
||||
return NULL;
|
||||
rb_link_node(&mitem->node, parent, link);
|
||||
rb_insert_color(&mitem->node, &rng->root);
|
||||
rng->size += item_len_bytes(mitem->val_len);
|
||||
}
|
||||
|
||||
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
|
||||
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
|
||||
struct merged_item *new)
|
||||
{
|
||||
scoutfs_block_put(sb, mpos->bl);
|
||||
kfree(mpos);
|
||||
rb_replace_node(&victim->node, &new->node, &rng->root);
|
||||
RB_CLEAR_NODE(&victim->node);
|
||||
rng->size -= item_len_bytes(victim->val_len);
|
||||
rng->size += item_len_bytes(new->val_len);
|
||||
}
|
||||
|
||||
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
|
||||
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
|
||||
{
|
||||
struct rb_node **node = &pos_root->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct merge_pos *mpos;
|
||||
int cmp;
|
||||
if (IS_ERR_OR_NULL(mitem))
|
||||
return;
|
||||
|
||||
parent = NULL;
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
mpos = container_of(*node, struct merge_pos, node);
|
||||
|
||||
/* sort merge items by key then newest to oldest */
|
||||
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
|
||||
-scoutfs_cmp(ins->seq, mpos->seq);
|
||||
|
||||
if (cmp < 0)
|
||||
node = &(*node)->rb_left;
|
||||
else
|
||||
node = &(*node)->rb_right;
|
||||
if (!RB_EMPTY_NODE(&mitem->node)) {
|
||||
rng->size -= item_len_bytes(mitem->val_len);
|
||||
rb_erase(&mitem->node, &rng->root);
|
||||
}
|
||||
|
||||
rb_link_node(&ins->node, parent, node);
|
||||
rb_insert_color(&ins->node, pos_root);
|
||||
kfree(mitem);
|
||||
}
|
||||
|
||||
static void trim_range_size(struct merged_range *rng, int merge_window)
|
||||
{
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
|
||||
mitem = last_mitem(&rng->root);
|
||||
while (mitem && rng->size > merge_window) {
|
||||
|
||||
rng->end = mitem->key;
|
||||
scoutfs_key_dec(&rng->end);
|
||||
|
||||
tmp = mitem;
|
||||
mitem = prev_mitem(mitem);
|
||||
free_mitem(rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
static void trim_range_end(struct merged_range *rng)
|
||||
{
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
|
||||
mitem = last_mitem(&rng->root);
|
||||
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
|
||||
tmp = mitem;
|
||||
mitem = prev_mitem(mitem);
|
||||
free_mitem(rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next item in the merge_pos root in the caller's range and
|
||||
* insert it into the rbtree sorted by key and version so that merging
|
||||
* can find the next newest item at the front of the rbtree. We free
|
||||
* the mpos on error or if there are no more items in the range.
|
||||
* Record and combine logged items from log roots for merging with the
|
||||
* writable destination root. The caller is responsible for trimming
|
||||
* the range if it gets too large or if the key range shrinks.
|
||||
*/
|
||||
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
|
||||
struct scoutfs_key *start, struct scoutfs_key *end)
|
||||
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
|
||||
void *val, int val_len, void *arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_avl_node *next;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_key walk_key;
|
||||
int ret = 0;
|
||||
struct merged_range *rng = arg;
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *found;
|
||||
struct rb_node *parent;
|
||||
struct rb_node **link;
|
||||
int ret;
|
||||
|
||||
/* always erase before freeing or inserting */
|
||||
if (!RB_EMPTY_NODE(&mpos->node)) {
|
||||
rb_erase(&mpos->node, pos_root);
|
||||
RB_CLEAR_NODE(&mpos->node);
|
||||
}
|
||||
|
||||
/*
|
||||
* advance to next item via the avl tree. The caller's pos is
|
||||
* only ever incremented past the last key so we can use next to
|
||||
* iterate rather than using search to skip past multiple items.
|
||||
*/
|
||||
if (mpos->avl)
|
||||
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
|
||||
|
||||
/* find the next leaf with the key if we run out of items */
|
||||
walk_key = *start;
|
||||
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
|
||||
scoutfs_block_put(sb, mpos->bl);
|
||||
mpos->bl = NULL;
|
||||
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
|
||||
0, &mpos->bl, &kr, NULL);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
free_mpos(sb, mpos);
|
||||
found = find_mitem(&rng->root, key, &parent, &link);
|
||||
if (found) {
|
||||
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
if (ret > 0) {
|
||||
if (ret == SCOUTFS_DELTA_COMBINED) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_combined);
|
||||
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
free_mitem(rng, found);
|
||||
}
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
mpos->bt = mpos->bl->data;
|
||||
|
||||
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
|
||||
start, NULL, NULL, &next, NULL) ?: next;
|
||||
if (mpos->avl == NULL)
|
||||
walk_key = kr.iter_next;
|
||||
if (found->seq >= seq) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* see if we're out of items within the range */
|
||||
item = node_item(mpos->avl);
|
||||
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
|
||||
free_mpos(sb, mpos);
|
||||
ret = 0;
|
||||
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
|
||||
if (!mitem) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* insert the next item within range at its version */
|
||||
mpos->key = item_key(item);
|
||||
mpos->seq = le64_to_cpu(item->seq);
|
||||
mpos->flags = item->flags;
|
||||
mpos->val_len = item_val_len(item);
|
||||
mpos->val = item_val(mpos->bt, item);
|
||||
mitem->key = *key;
|
||||
mitem->seq = seq;
|
||||
mitem->flags = flags;
|
||||
mitem->val_len = val_len;
|
||||
if (val_len)
|
||||
memcpy(mitem->val, val, val_len);
|
||||
|
||||
if (found) {
|
||||
replace_mitem(rng, found, mitem);
|
||||
free_mitem(rng, found);
|
||||
} else {
|
||||
insert_mitem(rng, mitem, parent, link);
|
||||
}
|
||||
|
||||
insert_mpos(pos_root, mpos);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller has reset all the merge positions for all the input log
|
||||
* btree roots and wants the next logged item it should try and merge
|
||||
* with the items in the fs_root.
|
||||
* Read a range of merged items. The caller has set the key bounds of
|
||||
* the range. We read a merge window's worth of items from blocks in
|
||||
* each input btree.
|
||||
*
|
||||
* We look ahead in the logged item stream to see if we should merge any
|
||||
* older logged delta items into one result for the caller. We also
|
||||
* take this opportunity to skip and reset the mpos for any older
|
||||
* versions of the first item.
|
||||
* The caller can only use the smallest range that overlaps with all the
|
||||
* blocks that we read. We start reading from the range's start key so
|
||||
* it will always be present and we don't need to adjust it. The final
|
||||
* block we read from each input might not cover the range's end so it
|
||||
* needs to be adjusted.
|
||||
*
|
||||
* The end range can also shrink if we have to drop items because the
|
||||
* items exceeded the merge window size.
|
||||
*/
|
||||
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
|
||||
struct scoutfs_key *end, struct merge_pos **mpos_ret)
|
||||
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
|
||||
struct list_head *inputs, int merge_window)
|
||||
{
|
||||
struct merge_pos *mpos;
|
||||
struct merge_pos *next;
|
||||
struct scoutfs_btree_root_head *rhead;
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
|
||||
!scoutfs_key_compare(mpos->key, next->key)) {
|
||||
list_for_each_entry(rhead, inputs, head) {
|
||||
key = rng->start;
|
||||
|
||||
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
|
||||
next->val, next->val_len);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* reset advances to the next item */
|
||||
key = *mpos->key;
|
||||
scoutfs_key_inc(&key);
|
||||
|
||||
/* always skip next combined or older version */
|
||||
ret = reset_mpos(sb, pos_root, next, &key, end);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (ret == SCOUTFS_DELTA_COMBINED) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_combined);
|
||||
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
/* if merging resulted in no info, skip current */
|
||||
ret = reset_mpos(sb, pos_root, mpos, &key, end);
|
||||
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
|
||||
start = key;
|
||||
end = rng->end;
|
||||
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
|
||||
merge_read_item, rng);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_key_compare(&end, &rng->end) >= 0)
|
||||
break;
|
||||
|
||||
key = end;
|
||||
scoutfs_key_inc(&key);
|
||||
}
|
||||
|
||||
if (scoutfs_key_compare(&end, &rng->end) < 0) {
|
||||
rng->end = end;
|
||||
trim_range_end(rng);
|
||||
}
|
||||
|
||||
if (rng->size > merge_window)
|
||||
trim_range_size(rng, merge_window);
|
||||
}
|
||||
|
||||
*mpos_ret = mpos;
|
||||
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
 * to allocators running low or needing to join/split the parent.
 * *next_ret is set to the next key which hasn't been merged so that the
 * caller can retry with a new allocator and subtree.
 *
 * The number of input roots can be immense. The merge_window specifies
 * the size of the set of merged items that we'll maintain as we iterate
 * over all the input roots. Once we've merged items into the window
 * from all the input roots the merged input items are then merged to
 * the writable destination root. It may take multiple passes of
 * windows of merged items to cover the input key range.
 */
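Since read_merged_range() pulls items from each input root in SCOUTFS_BLOCK_LG_SIZE steps, the merge window is a byte budget and is naturally a small multiple of the large metadata block size; a caller would pass it as the new final argument to scoutfs_btree_merge(). The multiplier below is an assumed, illustrative value, not taken from the scoutfs server code.

```c
/* illustrative only: size the merged-item window in large-block units */
int merge_window = 16 * SCOUTFS_BLOCK_LG_SIZE;
```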
int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_key *next_ret,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct list_head *inputs,
|
||||
bool subtree, int dirty_limit, int alloc_low)
|
||||
bool subtree, int dirty_limit, int alloc_low, int merge_window)
|
||||
{
|
||||
struct scoutfs_btree_root_head *rhead;
|
||||
struct rb_root pos_root = RB_ROOT;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_avl_node *par;
|
||||
struct scoutfs_key next;
|
||||
struct merge_pos *mpos;
|
||||
struct merge_pos *tmp;
|
||||
struct merged_item *mitem;
|
||||
struct merged_item *tmp;
|
||||
struct merged_range rng;
|
||||
int walk_val_len;
|
||||
int walk_flags;
|
||||
bool is_del;
|
||||
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
trace_scoutfs_btree_merge(sb, root, start, end);
|
||||
scoutfs_inc_counter(sb, btree_merge);
|
||||
|
||||
list_for_each_entry(rhead, inputs, head) {
|
||||
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
|
||||
if (!mpos) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
RB_CLEAR_NODE(&mpos->node);
|
||||
mpos->root = &rhead->root;
|
||||
|
||||
ret = reset_mpos(sb, &pos_root, mpos, start, end);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
walk_flags = BTW_DIRTY;
|
||||
if (subtree)
|
||||
walk_flags |= BTW_SUBTREE;
|
||||
walk_val_len = 0;
|
||||
|
||||
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
|
||||
rng.start = *start;
|
||||
rng.end = *end;
|
||||
rng.root = RB_ROOT;
|
||||
rng.size = 0;
|
||||
|
||||
ret = read_merged_range(sb, &rng, inputs, merge_window);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
for (;;) {
|
||||
/* read next window as it empties (and it is possible to read an empty range) */
|
||||
mitem = first_mitem(&rng.root);
|
||||
if (!mitem) {
|
||||
/* done if the read range hit the end */
|
||||
if (scoutfs_key_compare(&rng.end, end) >= 0)
|
||||
break;
|
||||
|
||||
/* read next batch of merged items */
|
||||
rng.start = rng.end;
|
||||
scoutfs_key_inc(&rng.start);
|
||||
rng.end = *end;
|
||||
ret = read_merged_range(sb, &rng, inputs, merge_window);
|
||||
if (ret < 0)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
|
||||
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
|
||||
ret = -ERANGE;
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
|
||||
scoutfs_inc_counter(sb, btree_merge_alloc_low);
|
||||
ret = -ERANGE;
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
bl = NULL;
|
||||
ret = btree_walk(sb, alloc, wri, root, walk_flags,
|
||||
mpos->key, walk_val_len, &bl, &kr, NULL);
|
||||
&mitem->key, walk_val_len, &bl, &kr, NULL);
|
||||
if (ret < 0) {
|
||||
if (ret == -ERANGE)
|
||||
*next_ret = *mpos->key;
|
||||
*next_ret = mitem->key;
|
||||
goto out;
|
||||
}
|
||||
bt = bl->data;
|
||||
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
continue;
|
||||
}
|
||||
|
||||
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
|
||||
|
||||
while (mitem) {
|
||||
/* walk to new leaf if we exceed parent ref key */
|
||||
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
|
||||
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
|
||||
break;
|
||||
|
||||
/* see if there's an existing item */
|
||||
item = leaf_item_hash_search(sb, bt, mpos->key);
|
||||
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
|
||||
item = leaf_item_hash_search(sb, bt, &mitem->key);
|
||||
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
|
||||
|
||||
/* see if we're merging delta items */
|
||||
if (item && !is_del)
|
||||
delta = scoutfs_forest_combine_deltas(mpos->key,
|
||||
delta = scoutfs_forest_combine_deltas(&mitem->key,
|
||||
item_val(bt, item),
|
||||
item_val_len(item),
|
||||
mpos->val, mpos->val_len);
|
||||
mitem->val, mitem->val_len);
|
||||
else
|
||||
delta = 0;
|
||||
if (delta < 0) {
|
||||
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
scoutfs_inc_counter(sb, btree_merge_delta_null);
|
||||
}
|
||||
|
||||
trace_scoutfs_btree_merge_items(sb, mpos->root,
|
||||
mpos->key, mpos->val_len,
|
||||
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
|
||||
item ? root : NULL,
|
||||
item ? item_key(item) : NULL,
|
||||
item ? item_val_len(item) : 0, is_del);
|
||||
|
||||
/* rewalk and split if ins/update needs room */
|
||||
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
|
||||
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
|
||||
walk_flags |= BTW_INSERT;
|
||||
walk_val_len = mpos->val_len;
|
||||
walk_val_len = mitem->val_len;
|
||||
break;
|
||||
}
|
||||
|
||||
/* insert missing non-deletion merge items */
|
||||
if (!item && !is_del) {
|
||||
scoutfs_avl_search(&bt->item_root,
|
||||
cmp_key_item, mpos->key,
|
||||
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
|
||||
&cmp, &par, NULL, NULL);
|
||||
create_item(bt, mpos->key, mpos->seq, mpos->flags,
|
||||
mpos->val, mpos->val_len, par, cmp);
|
||||
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
|
||||
mitem->val, mitem->val_len, par, cmp);
|
||||
scoutfs_inc_counter(sb, btree_merge_insert);
|
||||
}
|
||||
|
||||
/* update existing items */
|
||||
if (item && !is_del && !delta) {
|
||||
item->seq = cpu_to_le64(mpos->seq);
|
||||
item->flags = mpos->flags;
|
||||
update_item_value(bt, item, mpos->val, mpos->val_len);
|
||||
item->seq = cpu_to_le64(mitem->seq);
|
||||
item->flags = mitem->flags;
|
||||
update_item_value(bt, item, mitem->val, mitem->val_len);
|
||||
scoutfs_inc_counter(sb, btree_merge_update);
|
||||
}
|
||||
|
||||
/* update combined delta item seq */
|
||||
if (delta == SCOUTFS_DELTA_COMBINED) {
|
||||
item->seq = cpu_to_le64(mpos->seq);
|
||||
item->seq = cpu_to_le64(mitem->seq);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
|
||||
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
|
||||
walk_val_len = 0;
|
||||
|
||||
/* finished with this key, skip any older items */
|
||||
next = *mpos->key;
|
||||
scoutfs_key_inc(&next);
|
||||
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/* finished with this merged item */
|
||||
tmp = mitem;
|
||||
mitem = next_mitem(mitem);
|
||||
free_mitem(&rng, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
scoutfs_block_put(sb, bl);
|
||||
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
|
||||
free_mpos(sb, mpos);
|
||||
}
|
||||
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
|
||||
free_mitem(&rng, mitem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
			struct scoutfs_key *next_ret,
			struct scoutfs_btree_root *root,
			struct list_head *input_list,
			bool subtree, int dirty_limit, int alloc_low);
			bool subtree, int dirty_limit, int alloc_low, int merge_window);

int scoutfs_btree_free_blocks(struct super_block *sb,
			      struct scoutfs_alloc *alloc,

@@ -30,6 +30,8 @@
	EXPAND_COUNTER(block_cache_free) \
	EXPAND_COUNTER(block_cache_free_work) \
	EXPAND_COUNTER(block_cache_remove_stale) \
	EXPAND_COUNTER(block_cache_count_objects) \
	EXPAND_COUNTER(block_cache_scan_objects) \
	EXPAND_COUNTER(block_cache_shrink) \
	EXPAND_COUNTER(block_cache_shrink_next) \
	EXPAND_COUNTER(block_cache_shrink_recent) \
@@ -88,6 +90,8 @@
	EXPAND_COUNTER(forest_read_items) \
	EXPAND_COUNTER(forest_roots_next_hint) \
	EXPAND_COUNTER(forest_set_bloom_bits) \
	EXPAND_COUNTER(item_cache_count_objects) \
	EXPAND_COUNTER(item_cache_scan_objects) \
	EXPAND_COUNTER(item_clear_dirty) \
	EXPAND_COUNTER(item_create) \
	EXPAND_COUNTER(item_delete) \
@@ -121,6 +125,7 @@
	EXPAND_COUNTER(item_update) \
	EXPAND_COUNTER(item_write_dirty) \
	EXPAND_COUNTER(lock_alloc) \
	EXPAND_COUNTER(lock_count_objects) \
	EXPAND_COUNTER(lock_free) \
	EXPAND_COUNTER(lock_grant_request) \
	EXPAND_COUNTER(lock_grant_response) \
@@ -134,11 +139,13 @@
	EXPAND_COUNTER(lock_lock_error) \
	EXPAND_COUNTER(lock_nonblock_eagain) \
	EXPAND_COUNTER(lock_recover_request) \
	EXPAND_COUNTER(lock_scan_objects) \
	EXPAND_COUNTER(lock_shrink_attempted) \
	EXPAND_COUNTER(lock_shrink_aborted) \
	EXPAND_COUNTER(lock_shrink_work) \
	EXPAND_COUNTER(lock_unlock) \
	EXPAND_COUNTER(lock_wait) \
	EXPAND_COUNTER(log_merge_wait_timeout) \
	EXPAND_COUNTER(net_dropped_response) \
	EXPAND_COUNTER(net_send_bytes) \
	EXPAND_COUNTER(net_send_error) \
@@ -232,12 +239,12 @@ struct scoutfs_counters {
#define SCOUTFS_PCPU_COUNTER_BATCH (1 << 30)

#define scoutfs_inc_counter(sb, which) \
	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, 1, \
			     SCOUTFS_PCPU_COUNTER_BATCH)
	percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, 1, \
				 SCOUTFS_PCPU_COUNTER_BATCH)

#define scoutfs_add_counter(sb, which, cnt) \
	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt, \
			     SCOUTFS_PCPU_COUNTER_BATCH)
	percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, cnt, \
				 SCOUTFS_PCPU_COUNTER_BATCH)

void __init scoutfs_init_counters(void);
int scoutfs_setup_counters(struct super_block *sb);

@@ -307,7 +307,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
	LIST_HEAD(ind_locks);
	s64 ret = 0;

	WARN_ON_ONCE(inode && !mutex_is_locked(&inode->i_mutex));
	WARN_ON_ONCE(inode && !inode_is_locked(inode));

	/* clamp last to the last possible block? */
	if (last > SCOUTFS_BLOCK_SM_MAX)
@@ -558,7 +558,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
	u64 offset;
	int ret;

	WARN_ON_ONCE(create && !mutex_is_locked(&inode->i_mutex));
	WARN_ON_ONCE(create && !inode_is_locked(inode));

	/* make sure caller holds a cluster lock */
	lock = scoutfs_per_task_get(&si->pt_data_lock);
@@ -704,7 +704,7 @@ static int scoutfs_readpage(struct file *file, struct page *page)

	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
		ret = scoutfs_data_wait_check(inode, page_offset(page),
					      PAGE_CACHE_SIZE, SEF_OFFLINE,
					      PAGE_SIZE, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_READ, &dw,
					      inode_lock);
		if (ret != 0) {
@@ -729,6 +729,7 @@ static int scoutfs_readpage(struct file *file, struct page *page)
	return ret;
}

#ifndef KC_FILE_AOPS_READAHEAD
/*
 * This is used for opportunistic read-ahead which can throw the pages
 * away if it needs to. If the caller didn't deal with offline extents
@@ -754,14 +755,14 @@ static int scoutfs_readpages(struct file *file, struct address_space *mapping,

	list_for_each_entry_safe(page, tmp, pages, lru) {
		ret = scoutfs_data_wait_check(inode, page_offset(page),
					      PAGE_CACHE_SIZE, SEF_OFFLINE,
					      PAGE_SIZE, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_READ, NULL,
					      inode_lock);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			list_del(&page->lru);
			page_cache_release(page);
			put_page(page);
			if (--nr_pages == 0) {
				ret = 0;
				goto out;
@@ -775,6 +776,29 @@ out:
	BUG_ON(!list_empty(pages));
	return ret;
}
#else
static void scoutfs_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->file->f_inode;
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *inode_lock = NULL;
	int ret;

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
	if (ret)
		return;

	ret = scoutfs_data_wait_check(inode, readahead_pos(rac),
				      readahead_length(rac), SEF_OFFLINE,
				      SCOUTFS_IOC_DWO_READ, NULL,
				      inode_lock);
	if (ret == 0)
		mpage_readahead(rac, scoutfs_get_block_read);

	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
#endif
|
||||
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
@@ -1057,7 +1081,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
@@ -1118,7 +1142,7 @@ out_extent:
|
||||
up_write(&si->extent_sem);
|
||||
out_mutex:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
|
||||
out:
|
||||
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
|
||||
@@ -1221,7 +1245,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
struct data_ext_args from_args;
|
||||
struct data_ext_args to_args;
|
||||
struct scoutfs_extent ext;
|
||||
struct timespec cur_time;
|
||||
struct kc_timespec cur_time;
|
||||
LIST_HEAD(locks);
|
||||
bool done = false;
|
||||
loff_t from_size;
|
||||
@@ -1442,7 +1466,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
up_write(&from_si->extent_sem);
|
||||
up_write(&to_si->extent_sem);
|
||||
|
||||
cur_time = CURRENT_TIME;
|
||||
cur_time = current_time(from);
|
||||
if (!is_stage) {
|
||||
to->i_ctime = to->i_mtime = cur_time;
|
||||
inode_inc_iversion(to);
|
||||
@@ -1529,7 +1553,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
down_read(&si->extent_sem);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
|
||||
@@ -1583,7 +1607,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||
unlock:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
up_read(&si->extent_sem);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
|
||||
out:
|
||||
if (ret == 1)
|
||||
@@ -1783,6 +1807,37 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter,
|
||||
u8 sef, u8 op, struct scoutfs_data_wait *dw,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
size_t count = iov_iter_count(iter);
|
||||
size_t off = iter->iov_offset;
|
||||
const struct iovec *iov;
|
||||
size_t len;
|
||||
int ret = 0;
|
||||
|
||||
for (iov = iter->iov; count > 0; iov++) {
|
||||
len = iov->iov_len - off;
|
||||
if (len == 0)
|
||||
continue;
|
||||
|
||||
/* aren't we waiting on too much data here ? */
|
||||
ret = scoutfs_data_wait_check(inode, pos, len,
|
||||
sef, op, dw, lock);
|
||||
|
||||
if (ret != 0)
|
||||
break;
|
||||
|
||||
|
||||
pos += len;
|
||||
count -= len;
|
||||
off = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
|
||||
{
|
||||
DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);
|
||||
@@ -1873,7 +1928,11 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
|
||||
|
||||
const struct address_space_operations scoutfs_file_aops = {
|
||||
.readpage = scoutfs_readpage,
|
||||
#ifndef KC_FILE_AOPS_READAHEAD
|
||||
.readpages = scoutfs_readpages,
|
||||
#else
|
||||
.readahead = scoutfs_readahead,
|
||||
#endif
|
||||
.writepage = scoutfs_writepage,
|
||||
.writepages = scoutfs_writepages,
|
||||
.write_begin = scoutfs_write_begin,
|
||||
@@ -1881,10 +1940,15 @@ const struct address_space_operations scoutfs_file_aops = {
|
||||
};
|
||||
|
||||
const struct file_operations scoutfs_file_fops = {
|
||||
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
|
||||
.read = do_sync_read,
|
||||
.write = do_sync_write,
|
||||
.aio_read = scoutfs_file_aio_read,
|
||||
.aio_write = scoutfs_file_aio_write,
|
||||
#else
|
||||
.read_iter = scoutfs_file_read_iter,
|
||||
.write_iter = scoutfs_file_write_iter,
|
||||
#endif
|
||||
.unlocked_ioctl = scoutfs_ioctl,
|
||||
.fsync = scoutfs_file_fsync,
|
||||
.llseek = scoutfs_file_llseek,
|
||||
|
||||
@@ -65,6 +65,9 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos, u8 sef,
|
||||
u8 op, struct scoutfs_data_wait *ow,
|
||||
struct scoutfs_lock *lock);
|
||||
int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter,
|
||||
u8 sef, u8 op, struct scoutfs_data_wait *ow,
|
||||
struct scoutfs_lock *lock);
|
||||
bool scoutfs_data_wait_found(struct scoutfs_data_wait *ow);
|
||||
int scoutfs_data_wait(struct inode *inode,
|
||||
struct scoutfs_data_wait *ow);
|
||||
|
||||
109 kmod/src/dir.c
@@ -272,7 +272,7 @@ static void set_dentry_fsdata(struct dentry *dentry, struct scoutfs_lock *lock)
|
||||
|
||||
static bool test_dentry_fsdata(struct dentry *dentry, u64 refresh)
|
||||
{
|
||||
u64 fsd = (unsigned long)ACCESS_ONCE(dentry->d_fsdata);
|
||||
u64 fsd = (unsigned long)READ_ONCE(dentry->d_fsdata);
|
||||
|
||||
return fsd == refresh;
|
||||
}
|
||||
@@ -735,7 +735,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
dir->i_mtime = dir->i_ctime = current_time(inode);
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime;
|
||||
si->crtime = inode->i_mtime;
|
||||
inode_inc_iversion(dir);
|
||||
@@ -859,7 +859,7 @@ retry:
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, dir_size);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
dir->i_mtime = dir->i_ctime = current_time(inode);
|
||||
inode->i_ctime = dir->i_mtime;
|
||||
inc_nlink(inode);
|
||||
inode_inc_iversion(dir);
|
||||
@@ -900,7 +900,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
struct super_block *sb = dir->i_sb;
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct timespec ts = current_kernel_time();
|
||||
struct kc_timespec ts = current_time(inode);
|
||||
struct scoutfs_lock *inode_lock = NULL;
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_lock *dir_lock = NULL;
|
||||
@@ -1059,14 +1059,14 @@ static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino
|
||||
}
|
||||
|
||||
/*
|
||||
* Full a buffer with the null terminated symlink, point nd at it, and
|
||||
* return it so put_link can free it once the vfs is done.
|
||||
* Fill a buffer with the null terminated symlink, and return it
|
||||
* so callers can free it once the vfs is done.
|
||||
*
|
||||
* We chose to pay the runtime cost of per-call allocation and copy
|
||||
* overhead instead of wiring up symlinks to the page cache, storing
|
||||
* each small link in a full page, and later having to reclaim them.
|
||||
*/
|
||||
static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
static void *scoutfs_get_link_target(struct dentry *dentry)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
@@ -1125,32 +1125,41 @@ out:
|
||||
if (ret < 0) {
|
||||
kfree(path);
|
||||
path = ERR_PTR(ret);
|
||||
} else {
|
||||
nd_set_link(nd, path);
|
||||
}
|
||||
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
|
||||
return path;
|
||||
}
|
||||
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
char *path;
|
||||
|
||||
path = scoutfs_get_link_target(dentry);
|
||||
if (!IS_ERR_OR_NULL(path))
|
||||
nd_set_link(nd, path);
|
||||
return path;
|
||||
}
|
||||
|
||||
static void scoutfs_put_link(struct dentry *dentry, struct nameidata *nd,
|
||||
void *cookie)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(cookie))
|
||||
kfree(cookie);
|
||||
}
|
||||
#else
|
||||
static const char *scoutfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
|
||||
{
|
||||
char *path;
|
||||
|
||||
const struct inode_operations scoutfs_symlink_iops = {
|
||||
.readlink = generic_readlink,
|
||||
.follow_link = scoutfs_follow_link,
|
||||
.put_link = scoutfs_put_link,
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = generic_removexattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
};
|
||||
path = scoutfs_get_link_target(dentry);
|
||||
if (!IS_ERR_OR_NULL(path))
|
||||
set_delayed_call(done, kfree_link, path);
|
||||
|
||||
return path;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Symlink target paths can be annoyingly large. We store relatively
|
||||
@@ -1204,7 +1213,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
set_dentry_fsdata(dentry, dir_lock);
|
||||
|
||||
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
|
||||
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
||||
dir->i_mtime = dir->i_ctime = current_time(inode);
|
||||
inode_inc_iversion(dir);
|
||||
|
||||
inode->i_ctime = dir->i_mtime;
|
||||
@@ -1558,7 +1567,7 @@ static int scoutfs_rename_common(struct inode *old_dir,
|
||||
struct scoutfs_lock *orph_lock = NULL;
|
||||
struct scoutfs_dirent new_dent;
|
||||
struct scoutfs_dirent old_dent;
|
||||
struct timespec now;
|
||||
struct kc_timespec now;
|
||||
bool ins_new = false;
|
||||
bool del_new = false;
|
||||
bool ins_old = false;
|
||||
@@ -1724,7 +1733,7 @@ retry:
|
||||
inc_nlink(new_dir);
|
||||
}
|
||||
|
||||
now = CURRENT_TIME;
|
||||
now = current_time(old_inode);
|
||||
old_dir->i_ctime = now;
|
||||
old_dir->i_mtime = now;
|
||||
if (new_dir != old_dir) {
|
||||
@@ -1811,12 +1820,14 @@ out_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
static int scoutfs_rename(struct inode *old_dir,
|
||||
struct dentry *old_dentry, struct inode *new_dir,
|
||||
struct dentry *new_dentry)
|
||||
{
|
||||
return scoutfs_rename_common(old_dir, old_dentry, new_dir, new_dentry, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int scoutfs_rename2(struct inode *old_dir,
|
||||
struct dentry *old_dentry, struct inode *new_dir,
|
||||
@@ -1861,7 +1872,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
|
||||
if (ret < 0)
|
||||
goto out; /* XXX returning error but items created */
|
||||
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
|
||||
si->crtime = inode->i_mtime;
|
||||
insert_inode_hash(inode);
|
||||
ihold(inode); /* need to update inode modifications in d_tmpfile */
|
||||
@@ -1886,6 +1897,37 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct inode_operations scoutfs_symlink_iops = {
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.readlink = generic_readlink,
|
||||
.follow_link = scoutfs_follow_link,
|
||||
.put_link = scoutfs_put_link,
|
||||
#else
|
||||
.get_link = scoutfs_get_link,
|
||||
#endif
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
#endif
|
||||
.listxattr = scoutfs_listxattr,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.removexattr = generic_removexattr,
|
||||
#endif
|
||||
.get_acl = scoutfs_get_acl,
|
||||
#ifndef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.tmpfile = scoutfs_tmpfile,
|
||||
.rename = scoutfs_rename_common,
|
||||
.symlink = scoutfs_symlink,
|
||||
.unlink = scoutfs_unlink,
|
||||
.link = scoutfs_link,
|
||||
.mkdir = scoutfs_mkdir,
|
||||
.create = scoutfs_create,
|
||||
.lookup = scoutfs_lookup,
|
||||
#endif
|
||||
};
|
||||
|
||||
const struct file_operations scoutfs_dir_fops = {
|
||||
.KC_FOP_READDIR = scoutfs_readdir,
|
||||
#ifdef KC_FMODE_KABI_ITERATE
|
||||
@@ -1897,9 +1939,12 @@ const struct file_operations scoutfs_dir_fops = {
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
const struct inode_operations_wrapper scoutfs_dir_iops = {
|
||||
.ops = {
|
||||
#else
|
||||
const struct inode_operations scoutfs_dir_iops = {
|
||||
#endif
|
||||
.lookup = scoutfs_lookup,
|
||||
.mknod = scoutfs_mknod,
|
||||
.create = scoutfs_create,
|
||||
@@ -1907,17 +1952,25 @@ const struct inode_operations_wrapper scoutfs_dir_iops = {
|
||||
.link = scoutfs_link,
|
||||
.unlink = scoutfs_unlink,
|
||||
.rmdir = scoutfs_unlink,
|
||||
.rename = scoutfs_rename,
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.rename = scoutfs_rename,
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = generic_removexattr,
|
||||
#endif
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
.symlink = scoutfs_symlink,
|
||||
.permission = scoutfs_permission,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
},
|
||||
#endif
|
||||
.tmpfile = scoutfs_tmpfile,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.rename2 = scoutfs_rename2,
|
||||
#else
|
||||
.rename = scoutfs_rename2,
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -5,7 +5,11 @@
|
||||
#include "lock.h"
|
||||
|
||||
extern const struct file_operations scoutfs_dir_fops;
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
extern const struct inode_operations_wrapper scoutfs_dir_iops;
|
||||
#else
|
||||
extern const struct inode_operations scoutfs_dir_iops;
|
||||
#endif
|
||||
extern const struct inode_operations scoutfs_symlink_iops;
|
||||
|
||||
extern const struct dentry_operations scoutfs_dentry_ops;
|
||||
|
||||
138 kmod/src/file.c
@@ -29,6 +29,7 @@
|
||||
#include "per_task.h"
|
||||
#include "omap.h"
|
||||
|
||||
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
|
||||
/*
|
||||
* Start a high level file read. We check for offline extents in the
|
||||
* read region here so that we only check the extents once. We use the
|
||||
@@ -42,27 +43,27 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
struct inode *inode = file_inode(file);
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *inode_lock = NULL;
|
||||
struct scoutfs_lock *scoutfs_inode_lock = NULL;
|
||||
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
|
||||
DECLARE_DATA_WAIT(dw);
|
||||
int ret;
|
||||
|
||||
retry:
|
||||
/* protect checked extents from release */
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
atomic_inc(&inode->i_dio_count);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
|
||||
ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
|
||||
SEF_OFFLINE,
|
||||
SCOUTFS_IOC_DWO_READ,
|
||||
&dw, inode_lock);
|
||||
&dw, scoutfs_inode_lock);
|
||||
if (ret != 0)
|
||||
goto out;
|
||||
} else {
|
||||
@@ -74,7 +75,7 @@ retry:
|
||||
out:
|
||||
inode_dio_done(inode);
|
||||
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
|
||||
scoutfs_unlock(sb, scoutfs_inode_lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (scoutfs_data_wait_found(&dw)) {
|
||||
ret = scoutfs_data_wait(inode, &dw);
|
||||
@@ -92,7 +93,7 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
struct inode *inode = file_inode(file);
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *inode_lock = NULL;
|
||||
struct scoutfs_lock *scoutfs_inode_lock = NULL;
|
||||
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
|
||||
DECLARE_DATA_WAIT(dw);
|
||||
int ret;
|
||||
@@ -101,22 +102,22 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
return 0;
|
||||
|
||||
retry:
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_complete_truncate(inode, inode_lock);
|
||||
ret = scoutfs_complete_truncate(inode, scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
|
||||
/* data_version is per inode, whole file must be online */
|
||||
ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode),
|
||||
SEF_OFFLINE,
|
||||
SCOUTFS_IOC_DWO_WRITE,
|
||||
&dw, inode_lock);
|
||||
&dw, scoutfs_inode_lock);
|
||||
if (ret != 0)
|
||||
goto out;
|
||||
}
|
||||
@@ -127,8 +128,8 @@ retry:
|
||||
|
||||
out:
|
||||
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
|
||||
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
scoutfs_unlock(sb, scoutfs_inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
inode_unlock(inode);
|
||||
|
||||
if (scoutfs_data_wait_found(&dw)) {
|
||||
ret = scoutfs_data_wait(inode, &dw);
|
||||
@@ -146,6 +147,113 @@ out:
|
||||
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
ssize_t scoutfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *scoutfs_inode_lock = NULL;
|
||||
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
|
||||
DECLARE_DATA_WAIT(dw);
|
||||
int ret;
|
||||
|
||||
retry:
|
||||
/* protect checked extents from release */
|
||||
inode_lock(inode);
|
||||
atomic_inc(&inode->i_dio_count);
|
||||
inode_unlock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
|
||||
ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, to,
|
||||
SEF_OFFLINE,
|
||||
SCOUTFS_IOC_DWO_READ,
|
||||
&dw, scoutfs_inode_lock);
|
||||
if (ret != 0)
|
||||
goto out;
|
||||
} else {
|
||||
WARN_ON_ONCE(true);
|
||||
}
|
||||
|
||||
ret = generic_file_read_iter(iocb, to);
|
||||
|
||||
out:
|
||||
inode_dio_end(inode);
|
||||
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
|
||||
scoutfs_unlock(sb, scoutfs_inode_lock, SCOUTFS_LOCK_READ);
|
||||
|
||||
if (scoutfs_data_wait_found(&dw)) {
|
||||
ret = scoutfs_data_wait(inode, &dw);
|
||||
if (ret == 0)
|
||||
goto retry;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
ssize_t scoutfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *scoutfs_inode_lock = NULL;
|
||||
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
|
||||
DECLARE_DATA_WAIT(dw);
|
||||
int ret;
|
||||
int written;
|
||||
|
||||
retry:
|
||||
inode_lock(inode);
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = generic_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_complete_truncate(inode, scoutfs_inode_lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
|
||||
/* data_version is per inode, whole file must be online */
|
||||
ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, from,
|
||||
SEF_OFFLINE,
|
||||
SCOUTFS_IOC_DWO_WRITE,
|
||||
&dw, scoutfs_inode_lock);
|
||||
if (ret != 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* XXX: remove SUID bit */
|
||||
|
||||
written = __generic_file_write_iter(iocb, from);
|
||||
|
||||
out:
|
||||
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
|
||||
scoutfs_unlock(sb, scoutfs_inode_lock, SCOUTFS_LOCK_WRITE);
|
||||
inode_unlock(inode);
|
||||
|
||||
if (scoutfs_data_wait_found(&dw)) {
|
||||
ret = scoutfs_data_wait(inode, &dw);
|
||||
if (ret == 0)
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (ret > 0 || ret == -EIOCBQUEUED)
|
||||
ret = generic_write_sync(iocb, written);
|
||||
|
||||
return written ? written : ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
int scoutfs_permission(struct inode *inode, int mask)
|
||||
{
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
#ifndef _SCOUTFS_FILE_H_
|
||||
#define _SCOUTFS_FILE_H_
|
||||
|
||||
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
|
||||
ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos);
|
||||
ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos);
|
||||
#else
|
||||
ssize_t scoutfs_file_read_iter(struct kiocb *, struct iov_iter *);
|
||||
ssize_t scoutfs_file_write_iter(struct kiocb *, struct iov_iter *);
|
||||
#endif
|
||||
int scoutfs_permission(struct inode *inode, int mask);
|
||||
loff_t scoutfs_file_llseek(struct file *file, loff_t offset, int whence);
|
||||
|
||||
|
||||
@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
|
||||
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
|
||||
&next, &comp.root, &inputs,
|
||||
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
|
||||
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
|
||||
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
|
||||
(2 * 1024 * 1024));
|
||||
if (ret == -ERANGE) {
|
||||
comp.remain = next;
|
||||
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
|
||||
|
||||
@@ -143,10 +143,12 @@ void scoutfs_destroy_inode(struct inode *inode)
|
||||
static const struct inode_operations scoutfs_file_iops = {
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = generic_removexattr,
|
||||
#endif
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
.fiemap = scoutfs_data_fiemap,
|
||||
};
|
||||
@@ -154,10 +156,12 @@ static const struct inode_operations scoutfs_file_iops = {
|
||||
static const struct inode_operations scoutfs_special_iops = {
|
||||
.getattr = scoutfs_getattr,
|
||||
.setattr = scoutfs_setattr,
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
.setxattr = generic_setxattr,
|
||||
.getxattr = generic_getxattr,
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.removexattr = generic_removexattr,
|
||||
#endif
|
||||
.listxattr = scoutfs_listxattr,
|
||||
.get_acl = scoutfs_get_acl,
|
||||
};
|
||||
|
||||
@@ -174,8 +178,12 @@ static void set_inode_ops(struct inode *inode)
|
||||
inode->i_fop = &scoutfs_file_fops;
|
||||
break;
|
||||
case S_IFDIR:
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
inode->i_op = &scoutfs_dir_iops.ops;
|
||||
inode->i_flags |= S_IOPS_WRAPPER;
|
||||
#else
|
||||
inode->i_op = &scoutfs_dir_iops;
|
||||
#endif
|
||||
inode->i_fop = &scoutfs_dir_fops;
|
||||
break;
|
||||
case S_IFLNK:
|
||||
@@ -247,7 +255,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
|
||||
i_size_write(inode, le64_to_cpu(cinode->size));
|
||||
inode->i_version = le64_to_cpu(cinode->version);
|
||||
inode_set_iversion_queried(inode, le64_to_cpu(cinode->version));
|
||||
set_nlink(inode, le32_to_cpu(cinode->nlink));
|
||||
i_uid_write(inode, le32_to_cpu(cinode->uid));
|
||||
i_gid_write(inode, le32_to_cpu(cinode->gid));
|
||||
@@ -340,10 +348,17 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
#else
|
||||
int scoutfs_getattr(const struct path *path, struct kstat *stat,
|
||||
u32 request_mask, unsigned int query_flags)
|
||||
{
|
||||
struct inode *inode = d_inode(path->dentry);
|
||||
#endif
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
int ret;
|
||||
@@ -384,7 +399,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
|
||||
scoutfs_inode_inc_data_version(inode);
|
||||
|
||||
truncate_setsize(inode, new_size);
|
||||
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
|
||||
inode->i_ctime = inode->i_mtime = current_time(inode);
|
||||
if (truncate)
|
||||
si->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
|
||||
scoutfs_inode_set_data_seq(inode);
|
||||
@@ -467,8 +482,7 @@ retry:
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = inode_change_ok(inode, attr);
|
||||
ret = setattr_prepare(dentry, attr);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -496,9 +510,9 @@ retry:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
/* XXX callee locks instead? */
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
ret = scoutfs_data_wait(inode, &dw);
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
if (ret == 0)
|
||||
goto retry;
|
||||
@@ -750,7 +764,7 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf)
|
||||
/* XXX ensure refresh, instead clear in drop_inode? */
|
||||
si = SCOUTFS_I(inode);
|
||||
atomic64_set(&si->last_refreshed, 0);
|
||||
inode->i_version = 0;
|
||||
inode_set_iversion_queried(inode, 0);
|
||||
}
|
||||
|
||||
ret = scoutfs_inode_refresh(inode, lock);
|
||||
@@ -798,7 +812,7 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
|
||||
scoutfs_inode_get_onoff(inode, &online_blocks, &offline_blocks);
|
||||
|
||||
cinode->size = cpu_to_le64(i_size_read(inode));
|
||||
cinode->version = cpu_to_le64(inode->i_version);
|
||||
cinode->version = cpu_to_le64(inode_peek_iversion(inode));
|
||||
cinode->nlink = cpu_to_le32(inode->i_nlink);
|
||||
cinode->uid = cpu_to_le32(i_uid_read(inode));
|
||||
cinode->gid = cpu_to_le32(i_gid_read(inode));
|
||||
@@ -1475,7 +1489,7 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
|
||||
inode->i_ino = ino; /* XXX overflow */
|
||||
inode_init_owner(inode, dir, mode);
|
||||
inode_set_bytes(inode, 0);
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
|
||||
inode->i_rdev = rdev;
|
||||
set_inode_ops(inode);
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ struct scoutfs_inode_info {
|
||||
u64 online_blocks;
|
||||
u64 offline_blocks;
|
||||
u32 flags;
|
||||
struct timespec crtime;
|
||||
struct kc_timespec crtime;
|
||||
|
||||
/*
|
||||
* Protects per-inode extent items, most particularly readers
|
||||
@@ -123,8 +123,13 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off);
|
||||
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock);
|
||||
|
||||
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
|
||||
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
|
||||
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
#else
|
||||
int scoutfs_getattr(const struct path *path, struct kstat *stat,
|
||||
u32 request_mask, unsigned int query_flags);
|
||||
#endif
|
||||
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
|
||||
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/aio.h>
|
||||
#include <linux/list_sort.h>
|
||||
#include <linux/backing-dev.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "key.h"
|
||||
@@ -302,7 +303,7 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
@@ -351,7 +352,7 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
|
||||
|
||||
out:
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
mnt_drop_write_file(file);
|
||||
|
||||
trace_scoutfs_ioc_release_ret(sb, scoutfs_ino(inode), ret);
|
||||
@@ -393,7 +394,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
@@ -411,7 +412,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
unlock:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
iput(inode);
|
||||
out:
|
||||
return ret;
|
||||
@@ -448,7 +449,6 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
|
||||
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
|
||||
struct scoutfs_ioctl_stage args;
|
||||
@@ -480,8 +480,10 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
|
||||
/* the iocb is really only used for the file pointer :P */
|
||||
init_sync_kiocb(&kiocb, file);
|
||||
kiocb.ki_pos = args.offset;
|
||||
#ifdef KC_LINUX_AIO_KI_LEFT
|
||||
kiocb.ki_left = args.length;
|
||||
kiocb.ki_nbytes = args.length;
|
||||
#endif
|
||||
iov.iov_base = (void __user *)(unsigned long)args.buf_ptr;
|
||||
iov.iov_len = args.length;
|
||||
|
||||
@@ -489,7 +491,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
@@ -516,7 +518,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
|
||||
}
|
||||
|
||||
si->staging = true;
|
||||
current->backing_dev_info = mapping->backing_dev_info;
|
||||
current->backing_dev_info = inode_to_bdi(inode);
|
||||
|
||||
pos = args.offset;
|
||||
written = 0;
|
||||
@@ -533,7 +535,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
|
||||
out:
|
||||
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
mnt_drop_write_file(file);
|
||||
|
||||
trace_scoutfs_ioc_stage_ret(sb, scoutfs_ino(inode), ret);
|
||||
@@ -652,7 +654,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
|
||||
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
|
||||
@@ -696,7 +698,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
|
||||
unlock:
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
mnt_drop_write_file(file);
|
||||
out:
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "trans.h"
|
||||
#include "counters.h"
|
||||
#include "scoutfs_trace.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
|
||||
* The item cache maintains a consistent view of items that are read
|
||||
@@ -76,8 +77,10 @@ struct item_cache_info {
|
||||
/* almost always read, barely written */
|
||||
struct super_block *sb;
|
||||
struct item_percpu_pages __percpu *pcpu_pages;
|
||||
struct shrinker shrinker;
|
||||
KC_DEFINE_SHRINKER(shrinker);
|
||||
#ifdef KC_CPU_NOTIFIER
|
||||
struct notifier_block notifier;
|
||||
#endif
|
||||
|
||||
/* often walked, but per-cpu refs are fast path */
|
||||
rwlock_t rwlock;
|
||||
@@ -2277,7 +2280,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
list_add(&page->list, &pages);
|
||||
list_add(&page->lru, &pages);
|
||||
|
||||
first = NULL;
|
||||
prev = &first;
|
||||
@@ -2290,7 +2293,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
list_add(&second->list, &pages);
|
||||
list_add(&second->lru, &pages);
|
||||
}
|
||||
|
||||
/* read lock next sorted page, we're only dirty_list user */
|
||||
@@ -2347,8 +2350,8 @@ int scoutfs_item_write_dirty(struct super_block *sb)
|
||||
/* write all the dirty items into log btree blocks */
|
||||
ret = scoutfs_forest_insert_list(sb, first);
|
||||
out:
|
||||
list_for_each_entry_safe(page, second, &pages, list) {
|
||||
list_del_init(&page->list);
|
||||
list_for_each_entry_safe(page, second, &pages, lru) {
|
||||
list_del_init(&page->lru);
|
||||
__free_page(page);
|
||||
}
|
||||
|
||||
@@ -2530,27 +2533,35 @@ retry:
|
||||
put_pg(sb, right);
|
||||
}
|
||||
|
||||
static unsigned long item_cache_count_objects(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct item_cache_info *cinf = KC_SHRINKER_CONTAINER_OF(shrink, struct item_cache_info);
|
||||
struct super_block *sb = cinf->sb;
|
||||
|
||||
scoutfs_inc_counter(sb, item_cache_count_objects);
|
||||
|
||||
return shrinker_min_long(cinf->lru_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Shrink the size the item cache. We're operating against the fast
|
||||
* path lock ordering and we skip pages if we can't acquire locks. We
|
||||
* can run into dirty pages or pages with items that weren't visible to
|
||||
* the earliest active reader which must be skipped.
|
||||
*/
|
||||
static int item_lru_shrink(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
static unsigned long item_cache_scan_objects(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct item_cache_info *cinf = container_of(shrink,
|
||||
struct item_cache_info,
|
||||
shrinker);
|
||||
struct item_cache_info *cinf = KC_SHRINKER_CONTAINER_OF(shrink, struct item_cache_info);
|
||||
struct super_block *sb = cinf->sb;
|
||||
struct cached_page *tmp;
|
||||
struct cached_page *pg;
|
||||
unsigned long freed = 0;
|
||||
u64 first_reader_seq;
|
||||
int nr;
|
||||
int nr = sc->nr_to_scan;
|
||||
|
||||
if (sc->nr_to_scan == 0)
|
||||
goto out;
|
||||
nr = sc->nr_to_scan;
|
||||
scoutfs_inc_counter(sb, item_cache_scan_objects);
|
||||
|
||||
/* can't invalidate pages with items that weren't visible to first reader */
|
||||
first_reader_seq = first_active_reader_seq(cinf);
|
||||
@@ -2582,6 +2593,7 @@ static int item_lru_shrink(struct shrinker *shrink,
|
||||
rbtree_erase(&pg->node, &cinf->pg_root);
|
||||
invalidate_pcpu_page(pg);
|
||||
write_unlock(&pg->rwlock);
|
||||
freed++;
|
||||
|
||||
put_pg(sb, pg);
|
||||
|
||||
@@ -2591,10 +2603,11 @@ static int item_lru_shrink(struct shrinker *shrink,
|
||||
|
||||
write_unlock(&cinf->rwlock);
|
||||
spin_unlock(&cinf->lru_lock);
|
||||
out:
|
||||
return min_t(unsigned long, cinf->lru_pages, INT_MAX);
|
||||
|
||||
return freed;
|
||||
}
|
||||
|
||||
#ifdef KC_CPU_NOTIFIER
|
||||
static int item_cpu_callback(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
{
|
||||
@@ -2609,6 +2622,7 @@ static int item_cpu_callback(struct notifier_block *nfb,
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
#endif
|
||||
|
||||
int scoutfs_item_setup(struct super_block *sb)
|
||||
{
|
||||
@@ -2638,11 +2652,13 @@ int scoutfs_item_setup(struct super_block *sb)
|
||||
for_each_possible_cpu(cpu)
|
||||
init_pcpu_pages(cinf, cpu);
|
||||
|
||||
cinf->shrinker.shrink = item_lru_shrink;
|
||||
cinf->shrinker.seeks = DEFAULT_SEEKS;
|
||||
register_shrinker(&cinf->shrinker);
|
||||
KC_INIT_SHRINKER_FUNCS(&cinf->shrinker, item_cache_count_objects,
|
||||
item_cache_scan_objects);
|
||||
KC_REGISTER_SHRINKER(&cinf->shrinker);
|
||||
#ifdef KC_CPU_NOTIFIER
|
||||
cinf->notifier.notifier_call = item_cpu_callback;
|
||||
register_hotcpu_notifier(&cinf->notifier);
|
||||
#endif
|
||||
|
||||
sbi->item_cache_info = cinf;
|
||||
return 0;
|
||||
@@ -2662,8 +2678,10 @@ void scoutfs_item_destroy(struct super_block *sb)
|
||||
if (cinf) {
|
||||
BUG_ON(!list_empty(&cinf->active_list));
|
||||
|
||||
#ifdef KC_CPU_NOTIFIER
|
||||
unregister_hotcpu_notifier(&cinf->notifier);
|
||||
unregister_shrinker(&cinf->shrinker);
|
||||
#endif
|
||||
KC_UNREGISTER_SHRINKER(&cinf->shrinker);
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
drop_pcpu_pages(sb, cinf, cpu);
|
||||
|
||||
84 kmod/src/kernelcompat.c Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
#include <linux/uio.h>
|
||||
|
||||
#include "kernelcompat.h"
|
||||
|
||||
#ifdef KC_SHRINKER_SHRINK
|
||||
#include <linux/shrinker.h>
|
||||
/*
|
||||
* If a target doesn't have that .{count,scan}_objects() interface then
|
||||
* we have a .shrink() helper that performs the shrink work in terms of
|
||||
* count/scan.
|
||||
*/
|
||||
int kc_shrink_wrapper_fn(struct shrinker *shrink, struct shrink_control *sc)
|
||||
{
|
||||
struct kc_shrinker_wrapper *wrapper = container_of(shrink, struct kc_shrinker_wrapper, shrink);
|
||||
unsigned long nr;
|
||||
unsigned long rc;
|
||||
|
||||
if (sc->nr_to_scan != 0) {
|
||||
rc = wrapper->scan_objects(shrink, sc);
|
||||
/* translate magic values to the equivalent for older kernels */
|
||||
if (rc == SHRINK_STOP)
|
||||
return -1;
|
||||
else if (rc == SHRINK_EMPTY)
|
||||
return 0;
|
||||
}
|
||||
|
||||
nr = wrapper->count_objects(shrink, sc);
|
||||
|
||||
return min_t(unsigned long, nr, INT_MAX);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef KC_CURRENT_TIME_INODE
|
||||
struct timespec64 kc_current_time(struct inode *inode)
|
||||
{
|
||||
struct timespec64 now;
|
||||
unsigned gran;
|
||||
|
||||
getnstimeofday64(&now);
|
||||
|
||||
if (unlikely(!inode->i_sb)) {
|
||||
WARN(1, "current_time() called with uninitialized super_block in the inode");
|
||||
return now;
|
||||
}
|
||||
|
||||
gran = inode->i_sb->s_time_gran;
|
||||
|
||||
/* Avoid division in the common cases 1 ns and 1 s. */
|
||||
if (gran == 1) {
|
||||
/* nothing */
|
||||
} else if (gran == NSEC_PER_SEC) {
|
||||
now.tv_nsec = 0;
|
||||
} else if (gran > 1 && gran < NSEC_PER_SEC) {
|
||||
now.tv_nsec -= now.tv_nsec % gran;
|
||||
} else {
|
||||
WARN(1, "illegal file time granularity: %u", gran);
|
||||
}
|
||||
|
||||
return now;
|
||||
}
|
||||
#endif
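The granularity handling above mirrors upstream current_time(). As a self-contained illustration (not part of the patch), truncating a nanosecond count to a hypothetical 1 ms granularity works the same way as the gran > 1 case:

/* illustration only: hypothetical 1 ms granularity, not from the patch */
static inline long example_round_nsec(long nsec)
{
	unsigned gran = 1000000;	/* s_time_gran of 1 ms */

	return nsec - (nsec % gran);	/* 123456789 -> 123000000 */
}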
#ifndef KC_GENERIC_FILE_BUFFERED_WRITE
|
||||
ssize_t
|
||||
kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos, loff_t *ppos,
|
||||
size_t count, ssize_t written)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
ssize_t status;
|
||||
struct iov_iter i;
|
||||
|
||||
iov_iter_init(&i, WRITE, iov, nr_segs, count);
|
||||
status = generic_perform_write(file, &i, pos);
|
||||
|
||||
if (likely(status >= 0)) {
|
||||
written += status;
|
||||
*ppos = pos + status;
|
||||
}
|
||||
|
||||
return written ? written : status;
|
||||
}
|
||||
#endif
|
||||
@@ -1,8 +1,35 @@
|
||||
#ifndef _SCOUTFS_KERNELCOMPAT_H_
|
||||
#define _SCOUTFS_KERNELCOMPAT_H_
|
||||
|
||||
#ifndef KC_ITERATE_DIR_CONTEXT
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
|
||||
/*
|
||||
* v4.15-rc3-4-gae5e165d855d
|
||||
*
|
||||
* new API for handling inode->i_version. This forces us to
|
||||
* include this API where we need. We include it here for
|
||||
* convenience instead of where it's needed.
|
||||
*/
|
||||
#ifdef KC_NEED_LINUX_IVERSION_H
|
||||
#include <linux/iversion.h>
|
||||
#else
|
||||
/*
|
||||
* Kernels before above version will need to fall back to
|
||||
* manipulating inode->i_version as previous with degraded
|
||||
* methods.
|
||||
*/
|
||||
#define inode_set_iversion_queried(inode, val) \
|
||||
do { \
|
||||
(inode)->i_version = val; \
|
||||
} while (0)
|
||||
#define inode_peek_iversion(inode) \
|
||||
({ \
|
||||
(inode)->i_version; \
|
||||
})
|
||||
#endif
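The fallback above lets the same call sites build on kernels that still expose i_version directly. For example, the inode.c hunks later in this diff load and store the on-disk version through these helpers:

inode_set_iversion_queried(inode, le64_to_cpu(cinode->version));
cinode->version = cpu_to_le64(inode_peek_iversion(inode));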
#ifndef KC_ITERATE_DIR_CONTEXT
|
||||
typedef filldir_t kc_readdir_ctx_t;
|
||||
#define KC_DECLARE_READDIR(name, file, dirent, ctx) name(file, dirent, ctx)
|
||||
#define KC_FOP_READDIR readdir
|
||||
@@ -52,4 +79,198 @@ static inline int dir_emit_dots(struct file *file, void *dirent,
|
||||
#define kc_posix_acl_valid(user_ns, acl) posix_acl_valid(acl)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* v3.6-rc1-24-gdbf2576e37da
|
||||
*
|
||||
* All workqueues are now non-reentrant, and the bit flag is removed
|
||||
* shortly after its uses were removed.
|
||||
*/
|
||||
#ifndef WQ_NON_REENTRANT
|
||||
#define WQ_NON_REENTRANT 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* v3.18-rc2-19-gb5ae6b15bd73
|
||||
*
|
||||
* Folds d_materialise_unique into d_splice_alias. Note reversal
|
||||
* of arguments (Also note Documentation/filesystems/porting.rst)
|
||||
*/
|
||||
#ifndef KC_D_MATERIALISE_UNIQUE
|
||||
#define d_materialise_unique(dentry, inode) d_splice_alias(inode, dentry)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* v4.8-rc1-29-g31051c85b5e2
|
||||
*
|
||||
* fall back to inode_change_ok() if setattr_prepare() isn't available
|
||||
*/
|
||||
#ifndef KC_SETATTR_PREPARE
|
||||
#define setattr_prepare(dentry, attr) inode_change_ok(d_inode(dentry), attr)
|
||||
#endif
|
||||
|
||||
#ifndef KC___POSIX_ACL_CREATE
|
||||
#define __posix_acl_create posix_acl_create
|
||||
#define __posix_acl_chmod posix_acl_chmod
|
||||
#endif
|
||||
|
||||
#ifndef KC_PERCPU_COUNTER_ADD_BATCH
|
||||
#define percpu_counter_add_batch __percpu_counter_add
|
||||
#endif
|
||||
|
||||
#ifndef KC_MEMALLOC_NOFS_SAVE
|
||||
#define memalloc_nofs_save memalloc_noio_save
|
||||
#define memalloc_nofs_restore memalloc_noio_restore
|
||||
#endif
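The nofs mapping above falls back to the older noio scope API; the calling convention is the same either way. A hypothetical example of the save/restore pairing (names not from the patch):

static void example_nofs_section(void)
{
	unsigned int nofs = memalloc_nofs_save();

	/* allocations in here will not recurse into filesystem reclaim */

	memalloc_nofs_restore(nofs);
}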
#ifdef KC_BIO_BI_OPF
|
||||
#define kc_bio_get_opf(bio) \
|
||||
({ \
|
||||
(bio)->bi_opf; \
|
||||
})
|
||||
#define kc_bio_set_opf(bio, opf) \
|
||||
do { \
|
||||
(bio)->bi_opf = opf; \
|
||||
} while (0)
|
||||
#define kc_bio_set_sector(bio, sect) \
|
||||
do { \
|
||||
(bio)->bi_iter.bi_sector = sect;\
|
||||
} while (0)
|
||||
#define kc_submit_bio(bio) submit_bio(bio)
|
||||
#else
|
||||
#define kc_bio_get_opf(bio) \
|
||||
({ \
|
||||
(bio)->bi_rw; \
|
||||
})
|
||||
#define kc_bio_set_opf(bio, opf) \
|
||||
do { \
|
||||
(bio)->bi_rw = opf; \
|
||||
} while (0)
|
||||
#define kc_bio_set_sector(bio, sect) \
|
||||
do { \
|
||||
(bio)->bi_sector = sect; \
|
||||
} while (0)
|
||||
#define kc_submit_bio(bio) \
|
||||
do { \
|
||||
submit_bio((bio)->bi_rw, bio); \
|
||||
} while (0)
|
||||
#define bio_set_dev(bio, bdev) \
|
||||
do { \
|
||||
(bio)->bi_bdev = (bdev); \
|
||||
} while (0)
|
||||
#endif
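Together these helpers let a single submission path compile against both the bi_opf and the older bi_rw block interfaces. A minimal sketch of such a call site, assuming the caller already built the bio and passes READ or WRITE (the function name is hypothetical, not from the patch):

static void example_submit(struct block_device *bdev, struct bio *bio,
			   sector_t sector, int rw)
{
	bio_set_dev(bio, bdev);
	kc_bio_set_sector(bio, sector);
	kc_bio_set_opf(bio, rw);
	kc_submit_bio(bio);
}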
#ifdef KC_BIO_BI_STATUS
|
||||
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio)
|
||||
#define kc_bio_get_errno(bio) ({ blk_status_to_errno((bio)->bi_status); })
|
||||
#else
|
||||
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio, int _error_arg)
|
||||
#define kc_bio_get_errno(bio) ({ (int)((void)(bio), _error_arg); })
|
||||
#endif
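A completion handler declared through this wrapper keeps one body for both eras, reading the error from bi_status on newer kernels and from the extra argument on older ones. A hypothetical example (not from the patch):

static void KC_DECLARE_BIO_END_IO(example_end_io, struct bio *bio)
{
	int err = kc_bio_get_errno(bio);

	if (err)
		pr_err("example bio error %d\n", err);
	bio_put(bio);
}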
/*
|
||||
* v4.13-rc1-6-ge462ec50cb5f
|
||||
*
|
||||
* MS_* (mount) flags from <linux/mount.h> should not be used in the kernel
|
||||
* anymore from 4.x onwards. Instead, we need to use the SB_* (superblock) flags
|
||||
*/
|
||||
#ifndef SB_POSIXACL
|
||||
#define SB_POSIXACL MS_POSIXACL
|
||||
#define SB_I_VERSION MS_I_VERSION
|
||||
#endif
|
||||
|
||||
#ifndef KC_CURRENT_TIME_INODE
|
||||
struct timespec64 kc_current_time(struct inode *inode);
|
||||
#define current_time kc_current_time
|
||||
#define kc_timespec timespec
|
||||
#else
|
||||
#define kc_timespec timespec64
|
||||
#endif
|
||||
|
||||
#ifndef KC_SHRINKER_SHRINK
|
||||
|
||||
#define KC_DEFINE_SHRINKER(name) struct shrinker name
|
||||
#define KC_INIT_SHRINKER_FUNCS(name, countfn, scanfn) do { \
|
||||
__typeof__(name) _shrink = (name); \
|
||||
_shrink->count_objects = (countfn); \
|
||||
_shrink->scan_objects = (scanfn); \
|
||||
_shrink->seeks = DEFAULT_SEEKS; \
|
||||
} while (0)
|
||||
|
||||
#define KC_SHRINKER_CONTAINER_OF(ptr, type) container_of(ptr, type, shrinker)
|
||||
#define KC_REGISTER_SHRINKER(ptr) (register_shrinker(ptr))
|
||||
#define KC_UNREGISTER_SHRINKER(ptr) (unregister_shrinker(ptr))
|
||||
#define KC_SHRINKER_FN(ptr) (ptr)
|
||||
#else
|
||||
|
||||
#include <linux/shrinker.h>
|
||||
#ifndef SHRINK_STOP
|
||||
#define SHRINK_STOP (~0UL)
|
||||
#define SHRINK_EMPTY (~0UL - 1)
|
||||
#endif
|
||||
|
||||
int kc_shrink_wrapper_fn(struct shrinker *shrink, struct shrink_control *sc);
|
||||
struct kc_shrinker_wrapper {
|
||||
unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc);
|
||||
unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc);
|
||||
struct shrinker shrink;
|
||||
};
|
||||
|
||||
#define KC_DEFINE_SHRINKER(name) struct kc_shrinker_wrapper name;
|
||||
#define KC_INIT_SHRINKER_FUNCS(name, countfn, scanfn) do { \
|
||||
struct kc_shrinker_wrapper *_wrap = (name); \
|
||||
_wrap->count_objects = (countfn); \
|
||||
_wrap->scan_objects = (scanfn); \
|
||||
_wrap->shrink.shrink = kc_shrink_wrapper_fn; \
|
||||
_wrap->shrink.seeks = DEFAULT_SEEKS; \
|
||||
} while (0)
|
||||
#define KC_SHRINKER_CONTAINER_OF(ptr, type) container_of(container_of(ptr, struct kc_shrinker_wrapper, shrink), type, shrinker)
|
||||
#define KC_REGISTER_SHRINKER(ptr) (register_shrinker(ptr.shrink))
|
||||
#define KC_UNREGISTER_SHRINKER(ptr) (unregister_shrinker(ptr.shrink))
|
||||
#define KC_SHRINKER_FN(ptr) (ptr.shrink)
|
||||
|
||||
#endif /* KC_SHRINKER_SHRINK */
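Both branches above give callers the same four steps, which is how the item cache and lock shrinkers elsewhere in this diff adopt them. A minimal sketch with a hypothetical cache (the example_* names are illustrative, not part of the patch); note the embedded field must be called shrinker so KC_SHRINKER_CONTAINER_OF() resolves:

struct example_cache {
	unsigned long nr_items;
	KC_DEFINE_SHRINKER(shrinker);
};

static unsigned long example_count_objects(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct example_cache *cache = KC_SHRINKER_CONTAINER_OF(shrink, struct example_cache);

	/* report how many objects could be reclaimed */
	return cache->nr_items;
}

static unsigned long example_scan_objects(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	/* free up to sc->nr_to_scan objects and return how many were freed */
	return 0;
}

static void example_cache_setup(struct example_cache *cache)
{
	KC_INIT_SHRINKER_FUNCS(&cache->shrinker, example_count_objects,
			       example_scan_objects);
	KC_REGISTER_SHRINKER(&cache->shrinker);
}

static void example_cache_destroy(struct example_cache *cache)
{
	KC_UNREGISTER_SHRINKER(&cache->shrinker);
}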
#ifdef KC_KERNEL_GETSOCKNAME_ADDRLEN
|
||||
#include <linux/net.h>
|
||||
#include <linux/inet.h>
|
||||
static inline int kc_kernel_getsockname(struct socket *sock, struct sockaddr *addr)
|
||||
{
|
||||
int addrlen = sizeof(struct sockaddr_in);
|
||||
int ret = kernel_getsockname(sock, addr, &addrlen);
|
||||
if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
|
||||
return -EAFNOSUPPORT;
|
||||
else if (ret < 0)
|
||||
return ret;
|
||||
|
||||
return sizeof(struct sockaddr_in);
|
||||
}
|
||||
static inline int kc_kernel_getpeername(struct socket *sock, struct sockaddr *addr)
|
||||
{
|
||||
int addrlen = sizeof(struct sockaddr_in);
|
||||
int ret = kernel_getpeername(sock, addr, &addrlen);
|
||||
if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
|
||||
return -EAFNOSUPPORT;
|
||||
else if (ret < 0)
|
||||
return ret;
|
||||
|
||||
return sizeof(struct sockaddr_in);
|
||||
}
|
||||
#else
|
||||
#define kc_kernel_getsockname(sock, addr) kernel_getsockname(sock, addr)
|
||||
#define kc_kernel_getpeername(sock, addr) kernel_getpeername(sock, addr)
|
||||
#endif
|
||||
|
||||
#ifdef KC_SOCK_CREATE_KERN_NET
|
||||
#define kc_sock_create_kern(family, type, proto, res) sock_create_kern(&init_net, family, type, proto, res)
|
||||
#else
|
||||
#define kc_sock_create_kern sock_create_kern
|
||||
#endif
|
||||
|
||||
#ifndef KC_GENERIC_FILE_BUFFERED_WRITE
|
||||
ssize_t kc_generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos, loff_t *ppos,
|
||||
size_t count, ssize_t written);
|
||||
#define generic_file_buffered_write kc_generic_file_buffered_write
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/preempt_mask.h> /* a rhel shed.h needed preempt_offset? */
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mm.h>
|
||||
@@ -36,6 +35,7 @@
|
||||
#include "xattr.h"
|
||||
#include "item.h"
|
||||
#include "omap.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
|
||||
* scoutfs uses a lock service to manage item cache consistency between
|
||||
@@ -77,7 +77,7 @@ struct lock_info {
|
||||
bool unmounting;
|
||||
struct rb_root lock_tree;
|
||||
struct rb_root lock_range_tree;
|
||||
struct shrinker shrinker;
|
||||
KC_DEFINE_SHRINKER(shrinker);
|
||||
struct list_head lru_list;
|
||||
unsigned long long lru_nr;
|
||||
struct workqueue_struct *workq;
|
||||
@@ -1346,7 +1346,7 @@ void scoutfs_lock_del_coverage(struct super_block *sb,
|
||||
bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
|
||||
enum scoutfs_lock_mode mode)
|
||||
{
|
||||
signed char lock_mode = ACCESS_ONCE(lock->mode);
|
||||
signed char lock_mode = READ_ONCE(lock->mode);
|
||||
|
||||
return lock_modes_match(lock_mode, mode) &&
|
||||
scoutfs_key_compare_ranges(key, key,
|
||||
@@ -1401,6 +1401,17 @@ static void lock_shrink_worker(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long lock_count_objects(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
|
||||
struct super_block *sb = linfo->sb;
|
||||
|
||||
scoutfs_inc_counter(sb, lock_count_objects);
|
||||
|
||||
return shrinker_min_long(linfo->lru_nr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start the shrinking process for locks on the lru. If a lock is on
|
||||
* the lru then it can't have any active users. We don't want to block
|
||||
@@ -1413,21 +1424,18 @@ static void lock_shrink_worker(struct work_struct *work)
|
||||
* mode which will prevent the lock from being freed when the null
|
||||
* response arrives.
|
||||
*/
|
||||
static int scoutfs_lock_shrink(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
static unsigned long lock_scan_objects(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct lock_info *linfo = container_of(shrink, struct lock_info,
|
||||
shrinker);
|
||||
struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
|
||||
struct super_block *sb = linfo->sb;
|
||||
struct scoutfs_lock *lock;
|
||||
struct scoutfs_lock *tmp;
|
||||
unsigned long nr;
|
||||
unsigned long freed = 0;
|
||||
unsigned long nr = sc->nr_to_scan;
|
||||
bool added = false;
|
||||
int ret;
|
||||
|
||||
nr = sc->nr_to_scan;
|
||||
if (nr == 0)
|
||||
goto out;
|
||||
scoutfs_inc_counter(sb, lock_scan_objects);
|
||||
|
||||
spin_lock(&linfo->lock);
|
||||
|
||||
@@ -1445,6 +1453,7 @@ restart:
|
||||
lock->request_pending = 1;
|
||||
list_add_tail(&lock->shrink_head, &linfo->shrink_list);
|
||||
added = true;
|
||||
freed++;
|
||||
|
||||
scoutfs_inc_counter(sb, lock_shrink_attempted);
|
||||
trace_scoutfs_lock_shrink(sb, lock);
|
||||
@@ -1459,10 +1468,8 @@ restart:
|
||||
if (added)
|
||||
queue_work(linfo->workq, &linfo->shrink_work);
|
||||
|
||||
out:
|
||||
ret = min_t(unsigned long, linfo->lru_nr, INT_MAX);
|
||||
trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, ret);
|
||||
return ret;
|
||||
trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, freed);
|
||||
return freed;
|
||||
}
|
||||
|
||||
void scoutfs_free_unused_locks(struct super_block *sb)
|
||||
@@ -1473,7 +1480,7 @@ void scoutfs_free_unused_locks(struct super_block *sb)
|
||||
.nr_to_scan = INT_MAX,
|
||||
};
|
||||
|
||||
linfo->shrinker.shrink(&linfo->shrinker, &sc);
|
||||
lock_scan_objects(KC_SHRINKER_FN(&linfo->shrinker), &sc);
|
||||
}
|
||||
|
||||
static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
|
||||
@@ -1580,7 +1587,7 @@ void scoutfs_lock_shutdown(struct super_block *sb)
|
||||
trace_scoutfs_lock_shutdown(sb, linfo);
|
||||
|
||||
/* stop the shrinker from queueing work */
|
||||
unregister_shrinker(&linfo->shrinker);
|
||||
KC_UNREGISTER_SHRINKER(&linfo->shrinker);
|
||||
flush_work(&linfo->shrink_work);
|
||||
|
||||
/* cause current and future lock calls to return errors */
|
||||
@@ -1699,9 +1706,9 @@ int scoutfs_lock_setup(struct super_block *sb)
|
||||
spin_lock_init(&linfo->lock);
|
||||
linfo->lock_tree = RB_ROOT;
|
||||
linfo->lock_range_tree = RB_ROOT;
|
||||
linfo->shrinker.shrink = scoutfs_lock_shrink;
|
||||
linfo->shrinker.seeks = DEFAULT_SEEKS;
|
||||
register_shrinker(&linfo->shrinker);
|
||||
KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects,
|
||||
lock_scan_objects);
|
||||
KC_REGISTER_SHRINKER(&linfo->shrinker);
|
||||
INIT_LIST_HEAD(&linfo->lru_list);
|
||||
INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
|
||||
INIT_LIST_HEAD(&linfo->inv_list);
|
||||
|
||||
@@ -549,12 +549,16 @@ static int recvmsg_full(struct socket *sock, void *buf, unsigned len)
|
||||
|
||||
while (len) {
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.msg_iov = (struct iovec *)&kv;
|
||||
msg.msg_iovlen = 1;
|
||||
msg.msg_flags = MSG_NOSIGNAL;
|
||||
kv.iov_base = buf;
|
||||
kv.iov_len = len;
|
||||
|
||||
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
msg.msg_iov = (struct iovec *)&kv;
|
||||
msg.msg_iovlen = 1;
|
||||
#else
|
||||
iov_iter_init(&msg.msg_iter, READ, (struct iovec *)&kv, len, 1);
|
||||
#endif
|
||||
ret = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
|
||||
if (ret <= 0)
|
||||
return -ECONNABORTED;
|
||||
@@ -707,12 +711,16 @@ static int sendmsg_full(struct socket *sock, void *buf, unsigned len)
|
||||
|
||||
while (len) {
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.msg_iov = (struct iovec *)&kv;
|
||||
msg.msg_iovlen = 1;
|
||||
msg.msg_flags = MSG_NOSIGNAL;
|
||||
kv.iov_base = buf;
|
||||
kv.iov_len = len;
|
||||
|
||||
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
msg.msg_iov = (struct iovec *)&kv;
|
||||
msg.msg_iovlen = 1;
|
||||
#else
|
||||
iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&kv, len, 1);
|
||||
#endif
|
||||
ret = kernel_sendmsg(sock, &msg, &kv, 1, len);
|
||||
if (ret <= 0)
|
||||
return -ECONNABORTED;
|
||||
@@ -897,7 +905,6 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
||||
struct socket *sock)
|
||||
{
|
||||
struct timeval tv;
|
||||
int addrlen;
|
||||
int optval;
|
||||
int ret;
|
||||
|
||||
@@ -947,23 +954,18 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
addrlen = sizeof(struct sockaddr_in);
|
||||
ret = kernel_getsockname(sock, (struct sockaddr *)&conn->sockname,
|
||||
&addrlen);
|
||||
if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
|
||||
ret = -EAFNOSUPPORT;
|
||||
if (ret)
|
||||
ret = kc_kernel_getsockname(sock, (struct sockaddr *)&conn->sockname);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
addrlen = sizeof(struct sockaddr_in);
|
||||
ret = kernel_getpeername(sock, (struct sockaddr *)&conn->peername,
|
||||
&addrlen);
|
||||
if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
|
||||
ret = -EAFNOSUPPORT;
|
||||
if (ret)
|
||||
ret = kc_kernel_getpeername(sock, (struct sockaddr *)&conn->peername);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
|
||||
conn->last_peername = conn->peername;
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
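kc_kernel_getsockname() and kc_kernel_getpeername() paper over the signature change where newer kernels drop the addrlen argument and return the address length instead. A minimal sketch of the sockname side, assuming a hypothetical KC_KERNEL_GETSOCKNAME_ADDRLEN config symbol; the real wrapper lives in the kernelcompat sources:

static inline int kc_kernel_getsockname(struct socket *sock, struct sockaddr *addr)
{
#ifdef KC_KERNEL_GETSOCKNAME_ADDRLEN		/* assumed symbol: old API */
	int addrlen = sizeof(struct sockaddr_in);
	int ret = kernel_getsockname(sock, addr, &addrlen);

	if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
		ret = -EAFNOSUPPORT;
	return ret;
#else						/* new API returns the length */
	int ret = kernel_getsockname(sock, addr);

	if (ret >= 0 && ret != sizeof(struct sockaddr_in))
		ret = -EAFNOSUPPORT;
	return ret < 0 ? ret : 0;
#endif
}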
|
||||
@@ -1052,7 +1054,7 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
|
||||
|
||||
trace_scoutfs_net_connect_work_enter(sb, 0, 0);
|
||||
|
||||
ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
|
||||
ret = kc_sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
|
||||
if (ret)
|
||||
goto out;
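kc_sock_create_kern() similarly hides the struct net namespace argument that newer sock_create_kern() requires. A sketch under that assumption (the config symbol is hypothetical; init_net comes from <net/net_namespace.h>):

static inline int kc_sock_create_kern(int family, int type, int protocol,
				      struct socket **res)
{
#ifdef KC_SOCK_CREATE_KERN_NET			/* assumed symbol */
	return sock_create_kern(&init_net, family, type, protocol, res);
#else
	return sock_create_kern(family, type, protocol, res);
#endif
}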
|
||||
|
||||
@@ -1453,7 +1455,7 @@ int scoutfs_net_bind(struct super_block *sb,
|
||||
if (WARN_ON_ONCE(conn->sock))
|
||||
return -EINVAL;
|
||||
|
||||
ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
|
||||
ret = kc_sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -1471,20 +1473,18 @@ int scoutfs_net_bind(struct super_block *sb,
|
||||
goto out;
|
||||
|
||||
ret = kernel_listen(sock, 255);
|
||||
if (ret)
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
addrlen = sizeof(struct sockaddr_in);
|
||||
ret = kernel_getsockname(sock, (struct sockaddr *)&conn->sockname,
|
||||
&addrlen);
|
||||
if (ret == 0 && addrlen != sizeof(struct sockaddr_in))
|
||||
ret = -EAFNOSUPPORT;
|
||||
if (ret)
|
||||
ret = kc_kernel_getsockname(sock, (struct sockaddr *)&conn->sockname);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
|
||||
conn->sock = sock;
|
||||
*sin = conn->sockname;
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
if (ret < 0 && sock)
|
||||
sock_release(sock);
|
||||
|
||||
@@ -33,6 +33,7 @@ enum {
|
||||
Opt_acl,
|
||||
Opt_data_prealloc_blocks,
|
||||
Opt_data_prealloc_contig_only,
|
||||
Opt_log_merge_wait_timeout_ms,
|
||||
Opt_metadev_path,
|
||||
Opt_noacl,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
|
||||
{Opt_acl, "acl"},
|
||||
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
|
||||
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
|
||||
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_noacl, "noacl"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
|
||||
kfree(opts->metadev_path);
|
||||
}
|
||||
|
||||
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
|
||||
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
|
||||
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
|
||||
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
|
||||
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
|
||||
opts->data_prealloc_contig_only = 1;
|
||||
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||
opts->quorum_slot_nr = -1;
|
||||
}
|
||||
|
||||
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
|
||||
{
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
|
||||
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
|
||||
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
|
||||
{
|
||||
if (ret < 0) {
|
||||
@@ -169,7 +191,7 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
switch (token) {
|
||||
|
||||
case Opt_acl:
|
||||
sb->s_flags |= MS_POSIXACL;
|
||||
sb->s_flags |= SB_POSIXACL;
|
||||
break;
|
||||
|
||||
case Opt_data_prealloc_blocks:
|
||||
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
opts->data_prealloc_contig_only = nr;
|
||||
break;
|
||||
|
||||
case Opt_log_merge_wait_timeout_ms:
|
||||
ret = match_int(args, &nr);
|
||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
opts->log_merge_wait_timeout_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_metadev_path:
|
||||
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
|
||||
if (ret < 0)
|
||||
@@ -203,7 +233,7 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
break;
|
||||
|
||||
case Opt_noacl:
|
||||
sb->s_flags &= ~MS_POSIXACL;
|
||||
sb->s_flags &= ~SB_POSIXACL;
|
||||
break;
|
||||
|
||||
case Opt_orphan_scan_delay_ms:
|
||||
@@ -327,7 +357,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
|
||||
{
|
||||
struct super_block *sb = root->d_sb;
|
||||
struct scoutfs_mount_options opts;
|
||||
const bool is_acl = !!(sb->s_flags & MS_POSIXACL);
|
||||
const bool is_acl = !!(sb->s_flags & SB_POSIXACL);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
|
||||
}
|
||||
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
|
||||
|
||||
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
|
||||
}
|
||||
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_OPTIONS_INFO(sb, optinf);
|
||||
char nullterm[30]; /* more than enough for octal -U64_MAX */
|
||||
int val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtoint(nullterm, 0, &val);
|
||||
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
|
||||
if (ret == 0) {
|
||||
write_seqlock(&optinf->seqlock);
|
||||
optinf->opts.log_merge_wait_timeout_ms = val;
|
||||
write_sequnlock(&optinf->seqlock);
|
||||
ret = count;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
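The store handler publishes the new timeout under optinf->seqlock, so readers can take a consistent snapshot. A sketch of what the matching scoutfs_options_read() read side presumably looks like, inferred from the names used in these hunks rather than copied from the tree:

void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts)
{
	DECLARE_OPTIONS_INFO(sb, optinf);
	unsigned int seq;

	do {
		seq = read_seqbegin(&optinf->seqlock);
		*opts = optinf->opts;
	} while (read_seqretry(&optinf->seqlock, seq));
}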
|
||||
|
||||
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
|
||||
static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
struct scoutfs_mount_options {
|
||||
u64 data_prealloc_blocks;
|
||||
bool data_prealloc_contig_only;
|
||||
unsigned int log_merge_wait_timeout_ms;
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
@@ -183,7 +183,7 @@ static int create_socket(struct super_block *sb)
|
||||
int addrlen;
|
||||
int ret;
|
||||
|
||||
ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
|
||||
ret = kc_sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
|
||||
if (ret) {
|
||||
scoutfs_err(sb, "quorum couldn't create udp socket: %d", ret);
|
||||
goto out;
|
||||
@@ -243,8 +243,10 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
|
||||
};
|
||||
struct sockaddr_in sin;
|
||||
struct msghdr mh = {
|
||||
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
.msg_iov = (struct iovec *)&kv,
|
||||
.msg_iovlen = 1,
|
||||
#endif
|
||||
.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
|
||||
.msg_name = &sin,
|
||||
.msg_namelen = sizeof(sin),
|
||||
@@ -266,6 +268,9 @@ static int send_msg_members(struct super_block *sb, int type, u64 term, int only
|
||||
|
||||
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
|
||||
now = ktime_get();
|
||||
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
iov_iter_init(&mh.msg_iter, WRITE, (struct iovec *)&kv, sizeof(qmes), 1);
|
||||
#endif
|
||||
ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
|
||||
if (ret != kv.iov_len)
|
||||
failed++;
|
||||
@@ -308,8 +313,10 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
|
||||
.iov_len = sizeof(struct scoutfs_quorum_message),
|
||||
};
|
||||
struct msghdr mh = {
|
||||
#ifndef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
.msg_iov = (struct iovec *)&kv,
|
||||
.msg_iovlen = 1,
|
||||
#endif
|
||||
.msg_flags = MSG_NOSIGNAL,
|
||||
};
|
||||
|
||||
@@ -331,6 +338,9 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef KC_MSGHDR_STRUCT_IOV_ITER
|
||||
iov_iter_init(&mh.msg_iter, READ, (struct iovec *)&kv, sizeof(struct scoutfs_quorum_message), 1);
|
||||
#endif
|
||||
ret = kernel_recvmsg(qinf->sock, &mh, &kv, 1, kv.iov_len, mh.msg_flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
@@ -719,11 +729,13 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
struct quorum_status qst = {0,};
|
||||
struct hb_recording hbr = {{0,},};
|
||||
struct hb_recording hbr;
|
||||
bool record_hb;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
memset(&hbr, 0, sizeof(struct hb_recording));
|
||||
|
||||
/* recording votes from slots as native single word bitmap */
|
||||
BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);
|
||||
|
||||
@@ -771,8 +783,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
msg.type = SCOUTFS_QUORUM_MSG_INVALID;
|
||||
|
||||
trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
|
||||
qst.vote_bits,
|
||||
ktime_to_timespec64(qst.timeout));
|
||||
qst.vote_bits, ktime_to_ns(qst.timeout));
|
||||
|
||||
/* receiving greater terms resets term, becomes follower */
|
||||
if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
|
||||
|
||||
@@ -439,6 +439,7 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->journal_info = (unsigned long)journal_info;
|
||||
__entry->holders = holders;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
|
||||
@@ -1746,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
|
||||
sk_trace_args(end))
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_btree_merge_read_range,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
|
||||
int size),
|
||||
|
||||
TP_ARGS(sb, start, end, size),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
sk_trace_define(start)
|
||||
sk_trace_define(end)
|
||||
__field(int, size)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
sk_trace_assign(start, start);
|
||||
sk_trace_assign(end, end);
|
||||
__entry->size = size;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
|
||||
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
TP_PROTO(struct super_block *sb,
|
||||
struct scoutfs_btree_root *m_root,
|
||||
struct scoutfs_key *m_key, int m_val_len,
|
||||
struct scoutfs_btree_root *f_root,
|
||||
struct scoutfs_key *f_key, int f_val_len,
|
||||
int is_del),
|
||||
|
||||
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
|
||||
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, m_root_blkno)
|
||||
__field(__u64, m_root_seq)
|
||||
__field(__u8, m_root_height)
|
||||
sk_trace_define(m_key)
|
||||
__field(int, m_val_len)
|
||||
__field(__u64, f_root_blkno)
|
||||
@@ -1773,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->m_root_blkno = m_root ?
|
||||
le64_to_cpu(m_root->ref.blkno) : 0;
|
||||
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
|
||||
__entry->m_root_height = m_root ? m_root->height : 0;
|
||||
sk_trace_assign(m_key, m_key);
|
||||
__entry->m_val_len = m_val_len;
|
||||
__entry->f_root_blkno = f_root ?
|
||||
@@ -1788,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
|
||||
__entry->is_del = !!is_del;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
|
||||
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
|
||||
__entry->m_root_height, sk_trace_args(m_key),
|
||||
__entry->m_val_len, __entry->f_root_blkno,
|
||||
__entry->f_root_seq, __entry->f_root_height,
|
||||
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
|
||||
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
|
||||
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
|
||||
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
|
||||
);
|
||||
|
||||
@@ -2024,9 +2039,9 @@ DEFINE_EVENT(scoutfs_quorum_message_class, scoutfs_quorum_recv_message,
|
||||
|
||||
TRACE_EVENT(scoutfs_quorum_loop,
|
||||
TP_PROTO(struct super_block *sb, int role, u64 term, int vote_for,
|
||||
unsigned long vote_bits, struct timespec64 timeout),
|
||||
unsigned long vote_bits, unsigned long long nsecs),
|
||||
|
||||
TP_ARGS(sb, role, term, vote_for, vote_bits, timeout),
|
||||
TP_ARGS(sb, role, term, vote_for, vote_bits, nsecs),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
@@ -2035,8 +2050,7 @@ TRACE_EVENT(scoutfs_quorum_loop,
|
||||
__field(int, vote_for)
|
||||
__field(unsigned long, vote_bits)
|
||||
__field(unsigned long, vote_count)
|
||||
__field(unsigned long long, timeout_sec)
|
||||
__field(int, timeout_nsec)
|
||||
__field(unsigned long long, nsecs)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -2046,14 +2060,13 @@ TRACE_EVENT(scoutfs_quorum_loop,
|
||||
__entry->vote_for = vote_for;
|
||||
__entry->vote_bits = vote_bits;
|
||||
__entry->vote_count = hweight_long(vote_bits);
|
||||
__entry->timeout_sec = timeout.tv_sec;
|
||||
__entry->timeout_nsec = timeout.tv_nsec;
|
||||
__entry->nsecs = nsecs;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" term %llu role %d vote_for %d vote_bits 0x%lx vote_count %lu timeout %llu.%u",
|
||||
TP_printk(SCSBF" term %llu role %d vote_for %d vote_bits 0x%lx vote_count %lu timeout %llu",
|
||||
SCSB_TRACE_ARGS, __entry->term, __entry->role,
|
||||
__entry->vote_for, __entry->vote_bits, __entry->vote_count,
|
||||
__entry->timeout_sec, __entry->timeout_nsec)
|
||||
__entry->nsecs)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_seq_last,
|
||||
@@ -2077,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
|
||||
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_server_finalize_items,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
|
||||
u64 item_get_trans_seq),
|
||||
|
||||
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, c_rid)
|
||||
__field(__u64, item_rid)
|
||||
__field(__u64, item_nr)
|
||||
__field(__u64, item_flags)
|
||||
__field(__u64, item_get_trans_seq)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->c_rid = rid;
|
||||
__entry->item_rid = item_rid;
|
||||
__entry->item_nr = item_nr;
|
||||
__entry->item_flags = item_flags;
|
||||
__entry->item_get_trans_seq = item_get_trans_seq;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
|
||||
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
|
||||
__entry->item_flags, __entry->item_get_trans_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_server_finalize_decision,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
|
||||
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
|
||||
u64 finalize_sent_seq),
|
||||
|
||||
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
|
||||
finalize_sent_seq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, c_rid)
|
||||
__field(bool, saw_finalized)
|
||||
__field(bool, others_active)
|
||||
__field(bool, ours_visible)
|
||||
__field(bool, finalize_ours)
|
||||
__field(unsigned int, delay_ms)
|
||||
__field(__u64, finalize_sent_seq)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->c_rid = rid;
|
||||
__entry->saw_finalized = saw_finalized;
|
||||
__entry->others_active = others_active;
|
||||
__entry->ours_visible = ours_visible;
|
||||
__entry->finalize_ours = finalize_ours;
|
||||
__entry->delay_ms = delay_ms;
|
||||
__entry->finalize_sent_seq = finalize_sent_seq;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
|
||||
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
|
||||
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
|
||||
__entry->finalize_sent_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_get_log_merge_status,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
|
||||
u64 nr_requests, u64 nr_complete, u64 seq),
|
||||
@@ -2801,6 +2879,81 @@ TRACE_EVENT(scoutfs_omap_should_delete,
|
||||
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
|
||||
);
|
||||
|
||||
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
|
||||
#define SSCF_FIELDS(pref) \
|
||||
__field(__u64, pref##_blkno) \
|
||||
__field(__u64, pref##_blocks) \
|
||||
__field(__u64, pref##_entries)
|
||||
#define SSCF_ASSIGN(pref, sfl) \
|
||||
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
|
||||
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
|
||||
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
|
||||
#define SSCF_ENTRY_ARGS(pref) \
|
||||
__entry->pref##_blkno, \
|
||||
__entry->pref##_blocks, \
|
||||
__entry->pref##_entries
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
|
||||
|
||||
TP_ARGS(sb, sc),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, id)
|
||||
__field(__u8, nr)
|
||||
__field(__u8, flags)
|
||||
SSCF_FIELDS(out)
|
||||
__field(__u64, in0_blk)
|
||||
__field(__u64, in0_pos)
|
||||
SSCF_FIELDS(in0)
|
||||
__field(__u64, in1_blk)
|
||||
__field(__u64, in1_pos)
|
||||
SSCF_FIELDS(in1)
|
||||
__field(__u64, in2_blk)
|
||||
__field(__u64, in2_pos)
|
||||
SSCF_FIELDS(in2)
|
||||
__field(__u64, in3_blk)
|
||||
__field(__u64, in3_pos)
|
||||
SSCF_FIELDS(in3)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->id = le64_to_cpu(sc->id);
|
||||
__entry->nr = sc->nr;
|
||||
__entry->flags = sc->flags;
|
||||
SSCF_ASSIGN(out, &sc->out)
|
||||
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
|
||||
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
|
||||
SSCF_ASSIGN(in0, &sc->in[0].sfl)
|
||||
__entry->in1_blk = le64_to_cpu(sc->in[1].blk);
__entry->in1_pos = le64_to_cpu(sc->in[1].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[2].blk);
__entry->in2_pos = le64_to_cpu(sc->in[2].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[3].blk);
__entry->in3_pos = le64_to_cpu(sc->in[3].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
|
||||
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
|
||||
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
|
||||
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
|
||||
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
|
||||
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
|
||||
TP_ARGS(sb, sc)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
|
||||
TP_ARGS(sb, sc)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_SCOUTFS_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
||||
@@ -91,6 +91,7 @@ do { \
|
||||
struct server_info {
|
||||
struct super_block *sb;
|
||||
spinlock_t lock;
|
||||
seqlock_t seqlock;
|
||||
wait_queue_head_t waitq;
|
||||
|
||||
struct workqueue_struct *wq;
|
||||
@@ -132,11 +133,9 @@ struct server_info {
|
||||
struct mutex mounted_clients_mutex;
|
||||
|
||||
/* stable super stored from commits, given in locks and rpcs */
|
||||
seqcount_t stable_seqcount;
|
||||
struct scoutfs_super_block stable_super;
|
||||
|
||||
/* serializing and get and set volume options */
|
||||
seqcount_t volopt_seqcount;
|
||||
struct mutex volopt_mutex;
|
||||
struct scoutfs_volume_options volopt;
|
||||
|
||||
@@ -149,6 +148,8 @@ struct server_info {
|
||||
struct scoutfs_quorum_config qconf;
|
||||
/* a running server maintains a private dirty super */
|
||||
struct scoutfs_super_block dirty_super;
|
||||
|
||||
u64 finalize_sent_seq;
|
||||
};
|
||||
|
||||
#define DECLARE_SERVER_INFO(sb, name) \
|
||||
@@ -182,7 +183,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
|
||||
unsigned seq;
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->volopt_seqcount);
|
||||
seq = read_seqbegin(&server->seqlock);
|
||||
if ((le64_to_cpu(server->volopt.set_bits) & bit)) {
|
||||
is_set = true;
|
||||
*val = le64_to_cpup(opt);
|
||||
@@ -190,7 +191,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
|
||||
is_set = false;
|
||||
*val = 0;
|
||||
};
|
||||
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
|
||||
} while (read_seqretry(&server->seqlock, seq));
|
||||
|
||||
return is_set;
|
||||
}
|
||||
@@ -414,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
|
||||
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the higher of the avail or freed used by the active commit
|
||||
* since this holder joined the commit. This is *not* the amount used
|
||||
* by the holder, we don't track per-holder alloc use.
|
||||
*/
|
||||
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
u32 avail_used;
|
||||
u32 freed_used;
|
||||
u32 avail_now;
|
||||
u32 freed_now;
|
||||
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
|
||||
|
||||
avail_used = hold->avail - avail_now;
|
||||
freed_used = hold->freed - freed_now;
|
||||
|
||||
return max(avail_used, freed_used);
|
||||
}
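This helper feeds the batching check added to the log-merge free worker later in this diff; a usage sketch for context (COMMIT_HOLD_ALLOC_BUDGET is defined elsewhere in server.c and assumed here):

	/* apply the held commit once roughly half the budget is consumed */
	if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
		mutex_unlock(&server->logs_mutex);
		ret = server_apply_commit(sb, &hold, ret);
		commit = false;
	}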
|
||||
|
||||
/*
|
||||
* This is called while holding the commit and returns once the commit
|
||||
* is successfully written. Many holders can all wait for all holders
|
||||
@@ -506,7 +528,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
|
||||
unsigned int seq;
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->stable_seqcount);
|
||||
seq = read_seqbegin(&server->seqlock);
|
||||
if (super)
|
||||
*super = server->stable_super;
|
||||
if (roots) {
|
||||
@@ -514,7 +536,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
|
||||
roots->logs_root = server->stable_super.logs_root;
|
||||
roots->srch_root = server->stable_super.srch_root;
|
||||
}
|
||||
} while (read_seqcount_retry(&server->stable_seqcount, seq));
|
||||
} while (read_seqretry(&server->seqlock, seq));
|
||||
}
|
||||
|
||||
u64 scoutfs_server_seq(struct super_block *sb)
|
||||
@@ -548,11 +570,9 @@ void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)
|
||||
|
||||
static void set_stable_super(struct server_info *server, struct scoutfs_super_block *super)
|
||||
{
|
||||
preempt_disable();
|
||||
write_seqcount_begin(&server->stable_seqcount);
|
||||
write_seqlock(&server->seqlock);
|
||||
server->stable_super = *super;
|
||||
write_seqcount_end(&server->stable_seqcount);
|
||||
preempt_enable();
|
||||
write_sequnlock(&server->seqlock);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -941,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
|
||||
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
|
||||
* item, and -errno on fatal errors.
|
||||
* Find the log_trees item with the greatest nr for each rid. Fills the
|
||||
* caller's log_trees and sets the key before the returned log_trees for
|
||||
* the next iteration. Returns 0 when done, > 0 for each item, and
|
||||
* -errno on fatal errors.
|
||||
*/
|
||||
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
|
||||
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_btree_next(sb, root, key, &iref);
|
||||
ret = scoutfs_btree_prev(sb, root, key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
|
||||
memcpy(lt, iref.val, iref.val_len);
|
||||
*key = *iref.key;
|
||||
scoutfs_key_inc(key);
|
||||
key->sklt_nr = 0;
|
||||
scoutfs_key_dec(key);
|
||||
ret = 1;
|
||||
} else {
|
||||
ret = -EIO;
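A worked example of the key stepping above, with illustrative values:

/*
 * key = (rid U64_MAX, nr U64_MAX)  ->  _prev returns item (rid 42, nr 7)
 * rewind: key set to (rid 42, nr 0), then decremented to (rid 41, nr U64_MAX)
 * the next _prev lands on the greatest nr of the next lower rid, skipping
 * every other nr that belongs to rid 42
 */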
|
||||
@@ -1051,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
|
||||
* abandoned log btree finalized. If it takes too long each client has
* a chance to make forward progress before being asked to commit again.
|
||||
*
|
||||
* We're waiting on heavy state that is protected by mutexes and
|
||||
* transaction machinery. It's tricky to recreate that state for
|
||||
* lightweight condition tests that don't change task state. Instead of
|
||||
* trying to get that right, particularly as we unwind after success or
|
||||
* after timeouts, waiters use an unsatisfying poll. Short enough to
|
||||
* not add terrible latency, given how heavy and infrequent this already
|
||||
* is, and long enough to not melt the cpu. This could be tuned if it
|
||||
* becomes a problem.
|
||||
*
|
||||
* This can end up finalizing a new empty log btree if a new mount
|
||||
* happens to arrive at just the right time. That's fine, merging will
|
||||
* ignore and tear down the empty input.
|
||||
*/
|
||||
#define FINALIZE_POLL_MS (11)
|
||||
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
|
||||
#define FINALIZE_POLL_MIN_DELAY_MS 5U
|
||||
#define FINALIZE_POLL_MAX_DELAY_MS 100U
|
||||
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
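With those constants the poll interval grows like this before the log_merge_wait_timeout_ms deadline cuts the loop off (integer math, shown only as an illustration):

/* 5 -> 7 -> 10 -> 15 -> 22 -> 33 -> 49 -> 73 -> 100 -> 100 ... ms,
 * i.e. about nine sleeps to reach the 100ms cap, then steady polling */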
|
||||
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
|
||||
u64 rid, struct commit_hold *hold)
|
||||
{
|
||||
@@ -1073,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_status stat;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_log_trees each_lt;
|
||||
struct scoutfs_log_trees fin;
|
||||
unsigned int delay_ms;
|
||||
unsigned long timeo;
|
||||
bool saw_finalized;
|
||||
bool others_active;
|
||||
@@ -1082,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
bool ours_visible;
|
||||
struct scoutfs_key key;
|
||||
char *err_str = NULL;
|
||||
ktime_t start;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
|
||||
scoutfs_options_read(sb, &opts);
|
||||
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
|
||||
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
|
||||
start = ktime_get_raw();
|
||||
|
||||
for (;;) {
|
||||
/* nothing to do if there's already a merge in flight */
|
||||
@@ -1102,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
saw_finalized = false;
|
||||
others_active = false;
|
||||
ours_visible = false;
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
|
||||
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
|
||||
le64_to_cpu(each_lt.nr),
|
||||
le64_to_cpu(each_lt.flags),
|
||||
le64_to_cpu(each_lt.get_trans_seq));
|
||||
|
||||
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
|
||||
saw_finalized = true;
|
||||
@@ -1128,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
finalize_ours = (lt->item_root.height > 2) ||
|
||||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
|
||||
|
||||
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
|
||||
ours_visible, finalize_ours, delay_ms,
|
||||
server->finalize_sent_seq);
|
||||
|
||||
/* done if we're not finalizing and there's no finalized */
|
||||
if (!finalize_ours && !saw_finalized) {
|
||||
ret = 0;
|
||||
@@ -1135,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
}
|
||||
|
||||
/* send sync requests soon to give time to commit */
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while (others_active &&
|
||||
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
|
||||
|
||||
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
|
||||
(le64_to_cpu(each_lt.rid) == rid))
|
||||
(le64_to_cpu(each_lt.rid) == rid) ||
|
||||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
|
||||
continue;
|
||||
|
||||
ret = scoutfs_net_submit_request_node(sb, server->conn,
|
||||
@@ -1160,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
break;
|
||||
}
|
||||
|
||||
server->finalize_sent_seq = scoutfs_server_seq(sb);
|
||||
|
||||
/* Finalize ours if it's visible to others */
|
||||
if (ours_visible) {
|
||||
fin = *lt;
|
||||
@@ -1197,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
if (ret < 0)
|
||||
err_str = "applying commit before waiting for finalized";
|
||||
|
||||
msleep(FINALIZE_POLL_MS);
|
||||
msleep(delay_ms);
|
||||
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
|
||||
FINALIZE_POLL_MAX_DELAY_MS);
|
||||
|
||||
server_hold_commit(sb, hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
/* done if we timed out */
|
||||
if (time_after(jiffies, timeo)) {
|
||||
scoutfs_inc_counter(sb, log_merge_wait_timeout);
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
@@ -1786,43 +1821,29 @@ out:
|
||||
* Give the caller the last seq before outstanding client commits. All
|
||||
* seqs up to and including this are stable, new client transactions can
|
||||
* only have greater seqs.
|
||||
*
|
||||
* For each rid, only its greatest log trees nr can be an open commit.
|
||||
* We look at the last log_trees item for each client rid and record its
|
||||
* trans seq if it hasn't been committed.
|
||||
*/
|
||||
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
|
||||
{
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees *lt;
|
||||
struct scoutfs_log_trees lt;
|
||||
struct scoutfs_key key;
|
||||
u64 last_seq = 0;
|
||||
int ret;
|
||||
|
||||
last_seq = scoutfs_server_seq(sb) - 1;
|
||||
scoutfs_key_init_log_trees(&key, 0, 0);
|
||||
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
for (;; scoutfs_key_inc(&key)) {
|
||||
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
|
||||
if (ret == 0) {
|
||||
if (iref.val_len == sizeof(*lt)) {
|
||||
lt = iref.val;
|
||||
if ((le64_to_cpu(lt->get_trans_seq) >
|
||||
le64_to_cpu(lt->commit_trans_seq)) &&
|
||||
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
|
||||
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
|
||||
}
|
||||
key = *iref.key;
|
||||
} else {
|
||||
ret = -EIO;
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
}
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
|
||||
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, <)) > 0) {
|
||||
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
|
||||
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
|
||||
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1969,9 +1990,7 @@ static int server_srch_get_compact(struct super_block *sb,
|
||||
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
|
||||
&super->srch_root, rid, sc);
|
||||
mutex_unlock(&server->srch_mutex);
|
||||
if (ret == 0 && sc->nr == 0)
|
||||
ret = -ENOENT;
|
||||
if (ret < 0)
|
||||
if (ret < 0 || (ret == 0 && sc->nr == 0))
|
||||
goto apply;
|
||||
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
@@ -2476,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
|
||||
while (!server_is_stopping(server)) {
|
||||
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
commit = true;
|
||||
if (!commit) {
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
commit = true;
|
||||
}
|
||||
|
||||
ret = next_log_merge_item(sb, &super->log_merge,
|
||||
SCOUTFS_LOG_MERGE_FREEING_ZONE,
|
||||
@@ -2525,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
/* freed blocks are in allocator, we *have* to update fr */
|
||||
BUG_ON(ret < 0);
|
||||
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
commit = false;
|
||||
if (ret < 0) {
|
||||
err_str = "looping commit del/upd freeing item";
|
||||
break;
|
||||
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
commit = false;
|
||||
if (ret < 0) {
|
||||
err_str = "looping commit del/upd freeing item";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3073,9 +3096,9 @@ static int server_get_volopt(struct super_block *sb, struct scoutfs_net_connecti
|
||||
}
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->volopt_seqcount);
|
||||
seq = read_seqbegin(&server->seqlock);
|
||||
volopt = server->volopt;
|
||||
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
|
||||
} while (read_seqretry(&server->seqlock, seq));
|
||||
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, &volopt, sizeof(volopt));
|
||||
@@ -3144,12 +3167,12 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
|
||||
apply:
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
write_seqlock(&server->seqlock);
|
||||
if (ret == 0)
|
||||
server->volopt = super->volopt;
|
||||
else
|
||||
super->volopt = server->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
write_sequnlock(&server->seqlock);
|
||||
|
||||
mutex_unlock(&server->volopt_mutex);
|
||||
out:
|
||||
@@ -3192,12 +3215,12 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
|
||||
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
write_seqlock(&server->seqlock);
|
||||
if (ret == 0)
|
||||
server->volopt = super->volopt;
|
||||
else
|
||||
super->volopt = server->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
write_sequnlock(&server->seqlock);
|
||||
|
||||
mutex_unlock(&server->volopt_mutex);
|
||||
out:
|
||||
@@ -4303,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
server->finalize_sent_seq = 0;
|
||||
|
||||
/* first make sure no other servers are still running */
|
||||
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
|
||||
@@ -4336,9 +4360,9 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
}
|
||||
|
||||
/* update volume options early, possibly for use during startup */
|
||||
write_seqcount_begin(&server->volopt_seqcount);
|
||||
write_seqlock(&server->seqlock);
|
||||
server->volopt = super->volopt;
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
write_sequnlock(&server->seqlock);
|
||||
|
||||
atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
|
||||
set_stable_super(server, super);
|
||||
@@ -4464,7 +4488,7 @@ void scoutfs_server_stop_wait(struct super_block *sb)
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
stop_server(server);
|
||||
flush_work_sync(&server->work);
|
||||
flush_work(&server->work);
|
||||
}
|
||||
|
||||
int scoutfs_server_setup(struct super_block *sb)
|
||||
@@ -4478,6 +4502,7 @@ int scoutfs_server_setup(struct super_block *sb)
|
||||
|
||||
server->sb = sb;
|
||||
spin_lock_init(&server->lock);
|
||||
seqlock_init(&server->seqlock);
|
||||
init_waitqueue_head(&server->waitq);
|
||||
INIT_WORK(&server->work, scoutfs_server_worker);
|
||||
server->status = SERVER_DOWN;
|
||||
@@ -4492,8 +4517,6 @@ int scoutfs_server_setup(struct super_block *sb)
|
||||
INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
|
||||
mutex_init(&server->srch_mutex);
|
||||
mutex_init(&server->mounted_clients_mutex);
|
||||
seqcount_init(&server->stable_seqcount);
|
||||
seqcount_init(&server->volopt_seqcount);
|
||||
mutex_init(&server->volopt_mutex);
|
||||
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
|
||||
INIT_DELAYED_WORK(&server->reclaim_dwork, reclaim_worker);
|
||||
|
||||
kmod/src/srch.c
@@ -30,6 +30,9 @@
|
||||
#include "client.h"
|
||||
#include "counters.h"
|
||||
#include "scoutfs_trace.h"
|
||||
#include "triggers.h"
|
||||
#include "sysfs.h"
|
||||
#include "msg.h"
|
||||
|
||||
/*
|
||||
* This srch subsystem gives us a way to find inodes that have a given
|
||||
@@ -68,10 +71,14 @@ struct srch_info {
|
||||
atomic_t shutdown;
|
||||
struct workqueue_struct *workq;
|
||||
struct delayed_work compact_dwork;
|
||||
struct scoutfs_sysfs_attrs ssa;
|
||||
atomic_t compact_delay_ms;
|
||||
};
|
||||
|
||||
#define DECLARE_SRCH_INFO(sb, name) \
|
||||
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
|
||||
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
|
||||
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
|
||||
|
||||
#define SRE_FMT "%016llx.%llu.%llu"
|
||||
#define SRE_ARG(sre) \
|
||||
@@ -520,6 +527,95 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Padded entries are encoded in pairs after an existing entry. All of
|
||||
* the pairs cancel each other out for all readers (the second encoding
|
||||
* looks like deletion) so they aren't visible to the first/last bounds of
|
||||
* the block or file.
|
||||
*/
|
||||
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
|
||||
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
|
||||
sre, &srb->tail);
|
||||
if (ret > 0) {
|
||||
srb->tail = *sre;
|
||||
le32_add_cpu(&srb->entry_nr, 1);
|
||||
le32_add_cpu(&srb->entry_bytes, ret);
|
||||
le64_add_cpu(&sfl->entries, 1);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called by a testing trigger to create a very specific case of
|
||||
* encoded entry offsets. We want the last entry in the block to start
|
||||
* precisely at the _SAFE_BYTES offset.
|
||||
*
|
||||
* This is called when there is a single existing entry in the block.
|
||||
* We have the entire block to work with. We encode pairs of matching
|
||||
* entries. This hides them from readers (both searches and merging) as
|
||||
* they're interpreted as creation and deletion and are deleted. We use
|
||||
* the existing hash value of the first entry in the block but then set
|
||||
* the inode to an impossibly large number so it doesn't interfere with
|
||||
* anything.
|
||||
*
|
||||
* To hit the specific offset we very carefully manage the amount of
|
||||
* bytes of change between fields in the entry. We know that if we
|
||||
* change all the bytes of the ino and id we end up with a 20 byte
|
||||
* (2+8+8,2) encoding of the pair of entries. To have the last entry
|
||||
* start at the _SAFE_POS offset we know that the final 20 byte pair
|
||||
* encoding needs to end at 2 bytes (second entry encoding) after the
|
||||
* _SAFE_POS offset.
|
||||
*
|
||||
* So as we encode pairs we watch the delta of our current offset from
|
||||
* that desired final offset of 2 past _SAFE_POS. If we're a multiple
|
||||
* of 20 away then we encode the full 20 byte pairs. If we're not, then
|
||||
* we drop a byte to encode 19 bytes. That'll slowly change the offset
|
||||
* to be a multiple of 20 again while encoding large entries.
|
||||
*/
|
||||
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
|
||||
struct scoutfs_srch_block *srb)
|
||||
{
|
||||
struct scoutfs_srch_entry sre;
|
||||
u32 target;
|
||||
s32 diff;
|
||||
u64 hash;
|
||||
u64 ino;
|
||||
u64 id;
|
||||
int ret;
|
||||
|
||||
hash = le64_to_cpu(srb->tail.hash);
|
||||
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
|
||||
id = le64_to_cpu(srb->tail.id);
|
||||
|
||||
target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
|
||||
|
||||
while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
|
||||
ino ^= 1ULL << (7 * 8);
|
||||
if (diff % 20 == 0) {
|
||||
id ^= 1ULL << (7 * 8);
|
||||
} else {
|
||||
id ^= 1ULL << (6 * 8);
|
||||
}
|
||||
|
||||
sre.hash = cpu_to_le64(hash);
|
||||
sre.ino = cpu_to_le64(ino);
|
||||
sre.id = cpu_to_le64(id);
|
||||
|
||||
ret = append_padded_entry(sfl, blk, srb, &sre);
|
||||
if (ret == 0)
|
||||
ret = append_padded_entry(sfl, blk, srb, &sre);
|
||||
BUG_ON(ret != 0);
|
||||
|
||||
diff = target - le32_to_cpu(srb->entry_bytes);
|
||||
}
|
||||
}
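A worked example of the mod-20 convergence described above, using an illustrative starting distance:

/*
 * Suppose diff (target - entry_bytes) starts at 443, so diff % 20 == 3.
 * Seventeen 19-byte pairs bring diff to 443 - 17*19 = 120, a multiple of
 * 20, and six full 20-byte pairs then land entry_bytes exactly on target.
 */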
|
||||
|
||||
/*
|
||||
* The caller is dropping an ino/id because the tracking rbtree is full.
|
||||
* This loses information so we can't return any entries at or after the
|
||||
@@ -987,6 +1083,9 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
|
||||
force = true;
|
||||
|
||||
if (sfl->ref.blkno == 0 ||
|
||||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
|
||||
return 0;
|
||||
@@ -1462,7 +1561,7 @@ static int kway_merge(struct super_block *sb,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_srch_file *sfl,
|
||||
kway_get_t kway_get, kway_advance_t kway_adv,
|
||||
void **args, int nr)
|
||||
void **args, int nr, bool logs_input)
|
||||
{
|
||||
DECLARE_SRCH_INFO(sb, srinf);
|
||||
struct scoutfs_srch_block *srb = NULL;
|
||||
@@ -1567,6 +1666,15 @@ static int kway_merge(struct super_block *sb,
|
||||
blk++;
|
||||
}
|
||||
|
||||
/* end sorted block on _SAFE offset for testing */
|
||||
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
|
||||
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
|
||||
pad_entries_at_safe(sfl, blk, srb);
|
||||
scoutfs_block_put(sb, bl);
|
||||
bl = NULL;
|
||||
blk++;
|
||||
}
|
||||
|
||||
scoutfs_inc_counter(sb, srch_compact_entry);
|
||||
|
||||
} else {
|
||||
@@ -1609,6 +1717,8 @@ static int kway_merge(struct super_block *sb,
|
||||
empty++;
|
||||
ret = 0;
|
||||
} else if (ret < 0) {
|
||||
if (ret == -ENOANO) /* just testing trigger */
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -1747,7 +1857,7 @@ static int compact_logs(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
page->private = 0;
|
||||
list_add_tail(&page->list, &pages);
|
||||
list_add_tail(&page->lru, &pages);
|
||||
nr_pages++;
|
||||
scoutfs_inc_counter(sb, srch_compact_log_page);
|
||||
}
|
||||
@@ -1800,7 +1910,7 @@ static int compact_logs(struct super_block *sb,
|
||||
|
||||
/* sort page entries and reset private for _next */
|
||||
i = 0;
|
||||
list_for_each_entry(page, &pages, list) {
|
||||
list_for_each_entry(page, &pages, lru) {
|
||||
args[i++] = page;
|
||||
|
||||
if (atomic_read(&srinf->shutdown)) {
|
||||
@@ -1816,12 +1926,12 @@ static int compact_logs(struct super_block *sb,
|
||||
}
|
||||
|
||||
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
|
||||
args, nr_pages);
|
||||
args, nr_pages, true);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* make sure we finished all the pages */
|
||||
list_for_each_entry(page, &pages, list) {
|
||||
list_for_each_entry(page, &pages, lru) {
|
||||
sre = page_priv_sre(page);
|
||||
if (page->private < SRES_PER_PAGE && sre->ino != 0) {
|
||||
ret = -ENOSPC;
|
||||
@@ -1834,8 +1944,8 @@ static int compact_logs(struct super_block *sb,
|
||||
out:
|
||||
scoutfs_block_put(sb, bl);
|
||||
vfree(args);
|
||||
list_for_each_entry_safe(page, tmp, &pages, list) {
|
||||
list_del(&page->list);
|
||||
list_for_each_entry_safe(page, tmp, &pages, lru) {
|
||||
list_del(&page->lru);
|
||||
__free_page(page);
|
||||
}
|
||||
|
||||
@@ -1874,12 +1984,18 @@ static int kway_get_reader(struct super_block *sb,
|
||||
srb = rdr->bl->data;
|
||||
|
||||
if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
|
||||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
|
||||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
|
||||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
|
||||
/* XXX inconsistency */
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
|
||||
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
|
||||
/* only used in testing */
|
||||
return -ENOANO;
|
||||
}
|
||||
|
||||
/* decode entry, possibly skipping start of the block */
|
||||
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
|
||||
ret = decode_entry(srb->entries + rdr->pos,
|
||||
@@ -1969,7 +2085,7 @@ static int compact_sorted(struct super_block *sb,
|
||||
}
|
||||
|
||||
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
|
||||
kway_adv_reader, args, nr);
|
||||
kway_adv_reader, args, nr, false);
|
||||
|
||||
sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
|
||||
for (i = 0; i < nr; i++) {
|
||||
@@ -2098,8 +2214,15 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait 10s between compact attempts on error, immediate after success */
|
||||
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
static void queue_compact_work(struct srch_info *srinf, bool immediate)
|
||||
{
|
||||
unsigned long delay;
|
||||
|
||||
if (!atomic_read(&srinf->shutdown)) {
|
||||
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
|
||||
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a compaction operation from the server, sort the entries from the
|
||||
@@ -2127,7 +2250,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
|
||||
struct super_block *sb = srinf->sb;
|
||||
struct scoutfs_block_writer wri;
|
||||
struct scoutfs_alloc alloc;
|
||||
unsigned long delay;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
@@ -2140,6 +2262,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
|
||||
scoutfs_block_writer_init(sb, &wri);
|
||||
|
||||
ret = scoutfs_client_srch_get_compact(sb, sc);
|
||||
if (ret >= 0)
|
||||
trace_scoutfs_srch_compact_client_recv(sb, sc);
|
||||
if (ret < 0 || sc->nr == 0)
|
||||
goto out;
|
||||
|
||||
@@ -2168,6 +2292,7 @@ commit:
|
||||
sc->meta_freed = alloc.freed;
|
||||
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
|
||||
|
||||
trace_scoutfs_srch_compact_client_send(sb, sc);
|
||||
err = scoutfs_client_srch_commit_compact(sb, sc);
|
||||
if (err < 0 && ret == 0)
|
||||
ret = err;
|
||||
@@ -2178,14 +2303,56 @@ out:
|
||||
scoutfs_inc_counter(sb, srch_compact_error);
|
||||
|
||||
scoutfs_block_writer_forget_all(sb, &wri);
|
||||
if (!atomic_read(&srinf->shutdown)) {
|
||||
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
|
||||
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
|
||||
}
|
||||
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
|
||||
|
||||
kfree(sc);
|
||||
}
|
||||
|
||||
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
|
||||
}
|
||||
|
||||
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
|
||||
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
|
||||
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
DECLARE_SRCH_INFO(sb, srinf);
|
||||
char nullterm[30]; /* more than enough for octal -U64_MAX */
|
||||
u64 val;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
len = min(count, sizeof(nullterm) - 1);
|
||||
memcpy(nullterm, buf, len);
|
||||
nullterm[len] = '\0';
|
||||
|
||||
ret = kstrtoll(nullterm, 0, &val);
|
||||
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
|
||||
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
|
||||
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
atomic_set(&srinf->compact_delay_ms, val);
|
||||
cancel_delayed_work(&srinf->compact_dwork);
|
||||
queue_compact_work(srinf, false);
|
||||
|
||||
return count;
|
||||
}
|
||||
SCOUTFS_ATTR_RW(compact_delay_ms);
|
||||
|
||||
static struct attribute *srch_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(compact_delay_ms),
|
||||
NULL,
|
||||
};
|
||||
|
||||
void scoutfs_srch_destroy(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
@@ -2202,6 +2369,8 @@ void scoutfs_srch_destroy(struct super_block *sb)
|
||||
destroy_workqueue(srinf->workq);
|
||||
}
|
||||
|
||||
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
|
||||
|
||||
kfree(srinf);
|
||||
sbi->srch_info = NULL;
|
||||
}
|
||||
@@ -2219,8 +2388,15 @@ int scoutfs_srch_setup(struct super_block *sb)
|
||||
srinf->sb = sb;
|
||||
atomic_set(&srinf->shutdown, 0);
|
||||
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
|
||||
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
|
||||
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
|
||||
|
||||
sbi->srch_info = srinf;
|
||||
|
||||
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
|
||||
WQ_NON_REENTRANT | WQ_UNBOUND |
|
||||
WQ_HIGHPRI, 0);
|
||||
@@ -2229,8 +2405,7 @@ int scoutfs_srch_setup(struct super_block *sb)
|
||||
goto out;
|
||||
}
|
||||
|
||||
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
|
||||
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
|
||||
queue_compact_work(srinf, false);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/magic.h>
|
||||
@@ -178,7 +179,7 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
/*
|
||||
* Wait for invalidation and iput to finish with any lingering
|
||||
* inode references that escaped the evict_inodes in
|
||||
* generic_shutdown_super. MS_ACTIVE is clear so final iput
|
||||
* generic_shutdown_super. SB_ACTIVE is clear so final iput
|
||||
* will always evict.
|
||||
*/
|
||||
scoutfs_lock_flush_invalidate(sb);
|
||||
@@ -485,7 +486,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sb->s_d_op = &scoutfs_dentry_ops;
|
||||
sb->s_export_op = &scoutfs_export_ops;
|
||||
sb->s_xattr = scoutfs_xattr_handlers;
|
||||
sb->s_flags |= MS_I_VERSION | MS_POSIXACL;
|
||||
sb->s_flags |= SB_I_VERSION | SB_POSIXACL;
|
||||
sb->s_time_gran = 1;
|
||||
|
||||
/* btree blocks use long lived bh->b_data refs */
|
||||
@@ -674,14 +675,14 @@ out:
|
||||
teardown_module();
|
||||
return ret;
|
||||
}
|
||||
module_init(scoutfs_module_init)
|
||||
module_init(scoutfs_module_init);
|
||||
|
||||
static void __exit scoutfs_module_exit(void)
|
||||
{
|
||||
unregister_filesystem(&scoutfs_fs_type);
|
||||
teardown_module();
|
||||
}
|
||||
module_exit(scoutfs_module_exit)
|
||||
module_exit(scoutfs_module_exit);
|
||||
|
||||
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
@@ -39,6 +39,9 @@ struct scoutfs_triggers {
|
||||
|
||||
static char *names[] = {
|
||||
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
|
||||
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
|
||||
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
|
||||
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
|
||||
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
|
||||
};
|
||||
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
|
||||
enum scoutfs_trigger {
|
||||
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
|
||||
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
|
||||
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
|
||||
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
|
||||
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
|
||||
SCOUTFS_TRIGGER_NR,
|
||||
};
|
||||
|
||||
@@ -46,6 +46,23 @@ static struct scoutfs_tseq_entry *tseq_rb_next(struct scoutfs_tseq_entry *ent)
	return rb_entry(node, struct scoutfs_tseq_entry, node);
}

#ifdef KC_RB_TREE_AUGMENTED_COMPUTE_MAX
static bool tseq_compute_total(struct scoutfs_tseq_entry *ent, bool exit)
{
	loff_t total = 1 + tseq_node_total(ent->node.rb_left) +
		       tseq_node_total(ent->node.rb_right);

	if (exit && ent->total == total)
		return true;

	ent->total = total;
	return false;
}

RB_DECLARE_CALLBACKS(static, tseq_rb_callbacks, struct scoutfs_tseq_entry,
		     node, total, tseq_compute_total);
#else

static loff_t tseq_compute_total(struct scoutfs_tseq_entry *ent)
{
	return 1 + tseq_node_total(ent->node.rb_left) +
@@ -53,7 +70,8 @@ static loff_t tseq_compute_total(struct scoutfs_tseq_entry *ent)
}

RB_DECLARE_CALLBACKS(static, tseq_rb_callbacks, struct scoutfs_tseq_entry,
		     node, loff_t, total, tseq_compute_total)
		     node, loff_t, total, tseq_compute_total);
#endif

void scoutfs_tseq_tree_init(struct scoutfs_tseq_tree *tree,
			    scoutfs_tseq_show_t show)
@@ -17,4 +17,15 @@ static inline void down_write_two(struct rw_semaphore *a,
	down_write_nested(b, SINGLE_DEPTH_NESTING);
}

/*
 * When returning shrinker counts from scan_objects, we should steer
 * clear of the magic SHRINK_STOP and SHRINK_EMPTY values, which are near
 * ~0UL values. Hence, we cap count to LONG_MAX, which is arbitrarily high
 * enough to avoid them.
 */
static inline long shrinker_min_long(long count)
{
	return min(count, LONG_MAX);
}

#endif
@@ -773,7 +773,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_

	/* XXX do these want i_mutex or anything? */
	inode_inc_iversion(inode);
	inode->i_ctime = CURRENT_TIME;
	inode->i_ctime = current_time(inode);
	ret = 0;

out:
@@ -850,6 +850,7 @@ unlock:
	return ret;
}

#ifndef KC_XATTR_STRUCT_XATTR_HANDLER
/*
 * Future kernels have this amazing hack to rewind the name to get the
 * skipped prefix. We're back in the stone ages without the handler
@@ -857,22 +858,41 @@ unlock:
 * compat hook to either call the kernel's xattr_full_name(handler), or
 * our hack to use the flags as the prefix length.
 */
static const char *full_name_hack(void *handler, const char *name, int len)
static const char *full_name_hack(const char *name, int len)
{
	return name - len;
}
#endif

static int scoutfs_xattr_get_handler(struct dentry *dentry, const char *name,
				     void *value, size_t size, int handler_flags)
static int scoutfs_xattr_get_handler
#ifdef KC_XATTR_STRUCT_XATTR_HANDLER
	(const struct xattr_handler *handler, struct dentry *dentry,
	 struct inode *inode, const char *name, void *value,
	 size_t size)
{
	name = full_name_hack(NULL, name, handler_flags);
	name = xattr_full_name(handler, name);
#else
	(struct dentry *dentry, const char *name,
	 void *value, size_t size, int handler_flags)
{
	name = full_name_hack(name, handler_flags);
#endif
	return scoutfs_xattr_get(dentry, name, value, size);
}

static int scoutfs_xattr_set_handler(struct dentry *dentry, const char *name,
				     const void *value, size_t size, int flags, int handler_flags)
static int scoutfs_xattr_set_handler
#ifdef KC_XATTR_STRUCT_XATTR_HANDLER
	(const struct xattr_handler *handler, struct dentry *dentry,
	 struct inode *inode, const char *name, const void *value,
	 size_t size, int flags)
{
	name = full_name_hack(NULL, name, handler_flags);
	name = xattr_full_name(handler, name);
#else
	(struct dentry *dentry, const char *name,
	 const void *value, size_t size, int flags, int handler_flags)
{
	name = full_name_hack(name, handler_flags);
#endif
	return scoutfs_xattr_set(dentry, name, value, size, flags);
}

@@ -905,14 +925,22 @@ static const struct xattr_handler scoutfs_xattr_security_handler = {
};

static const struct xattr_handler scoutfs_xattr_acl_access_handler = {
#ifdef KC_XATTR_HANDLER_NAME
	.name = XATTR_NAME_POSIX_ACL_ACCESS,
#else
	.prefix = XATTR_NAME_POSIX_ACL_ACCESS,
#endif
	.flags = ACL_TYPE_ACCESS,
	.get = scoutfs_acl_get_xattr,
	.set = scoutfs_acl_set_xattr,
};

static const struct xattr_handler scoutfs_xattr_acl_default_handler = {
#ifdef KC_XATTR_HANDLER_NAME
	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
#else
	.prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
#endif
	.flags = ACL_TYPE_DEFAULT,
	.get = scoutfs_acl_get_xattr,
	.set = scoutfs_acl_set_xattr,
@@ -25,8 +25,9 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. Currently it will create free loop devices and will mount on
/mnt/test.[0-9].
tested. It will create predictable device mapper devices and mount
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.

All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -104,8 +105,8 @@ used during the test.

| Variable         | Description         | Origin          | Example           |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9]       | per-mount meta bdev | created per run | /dev/loop0        |
| T\_DB[0-9]       | per-mount data bdev | created per run | /dev/loop1        |
| T\_MB[0-9]       | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9]       | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_D[0-9]        | per-mount test dir  | made for test   | /mnt/test.[0-9]/t |
| T\_META\_DEVICE  | main FS meta bdev   | -M              | /dev/vda          |
| T\_DATA\_DEVICE  | main FS data bdev   | -D              | /dev/vdb          |
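The table above documents the per-run variables; as a concrete illustration, a hypothetical invocation of the test script (the script name and device paths are placeholders, -M/-D/-I are the options described above) could look like:

    ./run-tests.sh -M /dev/vda -D /dev/vdb -I 'srch-.*'

which would run only the tests whose names match the srch- prefix against the given metadata and data devices.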
@@ -35,7 +35,7 @@ t_fail()
t_quiet()
{
	echo "# $*" >> "$T_TMPDIR/quiet.log"
	"$@" > "$T_TMPDIR/quiet.log" 2>&1 || \
	"$@" >> "$T_TMPDIR/quiet.log" 2>&1 || \
		t_fail "quiet command failed"
}
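The t_quiet() change above appends the command's output instead of truncating quiet.log, so the command line echoed on the previous line survives. A tiny sketch of the difference (the log path is a placeholder):

    echo "# some command" >> /tmp/quiet.log
    true >  /tmp/quiet.log    # truncates: the "# some command" header is lost
    true >> /tmp/quiet.log    # appends: the header stays above the command output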
@@ -6,6 +6,61 @@ t_filter_fs()
		-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}

#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
	awk '
		BEGIN {
			in_soob = 0
			soob_nr = 0
		}
		( !in_soob && $0 ~ /==================================================================/ ) {
			in_soob = 1
			soob_nr = NR
			saved = $0
		}
		( in_soob == 1 && NR == (soob_nr + 1) ) {
			if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
				in_soob = 2
			} else {
				in_soob = 0
				print saved
			}
			saved=""
		}
		( in_soob == 2 && $0 ~ /==================================================================/ ) {
			in_soob = 3
			soob_nr = NR
		}
		( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
			in_soob = 0
		}
		( !in_soob ) { print $0 }
		END {
			if (saved) {
				print saved
			}
		}
	'
}

#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures.. they're
@@ -85,5 +140,13 @@ t_filter_dmesg()
	re="$re|scoutfs .* error.*server failed to bind to.*"
	re="$re|scoutfs .* critical transaction commit failure.*"

	egrep -v "($re)"
	# change-devices causes loop device resizing
	re="$re|loop: module loaded"
	re="$re|loop[0-9].* detected capacity change from.*"

	# ignore systemd-journal rotating
	re="$re|systemd-journald.*"

	egrep -v "($re)" | \
		ignore_harmless_unwind_kasan_stack_oob
}
@@ -265,6 +265,15 @@ t_trigger_get() {
	cat "$(t_trigger_path "$nr")/$which"
}

t_trigger_set() {
	local which="$1"
	local nr="$2"
	local val="$3"
	local path=$(t_trigger_path "$nr")

	echo "$val" > "$path/$which"
}

t_trigger_show() {
	local which="$1"
	local string="$2"
@@ -276,9 +285,8 @@ t_trigger_show() {
t_trigger_arm_silent() {
	local which="$1"
	local nr="$2"
	local path=$(t_trigger_path "$nr")

	echo 1 > "$path/$which"
	t_trigger_set "$which" "$nr" 1
}

t_trigger_arm() {
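t_trigger_arm_silent() is now just a thin wrapper around t_trigger_set(). A small usage sketch built on these helpers (the trigger name and mount number are only examples, taken from the trigger table earlier in this change):

    # arm the statfs purge trigger on mount 0, then poll until it fires
    t_trigger_set statfs_lock_purge 0 1
    while [ "$(t_trigger_get statfs_lock_purge 0)" == "1" ]; do
            sleep .5
    done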
@@ -47,7 +47,7 @@ four
--- dir within dir
--- overwrite file
--- can't overwrite non-empty dir
mv: cannot move ‘/mnt/test/test/basic-posix-consistency/dir/c/clobber’ to ‘/mnt/test/test/basic-posix-consistency/dir/a/dir’: Directory not empty
mv: cannot move '/mnt/test/test/basic-posix-consistency/dir/c/clobber' to '/mnt/test/test/basic-posix-consistency/dir/a/dir': Directory not empty
--- can overwrite empty dir
--- can rename into root
== path resoluion

@@ -1,3 +1,4 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup

@@ -17,7 +17,7 @@ ino not found in dseq index
mount 0 contents after mount 1 rm: contents
ino found in dseq index
ino found in dseq index
stat: cannot stat ‘/mnt/test/test/inode-deletion/file’: No such file or directory
stat: cannot stat '/mnt/test/test/inode-deletion/file': No such file or directory
ino not found in dseq index
ino not found in dseq index
== lots of deletions use one open map

@@ -1,3 +1,4 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

@@ -20,10 +20,10 @@ offline waiting should now have two known entries:
data_wait_err found 2 waiters.
offline waiting should now have 0 known entries:
0
dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
dd: error reading '/mnt/test/test/offline-extent-waiting/dir/file': Input/output error
0+0 records in
0+0 records out
dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
dd: error reading '/mnt/test/test/offline-extent-waiting/dir/file': Input/output error
0+0 records in
0+0 records out
offline waiting should be empty again:
37	tests/golden/srch-safe-merge-pos	Normal file
@@ -0,0 +1,37 @@
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup
@@ -241,7 +241,6 @@ generic/312
generic/314
generic/316
generic/317
generic/318
generic/324
generic/326
generic/327
@@ -1,5 +1,8 @@
#!/usr/bin/bash

# Force system tools to use ASCII quotes
export LC_ALL=C

#
# XXX
# - could have helper functions for waiting for pids
@@ -323,16 +326,10 @@ unmount_all() {
		cmd wait $p
	done

	# delete all temp meta devices
	for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
		if [ -e "$dev" ]; then
			cmd losetup -d "$dev"
		fi
	done
	# delete all temp data devices
	for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
		if [ -e "$dev" ]; then
			cmd losetup -d "$dev"
	# delete all temp devices
	for dev in /dev/mapper/_scoutfs_test_*; do
		if [ -b "$dev" ]; then
			cmd dmsetup remove $dev
		fi
	done
}
@@ -431,6 +428,12 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"

# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
	$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
	$T_RESULTS/dmtable.data

#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -439,10 +442,13 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do

	meta_dev=$(losetup --find --show $T_META_DEVICE)
	test -b "$meta_dev" || die "failed to create temp device $meta_dev"
	data_dev=$(losetup --find --show $T_DATA_DEVICE)
	test -b "$data_dev" || die "failed to create temp device $data_dev"
	name="_scoutfs_test_meta_$i"
	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
	meta_dev="/dev/mapper/$name"

	name="_scoutfs_test_data_$i"
	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
	data_dev="/dev/mapper/$name"

	dir="/mnt/test.$i"
	test -d "$dir" || cmd mkdir -p "$dir"
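The per-mount device setup above amounts to wrapping each backing device in a linear device-mapper target with a predictable name. A standalone sketch of the same idea, assuming a throwaway /dev/vda and the meta naming convention used by the harness:

    dev=/dev/vda
    echo "0 $(blockdev --getsz $dev) linear $dev 0" | \
            dmsetup create _scoutfs_test_meta_0
    # the mountable device then appears as /dev/mapper/_scoutfs_test_meta_0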
@@ -14,6 +14,7 @@ offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
totl-xattr-tag.sh
@@ -1,6 +1,7 @@
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -35,10 +36,10 @@ struct opts {
|
||||
unsigned int dry_run:1,
|
||||
ls_output:1,
|
||||
quiet:1,
|
||||
user_xattr:1,
|
||||
same_srch_xattr:1,
|
||||
group_srch_xattr:1,
|
||||
unique_srch_xattr:1;
|
||||
xattr_set:1,
|
||||
xattr_file:1,
|
||||
xattr_group:1;
|
||||
char *xattr_name;
|
||||
};
|
||||
|
||||
struct stats {
|
||||
@@ -149,12 +150,31 @@ static void free_dir(struct dir *dir)
|
||||
free(dir);
|
||||
}
|
||||
|
||||
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
int ret;
|
||||
|
||||
if (off >= sz)
|
||||
return sz;
|
||||
|
||||
va_start(ap, fmt);
|
||||
ret = vsnprintf(buf + off, sz - off, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
if (ret <= 0)
|
||||
return sz;
|
||||
|
||||
return off + ret;
|
||||
}
|
||||
|
||||
static void create_dir(struct dir *dir, struct opts *opts,
|
||||
struct stats *stats)
|
||||
{
|
||||
struct str_list *s;
|
||||
char name[100];
|
||||
char name[256]; /* max len and null term */
|
||||
char val = 'v';
|
||||
size_t off;
|
||||
int rc;
|
||||
int i;
|
||||
|
||||
@@ -175,29 +195,21 @@ static void create_dir(struct dir *dir, struct opts *opts,
|
||||
rc = mknod(s->str, S_IFREG | 0644, 0);
|
||||
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
|
||||
|
||||
rc = 0;
|
||||
if (rc == 0 && opts->user_xattr) {
|
||||
strcpy(name, "user.scoutfs_bcp");
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->same_srch_xattr) {
|
||||
strcpy(name, "scoutfs.srch.scoutfs_bcp");
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->group_srch_xattr) {
|
||||
snprintf(name, sizeof(name),
|
||||
"scoutfs.srch.scoutfs_bcp.group.%lu",
|
||||
stats->files / 10000);
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
}
|
||||
if (rc == 0 && opts->unique_srch_xattr) {
|
||||
snprintf(name, sizeof(name),
|
||||
"scoutfs.srch.scoutfs_bcp.unique.%lu",
|
||||
stats->files);
|
||||
if (opts->xattr_set) {
|
||||
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
|
||||
if (opts->xattr_file)
|
||||
off = snprintf_off(name, sizeof(name), off,
|
||||
"-f-%lu", stats->files);
|
||||
if (opts->xattr_group)
|
||||
off = snprintf_off(name, sizeof(name), off,
|
||||
"-g-%lu", stats->files / 10000);
|
||||
|
||||
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
|
||||
|
||||
rc = setxattr(s->str, name, &val, 1, 0);
|
||||
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
|
||||
}
|
||||
|
||||
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
|
||||
|
||||
stats->files++;
|
||||
rate_banner(opts, stats);
|
||||
@@ -365,11 +377,10 @@ static void usage(void)
|
||||
" -d DIR | create all files in DIR top level directory\n"
|
||||
" -n | dry run, only parse, don't create any files\n"
|
||||
" -q | quiet, don't regularly print rates\n"
|
||||
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
|
||||
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
|
||||
" -L | parse ls output; only reg, skip meta, paths at ./\n"
|
||||
" -X | set the same user. xattr name in all files\n"
|
||||
" -S | set the same .srch. xattr name in all files\n"
|
||||
" -G | set a .srch. xattr name shared by groups of files\n"
|
||||
" -U | set a unique .srch. xattr name in all files\n");
|
||||
" -X NAM | set named xattr in all files\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
@@ -386,7 +397,7 @@ int main(int argc, char **argv)
|
||||
|
||||
memset(&opts, 0, sizeof(opts));
|
||||
|
||||
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
|
||||
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
|
||||
switch(c) {
|
||||
case 'd':
|
||||
top_dir = strdup(optarg);
|
||||
@@ -397,20 +408,19 @@ int main(int argc, char **argv)
|
||||
case 'q':
|
||||
opts.quiet = 1;
|
||||
break;
|
||||
case 'F':
|
||||
opts.xattr_file = 1;
|
||||
break;
|
||||
case 'G':
|
||||
opts.xattr_group = 1;
|
||||
break;
|
||||
case 'L':
|
||||
opts.ls_output = 1;
|
||||
break;
|
||||
case 'X':
|
||||
opts.user_xattr = 1;
|
||||
break;
|
||||
case 'S':
|
||||
opts.same_srch_xattr = 1;
|
||||
break;
|
||||
case 'G':
|
||||
opts.group_srch_xattr = 1;
|
||||
break;
|
||||
case 'U':
|
||||
opts.unique_srch_xattr = 1;
|
||||
opts.xattr_set = 1;
|
||||
opts.xattr_name = strdup(optarg);
|
||||
error_exit(!opts.xattr_name, "error allocating xattr name");
|
||||
break;
|
||||
case '?':
|
||||
printf("Unknown option '%c'\n", optopt);
|
||||
@@ -419,6 +429,11 @@ int main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
error_exit(opts.xattr_file && !opts.xattr_set,
|
||||
"must specify xattr -X when appending file nr with -F");
|
||||
error_exit(opts.xattr_group && !opts.xattr_set,
|
||||
"must specify xattr -X when appending file nr with -G");
|
||||
|
||||
if (!opts.dry_run) {
|
||||
error_exit(!top_dir,
|
||||
"must specify top level directory with -d");
|
||||
|
||||
@@ -48,7 +48,7 @@ struct our_handle {
static void exit_usage(void)
{
	printf(" -h/-? output this usage message and exit\n"
	       " -e keep trying on enoent, consider success an error\n"
	       " -e keep trying on enoent and estale, consider success an error\n"
	       " -i <num> 64bit inode number for handle open, can be multiple\n"
	       " -m <string> scoutfs mount path string for ioctl fd\n"
	       " -n <string> optional xattr name string, defaults to \""DEFAULT_NAME"\"\n"
@@ -149,7 +149,7 @@ int main(int argc, char **argv)

	fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR);
	if (fd == -1) {
		if (!enoent_success_err || errno != ENOENT) {
		if (!enoent_success_err || ( errno != ENOENT && errno != ESTALE )) {
			perror("open_by_handle_at");
			return 1;
		}
@@ -11,8 +11,13 @@ FILE="$T_D0/file"
# final block as we truncated past it.
#
echo "== truncate writes zeroed partial end of file block"
yes | dd of="$FILE" bs=8K count=1 status=none
yes | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
sync

# not passing iflag=fullblock causes the file occasionally to just be
# 4K, so just to be safe we should at least check size once
test `stat --printf="%s\n" "$FILE"` -eq 8192 || t_fail "test file incorrect start size"

truncate -s 6K "$FILE"
truncate -s 12K "$FILE"
echo 3 > /proc/sys/vm/drop_caches
@@ -7,9 +7,11 @@ t_require_mounts 2
|
||||
|
||||
COUNT=50000
|
||||
|
||||
# Prep dirs for test. Each mount needs to make their own parent dir for
|
||||
# the createmany run, otherwise both dirs will end up in the same inode
|
||||
# group, causing updates to bounce that lock around.
|
||||
#
|
||||
# Prep dirs for test. We have per-directory inode number allocators so
|
||||
# by putting each createmany in a per-mount dir they get their own inode
|
||||
# number region and cluster locks.
|
||||
#
|
||||
echo "== measure initial createmany"
|
||||
mkdir -p $T_D0/dir/0
|
||||
mkdir $T_D1/dir/1
|
||||
@@ -17,18 +19,20 @@ mkdir $T_D1/dir/1
|
||||
echo "== measure initial createmany"
|
||||
START=$SECONDS
|
||||
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
|
||||
sync
|
||||
SINGLE=$((SECONDS - START))
|
||||
echo single $SINGLE >> $T_TMP.full
|
||||
|
||||
echo "== measure two concurrent createmany runs"
|
||||
START=$SECONDS
|
||||
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
|
||||
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
|
||||
pids="$!"
|
||||
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
|
||||
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
|
||||
pids="$pids $!"
|
||||
for p in $pids; do
|
||||
wait $p
|
||||
done
|
||||
sync
|
||||
BOTH=$((SECONDS - START))
|
||||
echo both $BOTH >> $T_TMP.full
|
||||
|
||||
@@ -41,7 +45,10 @@ echo both $BOTH >> $T_TMP.full
|
||||
# synchronized operation.
|
||||
FACTOR=200
|
||||
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
|
||||
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
|
||||
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
|
||||
fi
|
||||
|
||||
echo "== cleanup"
|
||||
find $T_D0/dir -delete
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -95,7 +95,7 @@ print_logical_extents()
		}
		print $2, $6, flags
	}
	'
	' | sed 's/last,eof/eof/'
}

t_save_all_sysfs_mount_options data_prealloc_blocks
@@ -7,14 +7,11 @@ t_require_mounts 2
|
||||
|
||||
#
|
||||
# Make sure that all mounts can read the results of a write from each
|
||||
# mount. And make sure that the greatest of all the written seqs is
|
||||
# visible after the writes were commited by remote reads.
|
||||
# mount.
|
||||
#
|
||||
check_read_write()
|
||||
{
|
||||
local expected
|
||||
local greatest=0
|
||||
local seq
|
||||
local path
|
||||
local saw
|
||||
local w
|
||||
@@ -25,11 +22,6 @@ check_read_write()
|
||||
eval path="\$T_D${w}/written"
|
||||
echo "$expected" > "$path"
|
||||
|
||||
seq=$(scoutfs stat -s meta_seq $path)
|
||||
if [ "$seq" -gt "$greatest" ]; then
|
||||
greatest=$seq
|
||||
fi
|
||||
|
||||
for r in $(t_fs_nrs); do
|
||||
eval path="\$T_D${r}/written"
|
||||
saw=$(cat "$path")
|
||||
@@ -38,11 +30,6 @@ check_read_write()
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
seq=$(scoutfs statfs -s committed_seq -p $T_D0)
|
||||
if [ "$seq" -lt "$greatest" ]; then
|
||||
echo "committed_seq $seq less than greatest $greatest"
|
||||
fi
|
||||
}
|
||||
|
||||
# verify that fenced ran our testing fence script
|
||||
|
||||
@@ -72,7 +72,7 @@ check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
exec {FD}>&- # close
# we know that revalidating will unhash the remote dentry
stat "$T_D0/file" 2>&1 | t_filter_fs
stat "$T_D0/file" 2>&1 | sed 's/cannot statx/cannot stat/' | t_filter_fs
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
|
||||
EXTENTS_PER_LIST_BLOCK=8192
|
||||
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
|
||||
|
||||
#
|
||||
# This test specifically creates a pathologically sparse file that will
|
||||
# be as expensive as possible to free. This is usually fine on
|
||||
# dedicated or reasonable hardware, but trying to run this in
|
||||
# virtualized debug kernels can take a very long time. This test is
|
||||
# about making sure that the server doesn't fail, not that the platform
|
||||
# can handle the scale of work that our btree formats happen to require
|
||||
# while execution is bogged down with use-after-free memory reference
|
||||
# tracking. So we give the test a lot more breathing room before
|
||||
# deciding that its hung.
|
||||
#
|
||||
echo "== setting longer hung task timeout"
|
||||
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
|
||||
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
|
||||
test "$secs" -gt 0 || \
|
||||
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
|
||||
restore_hung_task_timeout()
|
||||
{
|
||||
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
|
||||
}
|
||||
trap restore_hung_task_timeout EXIT
|
||||
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
|
||||
fi
|
||||
|
||||
echo "== creating fragmented extents"
|
||||
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@
# Some basic tests of online resizing metadata and data devices.
#

t_require_commands bc

statfs_total() {
	local single="total_$1_blocks"
	local mnt="$2"
@@ -55,10 +55,17 @@ scoutfs setattr -t 67305985.999999999 -V 1 -s 1 "$FILE" 2>&1 | t_filter_fs
TZ=GMT stat -c "%z" "$FILE"
rm "$FILE"

#
# With e2fsprogs-v1.42.10-10-g29758d2f, the output of filefrag 'flags' changes
# significantly. First, the _LAST flag is now output. Second, the 'unknown'
# flag is now printed out as 'unknown_loc'. To compensate for this, we check
# and replace the "correct" output for new versions here with the expected
# value.
#
echo "== large offline extents are created"
touch "$FILE"
scoutfs setattr -V 1 -o -s $((10007 * 4096)) "$FILE" 2>&1 | t_filter_fs
filefrag -v -b4096 "$FILE" 2>&1 | t_filter_fs
filefrag -v -b4096 "$FILE" 2>&1 | sed 's/last,unknown_loc,eof$/unknown,eof/' | t_filter_fs
rm "$FILE"

# had a bug where we were creating extents that were too long
@@ -27,15 +27,9 @@ test_xattr_lengths() {
|
||||
echo "key len $name_len val len $val_len" >> "$T_TMP.log"
|
||||
setfattr -n $name -v \"$val\" "$FILE"
|
||||
|
||||
# grep has trouble with enormous args? so we dump the
|
||||
# name=value to a file and compare with a known good file
|
||||
getfattr -d --absolute-names "$FILE" | grep "$name" > "$T_TMP.got"
|
||||
getfattr -d --only-values --absolute-names "$FILE" -n "$name" > "$T_TMP.got"
|
||||
echo -n "$val" > "$T_TMP.good"
|
||||
|
||||
if [ $val_len == 0 ]; then
|
||||
echo "$name" > "$T_TMP.good"
|
||||
else
|
||||
echo "$name=\"$val\"" > "$T_TMP.good"
|
||||
fi
|
||||
cmp "$T_TMP.good" "$T_TMP.got" || \
|
||||
t_fail "cmp failed name len $name_len val len $val_len"
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ LOG=340000
|
||||
LIM=1000000
|
||||
|
||||
SEQF="%.20g"
|
||||
SXA="scoutfs.srch.test-srch-basic-functionality"
|
||||
|
||||
t_require_commands touch rm setfattr scoutfs find_xattrs
|
||||
|
||||
@@ -27,20 +28,20 @@ diff_srch_find()
|
||||
|
||||
echo "== create new xattrs"
|
||||
touch "$T_D0/"{create,update}
|
||||
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== update existing xattr"
|
||||
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove an xattr"
|
||||
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
|
||||
diff_srch_find scoutfs.srch.test
|
||||
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove xattr with files"
|
||||
rm -f "$T_D0/"{create,update}
|
||||
diff_srch_find scoutfs.srch.test
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== trigger small log merges by rotating single block with unmount"
|
||||
sv=$(t_server_nr)
|
||||
@@ -56,7 +57,7 @@ while [ "$i" -lt "8" ]; do
|
||||
|
||||
eval path="\$T_D${nr}/single-block-$i"
|
||||
touch "$path"
|
||||
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
|
||||
setfattr -n $SXA -v $i "$path"
|
||||
t_umount $nr
|
||||
t_mount $nr
|
||||
|
||||
@@ -65,51 +66,51 @@ while [ "$i" -lt "8" ]; do
|
||||
done
|
||||
# wait for srch compaction worker delay
|
||||
sleep 10
|
||||
rm -rf "$T_D0/single-block-*"
|
||||
find "$T_D0" -type f -name 'single-block-*' -delete
|
||||
|
||||
echo "== create entries in current log"
|
||||
DIR="$T_D0/dir"
|
||||
NR=$((LOG / 4))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete small fraction"
|
||||
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== remove files"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== create entries that exceed one log"
|
||||
NR=$((LOG * 3 / 2))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete fractions in phases"
|
||||
for i in $(seq 1 3); do
|
||||
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
done
|
||||
|
||||
echo "== remove files"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== create entries for exceed search entry limit"
|
||||
NR=$((LIM * 3 / 2))
|
||||
mkdir -p "$DIR"
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== delete half"
|
||||
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
|
||||
diff_srch_find $SXA
|
||||
|
||||
echo "== entirely remove third batch"
|
||||
rm -rf "$DIR"
|
||||
diff_srch_find scoutfs.srch.scoutfs_bcp
|
||||
diff_srch_find $SXA
|
||||
|
||||
t_pass
|
||||
|
||||
90
tests/tests/srch-safe-merge-pos.sh
Normal file
90
tests/tests/srch-safe-merge-pos.sh
Normal file
@@ -0,0 +1,90 @@
|
||||
#
|
||||
# There was a bug where srch file compaction could get stuck if a
|
||||
# partial compaction finished at the specific _SAFE_BYTES offset in a
|
||||
# block. Resuming from that position would return an error and
|
||||
# compaction would stop making forward progress.
|
||||
#
|
||||
# We use triggers to pad the output of log compaction to end on the safe
|
||||
# offset and then cause compaction of those padded inputs to stop at the
|
||||
# safe offset. Continuation will either succeed or return errors.
|
||||
#
|
||||
|
||||
# forcing rotation, so just a few
|
||||
NR=10
|
||||
SEQF="%.20g"
|
||||
COMPACT_NR=4
|
||||
|
||||
echo "== initialize per-mount values"
|
||||
declare -a err
|
||||
declare -a compact_delay
|
||||
for nr in $(t_fs_nrs); do
|
||||
err[$nr]=$(t_counter srch_compact_error $nr)
|
||||
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
|
||||
done
|
||||
restore_compact_delay()
|
||||
{
|
||||
for nr in $(t_fs_nrs); do
|
||||
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
|
||||
done
|
||||
}
|
||||
trap restore_compact_delay EXIT
|
||||
|
||||
echo "== arm compaction triggers"
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_trigger_arm srch_compact_logs_pad_safe $nr
|
||||
t_trigger_arm srch_merge_stop_safe $nr
|
||||
done
|
||||
|
||||
echo "== compact more often"
|
||||
for nr in $(t_fs_nrs); do
|
||||
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
|
||||
done
|
||||
|
||||
echo "== create padded sorted inputs by forcing log rotation"
|
||||
sv=$(t_server_nr)
|
||||
for i in $(seq 1 $COMPACT_NR); do
|
||||
for j in $(seq 1 $COMPACT_NR); do
|
||||
t_trigger_arm srch_force_log_rotate $sv
|
||||
|
||||
seq -f "f-$i-$j-$SEQF" 1 10 | \
|
||||
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
|
||||
/dev/null
|
||||
sync
|
||||
|
||||
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
|
||||
t_fail "srch_force_log_rotate didn't trigger"
|
||||
done
|
||||
|
||||
padded=0
|
||||
while test $padded == 0 && sleep .5; do
|
||||
for nr in $(t_fs_nrs); do
|
||||
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
|
||||
t_trigger_arm srch_compact_logs_pad_safe $nr
|
||||
padded=1
|
||||
break
|
||||
fi
|
||||
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
|
||||
t_fail "srch_compact_error counter increased on mount $nr"
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
echo "== compaction of padded should stop at safe"
|
||||
sleep 2
|
||||
for nr in $(t_fs_nrs); do
|
||||
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "== verify no compaction errors"
|
||||
sleep 2
|
||||
for nr in $(t_fs_nrs); do
|
||||
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
|
||||
t_fail "srch_compact_error counter increased on mount $nr"
|
||||
done
|
||||
|
||||
echo "== cleanup"
|
||||
find "$T_D0" -type f -name 'f-*' -delete
|
||||
|
||||
t_pass
|
||||
@@ -75,6 +75,7 @@ generic/215 # mmap missing
generic/246 # mmap missing
generic/247 # mmap missing
generic/248 # mmap missing
generic/318 # can't support user namespaces until v5.11
generic/321 # requires selinux enabled for '+' in ls?
generic/325 # mmap missing
generic/338 # BUG_ON update inode error handling
@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation. Setting it
too high can create long latencies in the event that a mount takes a
long time to commit its log. Setting it too low can result in the
creation of excessive numbers of log trees that are never merged. The
default is 500 and it cannot be less than 100 nor greater than 60000.
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.
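For illustration, a hypothetical mount invocation using the new option (device paths and mount point are placeholders; the value must stay within the documented 100-60000 range):

    mount -t scoutfs \
            -o metadev_path=/dev/mapper/scoutfs_meta,log_merge_wait_timeout_ms=2000 \
            /dev/mapper/scoutfs_data /mnt/scoutfs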
@@ -61,7 +61,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
%files
%defattr(644,root,root,755)
%{_mandir}/man*/scoutfs*.gz
%{_unitdir}/scoutfs-fenced.service
/%{_unitdir}/scoutfs-fenced.service
%{_sysconfdir}/scoutfs
%defattr(755,root,root,755)
%{_sbindir}/scoutfs