Compare commits

..

7 Commits

Author SHA1 Message Date
Zach Brown
6541ccfdd0 Add test_bit to utils bitmap
Add test_bit() to the trivial utils bitmap.c implementation.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
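The compare doesn't show the new function itself; a minimal sketch of what a trivial userspace test_bit() can look like, assuming the utils bitmap stores bits in an array of unsigned longs:

#include <limits.h>

/* a sketch only; the commit's actual implementation isn't shown here */
static inline int test_bit(unsigned long nr, const unsigned long *map)
{
	unsigned long bits = sizeof(unsigned long) * CHAR_BIT;

	return !!(map[nr / bits] & (1UL << (nr % bits)));
}
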
Zach Brown
b56836b395 Add {read,write}-metadata-image scoutfs commands
Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
Zach Brown
de8395a9cf Fix partial rename to check_meta_alloc
As I was committing the initial check command I had only partially
completed a rename of the function that checks the metadata allocators.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:01 -08:00
Zach Brown
94c608f281 (wip) add check command 2024-03-04 15:13:46 -08:00
Zach Brown
84c1460e4f Fix printing alloc list block extents
The list alloc blocks have an array of blknos that are offset by a start
field in the block header.  The print code wasn't using that and was
always referencing the beginning of the array, which could miss blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
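The fix described above amounts to offsetting the index into the blkno array; a sketch of the corrected loop, where the struct and field names are assumptions drawn from the commit message rather than the actual scoutfs definitions:

/* sketch: struct/field names assumed, endian helpers from the utils */
static void print_list_block_blknos(struct scoutfs_alloc_list_block *lblk)
{
	u32 start = le32_to_cpu(lblk->start);	/* first used slot */
	u32 nr = le32_to_cpu(lblk->nr);
	u32 i;

	for (i = 0; i < nr; i++)
		/* the bug read blknos[i]; blknos[start + i] is correct */
		printf("    blkno %llu\n",
		       (unsigned long long)le64_to_cpu(lblk->blknos[start + i]));
}
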
Zach Brown
e407f67fcc Import a few more functions to our list.h
Import a few more functions from the kernel's list.h into our imported
copy.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
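The message doesn't name the imported functions; for reference, kernel list.h helpers follow this shape (these two match the kernel's own definitions and assume list_add_tail() is already present in the imported copy):

static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	next->prev = prev;
	prev->next = next;
}

static inline void list_move_tail(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}
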
Zach Brown
13c4b35bed Add lk rbtree wrapper
Import the kernel's rbtree implementation with a wrapper so we can use
it from userspace.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
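The wrapper's contents aren't shown in this compare; a sketch of the sort of compatibility shim that lets the kernel's rbtree.c build unchanged in userspace:

/* hypothetical shim header, not the commit's actual wrapper */
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define EXPORT_SYMBOL(sym)		/* no-op outside the kernel */
#define __always_inline		inline
#define WRITE_ONCE(x, val)	((x) = (val))
#define READ_ONCE(x)		(x)

#include "rbtree.h"		/* the imported kernel header */
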
97 changed files with 4764 additions and 6259 deletions

View File

@@ -1,36 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.21
\
*Jul 1, 2024*
This release adds features that rely on incompatible changes to the
structure of the file system. The process of advancing the format version
to enable these features is described in scoutfs(5).
Added the ".indx." extended attribute tag which can be used to determine
the sorting of files in a global index.
Added ScoutFS quotas which let rules define file size and count limits
in terms of ".totl." extended attribute totals.
Added the project ID file attribute which is inherited from parent
directories on creation. ScoutFS quota rules can reference project IDs.
Added a retention attribute for files which prevents modification once
enabled.
---
v1.20
\
*Apr 22, 2024*
Minor changes to packaging to better support "weak" module linking of
the kernel module, and to include git hashes in the built package. No
changes in runtime behaviour.
---
v1.19
\

View File

@@ -12,22 +12,17 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
SCOUTFS_GIT_DESCRIBE := \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -46,8 +41,7 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
mv $@+ $@
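For reference, the RPM_VERSION pipeline above reduces a describe string to a bare version number; the tag and hash in this transcript are hypothetical:

$ git describe --long --tags
v1.20-3-gabc1234
$ git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $1}'
1.20
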

View File

@@ -1,7 +1,6 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
weak-modules --add-kernel --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true

View File

@@ -9,7 +9,6 @@ CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
scoutfs-y += \
acl.o \
attr_x.o \
avl.o \
alloc.o \
block.o \
@@ -35,7 +34,6 @@ scoutfs-y += \
options.o \
per_task.o \
quorum.o \
quota.o \
recov.o \
scoutfs_trace.o \
server.o \
@@ -44,12 +42,10 @@ scoutfs-y += \
srch.o \
super.o \
sysfs.o \
totl.o \
trans.o \
triggers.o \
tseq.o \
volopt.o \
wkic.o \
xattr.o
#

View File

@@ -1,252 +0,0 @@
/*
* Copyright (C) 2024 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include "format.h"
#include "super.h"
#include "inode.h"
#include "ioctl.h"
#include "lock.h"
#include "trans.h"
#include "attr_x.h"
static int validate_attr_x_input(struct super_block *sb, struct scoutfs_ioctl_inode_attr_x *iax)
{
int ret;
if ((iax->x_mask & SCOUTFS_IOC_IAX__UNKNOWN) ||
(iax->x_flags & SCOUTFS_IOC_IAX_F__UNKNOWN))
return -EINVAL;
if ((iax->x_mask & SCOUTFS_IOC_IAX_RETENTION) &&
(ret = scoutfs_fmt_vers_unsupported(sb, SCOUTFS_FORMAT_VERSION_FEAT_RETENTION)))
return ret;
if ((iax->x_mask & SCOUTFS_IOC_IAX_PROJECT_ID) &&
(ret = scoutfs_fmt_vers_unsupported(sb, SCOUTFS_FORMAT_VERSION_FEAT_PROJECT_ID)))
return ret;
return 0;
}
/*
* If the mask indicates interest in the given attr then set the field
* to the caller's value and return the new size if it didn't already
* include the attr field.
*/
#define fill_attr(size, iax, bit, field, val) \
({ \
__typeof__(iax) _iax = (iax); \
__typeof__(size) _size = (size); \
\
if (_iax->x_mask & (bit)) { \
_iax->field = (val); \
_size = max(_size, offsetof(struct scoutfs_ioctl_inode_attr_x, field) + \
sizeof_field(struct scoutfs_ioctl_inode_attr_x, field)); \
} \
\
_size; \
})
/*
* Returns -errno on error, or >= number of bytes filled by the
* response. 0 can be returned if no attributes are requested in the
* input x_mask.
*/
int scoutfs_get_attr_x(struct inode *inode, struct scoutfs_ioctl_inode_attr_x *iax)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *lock = NULL;
size_t size = 0;
u64 offline;
u64 online;
u64 bits;
int ret;
if (iax->x_mask == 0) {
ret = 0;
goto out;
}
ret = validate_attr_x_input(sb, iax);
if (ret < 0)
goto out;
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto unlock;
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_META_SEQ,
meta_seq, scoutfs_inode_meta_seq(inode));
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_DATA_SEQ,
data_seq, scoutfs_inode_data_seq(inode));
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_DATA_VERSION,
data_version, scoutfs_inode_data_version(inode));
if (iax->x_mask & (SCOUTFS_IOC_IAX_ONLINE_BLOCKS | SCOUTFS_IOC_IAX_OFFLINE_BLOCKS)) {
scoutfs_inode_get_onoff(inode, &online, &offline);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_ONLINE_BLOCKS,
online_blocks, online);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_OFFLINE_BLOCKS,
offline_blocks, offline);
}
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_CTIME, ctime_sec, inode->i_ctime.tv_sec);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_CTIME, ctime_nsec, inode->i_ctime.tv_nsec);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_CRTIME, crtime_sec, si->crtime.tv_sec);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_CRTIME, crtime_nsec, si->crtime.tv_nsec);
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_SIZE, size, i_size_read(inode));
if (iax->x_mask & SCOUTFS_IOC_IAX__BITS) {
bits = 0;
if ((iax->x_mask & SCOUTFS_IOC_IAX_RETENTION) &&
(scoutfs_inode_get_flags(inode) & SCOUTFS_INO_FLAG_RETENTION))
bits |= SCOUTFS_IOC_IAX_B_RETENTION;
size = fill_attr(size, iax, SCOUTFS_IOC_IAX__BITS, bits, bits);
}
size = fill_attr(size, iax, SCOUTFS_IOC_IAX_PROJECT_ID,
project_id, scoutfs_inode_get_proj(inode));
ret = size;
unlock:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
inode_unlock(inode);
out:
return ret;
}
static bool valid_attr_changes(struct inode *inode, struct scoutfs_ioctl_inode_attr_x *iax)
{
/* provided data_version must be non-zero */
if ((iax->x_mask & SCOUTFS_IOC_IAX_DATA_VERSION) && (iax->data_version == 0))
return false;
/* can only set size or data version in new regular files */
if (((iax->x_mask & SCOUTFS_IOC_IAX_SIZE) ||
(iax->x_mask & SCOUTFS_IOC_IAX_DATA_VERSION)) &&
(!S_ISREG(inode->i_mode) || scoutfs_inode_data_version(inode) != 0))
return false;
/* must provide non-zero data_version with non-zero size */
if (((iax->x_mask & SCOUTFS_IOC_IAX_SIZE) && (iax->size > 0)) &&
(!(iax->x_mask & SCOUTFS_IOC_IAX_DATA_VERSION) || (iax->data_version == 0)))
return false;
/* must provide non-zero size when setting offline extents to that size */
if ((iax->x_flags & SCOUTFS_IOC_IAX_F_SIZE_OFFLINE) &&
(!(iax->x_mask & SCOUTFS_IOC_IAX_SIZE) || (iax->size == 0)))
return false;
/* the retention bit only applies to regular files */
if ((iax->x_mask & SCOUTFS_IOC_IAX_RETENTION) && !S_ISREG(inode->i_mode))
return false;
return true;
}
int scoutfs_set_attr_x(struct inode *inode, struct scoutfs_ioctl_inode_attr_x *iax)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
bool set_data_seq;
int ret;
/* initially all setting is root only, could loosen with finer grained checks */
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
if (iax->x_mask == 0) {
ret = 0;
goto out;
}
ret = validate_attr_x_input(sb, iax);
if (ret < 0)
goto out;
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto unlock;
/* check for errors before making any changes */
if (!valid_attr_changes(inode, iax)) {
ret = -EINVAL;
goto unlock;
}
/* retention prevents modification unless also clearing retention */
ret = scoutfs_inode_check_retention(inode);
if (ret < 0 && !((iax->x_mask & SCOUTFS_IOC_IAX_RETENTION) &&
!(iax->bits & SCOUTFS_IOC_IAX_B_RETENTION)))
goto unlock;
/* setting only so we don't see 0 data seq with nonzero data_version */
if ((iax->x_mask & SCOUTFS_IOC_IAX_DATA_VERSION) && (iax->data_version > 0))
set_data_seq = true;
else
set_data_seq = false;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, true);
if (ret)
goto unlock;
ret = scoutfs_dirty_inode_item(inode, lock);
if (ret < 0)
goto release;
/* creating offline extent first, it might fail */
if (iax->x_flags & SCOUTFS_IOC_IAX_F_SIZE_OFFLINE) {
ret = scoutfs_data_init_offline_extent(inode, iax->size, lock);
if (ret)
goto release;
}
/* make all changes once they're all checked and will succeed */
if (iax->x_mask & SCOUTFS_IOC_IAX_DATA_VERSION)
scoutfs_inode_set_data_version(inode, iax->data_version);
if (iax->x_mask & SCOUTFS_IOC_IAX_SIZE)
i_size_write(inode, iax->size);
if (iax->x_mask & SCOUTFS_IOC_IAX_CTIME) {
inode->i_ctime.tv_sec = iax->ctime_sec;
inode->i_ctime.tv_nsec = iax->ctime_nsec;
}
if (iax->x_mask & SCOUTFS_IOC_IAX_CRTIME) {
si->crtime.tv_sec = iax->crtime_sec;
si->crtime.tv_nsec = iax->crtime_nsec;
}
if (iax->x_mask & SCOUTFS_IOC_IAX_RETENTION) {
scoutfs_inode_set_flags(inode, ~SCOUTFS_INO_FLAG_RETENTION,
(iax->bits & SCOUTFS_IOC_IAX_B_RETENTION) ?
SCOUTFS_INO_FLAG_RETENTION : 0);
}
if (iax->x_mask & SCOUTFS_IOC_IAX_PROJECT_ID)
scoutfs_inode_set_proj(inode, iax->project_id);
scoutfs_update_inode_item(inode, lock, &ind_locks);
ret = 0;
release:
scoutfs_release_trans(sb);
unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
inode_unlock(inode);
out:
return ret;
}

View File

@@ -1,11 +0,0 @@
#ifndef _SCOUTFS_ATTR_X_H_
#define _SCOUTFS_ATTR_X_H_
#include <linux/kernel.h>
#include <linux/fs.h>
#include "ioctl.h"
int scoutfs_get_attr_x(struct inode *inode, struct scoutfs_ioctl_inode_attr_x *iax);
int scoutfs_set_attr_x(struct inode *inode, struct scoutfs_ioctl_inode_attr_x *iax);
#endif

View File

@@ -683,7 +683,6 @@ int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref
struct scoutfs_block_header *hdr;
struct block_private *bp = NULL;
bool retried = false;
__le32 crc = 0;
int ret;
retry:
@@ -696,9 +695,7 @@ retry:
/* corrupted writes might be a sign of a stale reference */
if (!test_bit(BLOCK_BIT_CRC_VALID, &bp->bits)) {
crc = block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE);
if (hdr->crc != crc) {
trace_scoutfs_block_stale(sb, ref, hdr, magic, le32_to_cpu(crc));
if (hdr->crc != block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE)) {
ret = -ESTALE;
goto out;
}
@@ -708,7 +705,6 @@ retry:
if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != cpu_to_le64(sbi->fsid) ||
hdr->seq != ref->seq || hdr->blkno != ref->blkno) {
trace_scoutfs_block_stale(sb, ref, hdr, magic, 0);
ret = -ESTALE;
goto out;
}

View File

@@ -199,7 +199,10 @@
EXPAND_COUNTER(srch_read_stale) \
EXPAND_COUNTER(statfs) \
EXPAND_COUNTER(totl_read_copied) \
EXPAND_COUNTER(totl_read_finalized) \
EXPAND_COUNTER(totl_read_fs) \
EXPAND_COUNTER(totl_read_item) \
EXPAND_COUNTER(totl_read_logged) \
EXPAND_COUNTER(trans_commit_data_alloc_low) \
EXPAND_COUNTER(trans_commit_dirty_meta_full) \
EXPAND_COUNTER(trans_commit_fsync) \

View File

@@ -586,12 +586,6 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
goto out;
}
if (create && !si->staging) {
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
}
/* convert unwritten to written, could be staging */
if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) {
un.start = iblock;
@@ -1110,10 +1104,6 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
while(iblock <= last) {
ret = scoutfs_quota_check_data(sb, inode);
if (ret)
goto out_extent;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret)
goto out_extent;
@@ -1165,9 +1155,9 @@ out:
* on regular files with no data extents. It's used to restore a file
* with an offline extent which can then trigger staging.
*
* The caller must take care of cluster locking, transactions, inode
* updates, and index updates (so that they can atomically make this
* change along with other metadata changes).
* The caller has taken care of locking the inode. We're updating the
* inode offline count as we create the offline extent so we take care
* of the index locking, updating, and transaction.
*/
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock)
@@ -1181,6 +1171,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
.lock = lock,
};
const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE);
LIST_HEAD(ind_locks);
u64 on;
u64 off;
int ret;
@@ -1193,10 +1184,28 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
goto out;
}
/* we're updating meta_seq with offline block count */
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
if (ret < 0)
goto out;
ret = scoutfs_dirty_inode_item(inode, lock);
if (ret < 0)
goto unlock;
down_write(&si->extent_sem);
ret = scoutfs_ext_insert(sb, &data_ext_ops, &args,
0, count, 0, SEF_OFFLINE);
up_write(&si->extent_sem);
if (ret < 0)
goto unlock;
scoutfs_update_inode_item(inode, lock, &ind_locks);
unlock:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
ret = 0;
out:
return ret;
}
@@ -1264,9 +1273,6 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
if (ret)
goto out;
if (!is_stage && (ret = scoutfs_inode_check_retention(to)))
goto out;
if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
(to_off & SCOUTFS_BLOCK_SM_MASK) ||
((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
@@ -1801,6 +1807,37 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
return ret;
}
int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter,
u8 sef, u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
size_t count = iov_iter_count(iter);
size_t off = iter->iov_offset;
const struct iovec *iov;
size_t len;
int ret = 0;
for (iov = iter->iov; count > 0; iov++) {
len = iov->iov_len - off;
if (len == 0)
continue;
/* aren't we waiting on too much data here? */
ret = scoutfs_data_wait_check(inode, pos, len,
sef, op, dw, lock);
if (ret != 0)
break;
pos += len;
count -= len;
off = 0;
}
return ret;
}
int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
{
DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);

View File

@@ -65,6 +65,9 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, u8 sef,
u8 op, struct scoutfs_data_wait *ow,
struct scoutfs_lock *lock);
int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter,
u8 sef, u8 op, struct scoutfs_data_wait *ow,
struct scoutfs_lock *lock);
bool scoutfs_data_wait_found(struct scoutfs_data_wait *ow);
int scoutfs_data_wait(struct inode *inode,
struct scoutfs_data_wait *ow);

View File

@@ -34,7 +34,6 @@
#include "forest.h"
#include "acl.h"
#include "counters.h"
#include "quota.h"
#include "scoutfs_trace.h"
/*
@@ -652,10 +651,6 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
if (ret)
goto out_unlock;
ret = scoutfs_quota_check_inode(sb, dir);
if (ret)
goto out_unlock;
if (orph_lock) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
if (ret < 0)
@@ -677,8 +672,6 @@ retry:
if (ret < 0)
goto out;
scoutfs_inode_set_proj(inode, scoutfs_inode_get_proj(dir));
ret = scoutfs_dirty_inode_item(dir, *dir_lock);
out:
if (ret)
@@ -933,16 +926,12 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto unlock;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash,
&dent, dir_lock);
if (ret < 0)
goto unlock;
goto out;
if (should_orphan(inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
@@ -1643,10 +1632,6 @@ static int scoutfs_rename_common(struct inode *old_dir,
goto out_unlock;
}
if ((old_inode && (ret = scoutfs_inode_check_retention(old_inode))) ||
(new_inode && (ret = scoutfs_inode_check_retention(new_inode))))
goto out_unlock;
if (should_orphan(new_inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
&orph_lock);

View File

@@ -28,7 +28,6 @@
#include "inode.h"
#include "per_task.h"
#include "omap.h"
#include "quota.h"
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
/*
@@ -109,10 +108,6 @@ retry:
if (ret)
goto out;
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
ret = scoutfs_complete_truncate(inode, scoutfs_inode_lock);
if (ret)
goto out;
@@ -127,10 +122,6 @@ retry:
goto out;
}
ret = scoutfs_quota_check_data(sb, inode);
if (ret)
goto out;
/* XXX: remove SUID bit */
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
@@ -180,8 +171,10 @@ retry:
goto out;
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
ret = scoutfs_data_wait_check(inode, iocb->ki_pos, iov_iter_count(to), SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ, &dw, scoutfs_inode_lock);
ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, to,
SEF_OFFLINE,
SCOUTFS_IOC_DWO_READ,
&dw, scoutfs_inode_lock);
if (ret != 0)
goto out;
} else {
@@ -212,7 +205,8 @@ ssize_t scoutfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct scoutfs_lock *scoutfs_inode_lock = NULL;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
DECLARE_DATA_WAIT(dw);
ssize_t ret;
int ret;
int written;
retry:
inode_lock(inode);
@@ -225,29 +219,23 @@ retry:
if (ret <= 0)
goto out;
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
ret = scoutfs_complete_truncate(inode, scoutfs_inode_lock);
if (ret)
goto out;
ret = scoutfs_quota_check_data(sb, inode);
if (ret)
goto out;
if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
/* data_version is per inode, whole file must be online */
ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode), SEF_OFFLINE,
SCOUTFS_IOC_DWO_WRITE, &dw, scoutfs_inode_lock);
ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, from,
SEF_OFFLINE,
SCOUTFS_IOC_DWO_WRITE,
&dw, scoutfs_inode_lock);
if (ret != 0)
goto out;
}
/* XXX: remove SUID bit */
ret = __generic_file_write_iter(iocb, from);
written = __generic_file_write_iter(iocb, from);
out:
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
@@ -260,10 +248,10 @@ out:
goto retry;
}
if (ret > 0)
ret = generic_write_sync(iocb, ret);
if (ret > 0 || ret == -EIOCBQUEUED)
ret = generic_write_sync(iocb, written);
return ret;
return written ? written : ret;
}
#endif

View File

@@ -238,16 +238,19 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
* We return -ESTALE if we hit stale blocks to give the caller a chance
* to reset their state and retry with a newer version of the btrees.
*/
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg)
int scoutfs_forest_read_items(struct super_block *sb,
struct scoutfs_key *key,
struct scoutfs_key *bloom_key,
struct scoutfs_key *start,
struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg)
{
struct forest_read_items_data rid = {
.cb = cb,
.cb_arg = arg,
};
struct scoutfs_log_trees lt;
struct scoutfs_net_roots roots;
struct scoutfs_bloom_block *bb;
struct forest_bloom_nrs bloom;
SCOUTFS_BTREE_ITEM_REF(iref);
@@ -261,14 +264,18 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
scoutfs_inc_counter(sb, forest_read_items);
calc_bloom_nrs(&bloom, bloom_key);
trace_scoutfs_forest_using_roots(sb, &roots->fs_root, &roots->logs_root);
ret = scoutfs_client_get_roots(sb, &roots);
if (ret)
goto out;
trace_scoutfs_forest_using_roots(sb, &roots.fs_root, &roots.logs_root);
*start = orig_start;
*end = orig_end;
/* start with fs root items */
rid.fic |= FIC_FS_ROOT;
ret = scoutfs_btree_read_items(sb, &roots->fs_root, key, start, end,
ret = scoutfs_btree_read_items(sb, &roots.fs_root, key, start, end,
forest_read_items, &rid);
if (ret < 0)
goto out;
@@ -276,7 +283,7 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
scoutfs_key_init_log_trees(&ltk, 0, 0);
for (;; scoutfs_key_inc(&ltk)) {
ret = scoutfs_btree_next(sb, &roots->logs_root, &ltk, &iref);
ret = scoutfs_btree_next(sb, &roots.logs_root, &ltk, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(lt)) {
ltk = *iref.key;
@@ -333,23 +340,6 @@ out:
return ret;
}
int scoutfs_forest_read_items(struct super_block *sb,
struct scoutfs_key *key,
struct scoutfs_key *bloom_key,
struct scoutfs_key *start,
struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg)
{
struct scoutfs_net_roots roots;
int ret;
ret = scoutfs_client_get_roots(sb, &roots);
if (ret == 0)
ret = scoutfs_forest_read_items_roots(sb, &roots, key, bloom_key, start, end,
cb, arg);
return ret;
}
/*
* If the items are deltas then combine the src with the destination
* value and store the result in the destination.

View File

@@ -4,7 +4,6 @@
struct scoutfs_alloc;
struct scoutfs_block_writer;
struct scoutfs_block;
struct scoutfs_lock;
#include "btree.h"
@@ -24,10 +23,6 @@ int scoutfs_forest_read_items(struct super_block *sb,
struct scoutfs_key *start,
struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_roots *roots,
struct scoutfs_key *key, struct scoutfs_key *bloom_key,
struct scoutfs_key *start, struct scoutfs_key *end,
scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_set_bloom_bits(struct super_block *sb,
struct scoutfs_lock *lock);
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);

View File

@@ -8,14 +8,9 @@
*/
#define SCOUTFS_FORMAT_VERSION_MIN 1
#define SCOUTFS_FORMAT_VERSION_MIN_STR __stringify(SCOUTFS_FORMAT_VERSION_MIN)
#define SCOUTFS_FORMAT_VERSION_MAX 2
#define SCOUTFS_FORMAT_VERSION_MAX 1
#define SCOUTFS_FORMAT_VERSION_MAX_STR __stringify(SCOUTFS_FORMAT_VERSION_MAX)
#define SCOUTFS_FORMAT_VERSION_FEAT_RETENTION 2
#define SCOUTFS_FORMAT_VERSION_FEAT_PROJECT_ID 2
#define SCOUTFS_FORMAT_VERSION_FEAT_QUOTA 2
#define SCOUTFS_FORMAT_VERSION_FEAT_INDX_TAG 2
/* statfs(2) f_type */
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
@@ -180,10 +175,6 @@ struct scoutfs_key {
#define sko_rid _sk_first
#define sko_ino _sk_second
/* quota rules */
#define skqr_hash _sk_second
#define skqr_coll_nr _sk_third
/* xattr totl */
#define skxt_a _sk_first
#define skxt_b _sk_second
@@ -594,9 +585,7 @@ struct scoutfs_log_merge_freeing {
*/
#define SCOUTFS_INODE_INDEX_ZONE 4
#define SCOUTFS_ORPHAN_ZONE 8
#define SCOUTFS_QUOTA_ZONE 10
#define SCOUTFS_XATTR_TOTL_ZONE 12
#define SCOUTFS_XATTR_INDX_ZONE 14
#define SCOUTFS_FS_ZONE 16
#define SCOUTFS_LOCK_ZONE 20
/* Items only stored in server btrees */
@@ -619,9 +608,6 @@ struct scoutfs_log_merge_freeing {
/* orphan zone, redundant type used for clarity */
#define SCOUTFS_ORPHAN_TYPE 4
/* quota zone */
#define SCOUTFS_QUOTA_RULE_TYPE 4
/* fs zone */
#define SCOUTFS_INODE_TYPE 4
#define SCOUTFS_XATTR_TYPE 8
@@ -675,34 +661,6 @@ struct scoutfs_xattr_totl_val {
__le64 count;
};
#define SQ_RF_TOTL_COUNT (1 << 0)
#define SQ_RF__UNKNOWN (~((1 << 1) - 1))
#define SQ_NS_LITERAL 0
#define SQ_NS_PROJ 1
#define SQ_NS_UID 2
#define SQ_NS_GID 3
#define SQ_NS__NR 4
#define SQ_NS__NR_SELECT (SQ_NS__NR - 1) /* !literal */
#define SQ_NF_SELECT (1 << 0)
#define SQ_NF__UNKNOWN (~((1 << 1) - 1))
#define SQ_OP_INODE 0
#define SQ_OP_DATA 1
#define SQ_OP__NR 2
struct scoutfs_quota_rule_val {
__le64 name_val[3];
__le64 limit;
__u8 prio;
__u8 op;
__u8 rule_flags;
__u8 name_source[3];
__u8 name_flags[3];
__u8 _pad[7];
};
/* XXX does this exist upstream somewhere? */
#define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
@@ -901,38 +859,9 @@ struct scoutfs_inode {
struct scoutfs_timespec ctime;
struct scoutfs_timespec mtime;
struct scoutfs_timespec crtime;
__le64 proj;
};
#define SCOUTFS_INODE_FMT_V1_BYTES offsetof(struct scoutfs_inode, proj)
/*
* There are so few versions that we don't mind doing this work inline
* so that both utils and kernel can share these. Mounting has already
* checked that the format version is within the supported min and max,
* so these functions only deal with size variance within that band.
*/
/* Returns the native written inode size for the given format version, 0 for bad version */
static inline int scoutfs_inode_vers_bytes(__u64 fmt_vers)
{
if (fmt_vers == 1)
return SCOUTFS_INODE_FMT_V1_BYTES;
else
return sizeof(struct scoutfs_inode);
}
/*
* Returns true if bytes is a valid inode size to read from the given
* version. The given version must be greater than the version that
* introduced the size.
*/
static inline int scoutfs_inode_valid_vers_bytes(__u64 fmt_vers, int bytes)
{
return (bytes == sizeof(struct scoutfs_inode) && fmt_vers == SCOUTFS_FORMAT_VERSION_MAX) ||
(bytes == SCOUTFS_INODE_FMT_V1_BYTES);
}
#define SCOUTFS_INO_FLAG_TRUNCATE 0x1
#define SCOUTFS_INO_FLAG_RETENTION 0x2
#define SCOUTFS_INO_FLAG_TRUNCATE 0x1
#define SCOUTFS_ROOT_INO 1

View File

@@ -91,7 +91,7 @@ static void scoutfs_inode_ctor(void *obj)
init_rwsem(&si->extent_sem);
mutex_init(&si->item_mutex);
seqlock_init(&si->seqlock);
seqcount_init(&si->seqcount);
si->staging = false;
scoutfs_per_task_init(&si->pt_data_lock);
atomic64_set(&si->data_waitq.changed, 0);
@@ -250,7 +250,7 @@ static void set_item_info(struct scoutfs_inode_info *si,
set_item_major(si, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, sinode->data_seq);
}
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode, int inode_bytes)
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
@@ -278,7 +278,6 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode, int in
si->flags = le32_to_cpu(cinode->flags);
si->crtime.tv_sec = le64_to_cpu(cinode->crtime.sec);
si->crtime.tv_nsec = le32_to_cpu(cinode->crtime.nsec);
si->proj = le64_to_cpu(cinode->proj);
/*
* i_blocks is initialized from online and offline and is then
@@ -299,24 +298,6 @@ void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino)
};
}
/*
* Read an inode item into the caller's buffer and return the size that
* we read. Returns errors if the inode size is unsupported or doesn't
* make sense for the format version.
*/
static int lookup_inode_item(struct super_block *sb, struct scoutfs_key *key,
struct scoutfs_inode *sinode, struct scoutfs_lock *lock)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
int ret;
ret = scoutfs_item_lookup_smaller_zero(sb, key, sinode, sizeof(struct scoutfs_inode), lock);
if (ret >= 0 && !scoutfs_inode_valid_vers_bytes(sbi->fmt_vers, ret))
return -EIO;
return ret;
}
/*
* Refresh the vfs inode fields if the lock indicates that the current
* contents could be stale.
@@ -352,12 +333,12 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
mutex_lock(&si->item_mutex);
if (atomic64_read(&si->last_refreshed) < refresh_gen) {
ret = lookup_inode_item(sb, &key, &sinode, lock);
if (ret > 0) {
load_inode(inode, &sinode, ret);
ret = scoutfs_item_lookup_exact(sb, &key, &sinode,
sizeof(sinode), lock);
if (ret == 0) {
load_inode(inode, &sinode);
atomic64_set(&si->last_refreshed, refresh_gen);
scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
ret = 0;
}
} else {
ret = 0;
@@ -505,10 +486,6 @@ retry:
if (ret)
goto out;
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto out;
attr_size = (attr->ia_valid & ATTR_SIZE) ? attr->ia_size :
i_size_read(inode);
@@ -589,9 +566,11 @@ static void set_trans_seq(struct inode *inode, u64 *seq)
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
if (*seq != sbi->trans_seq) {
write_seqlock(&si->seqlock);
preempt_disable();
write_seqcount_begin(&si->seqcount);
*seq = sbi->trans_seq;
write_sequnlock(&si->seqlock);
write_seqcount_end(&si->seqcount);
preempt_enable();
}
}
@@ -613,18 +592,22 @@ void scoutfs_inode_inc_data_version(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
write_seqlock(&si->seqlock);
preempt_disable();
write_seqcount_begin(&si->seqcount);
si->data_version++;
write_sequnlock(&si->seqlock);
write_seqcount_end(&si->seqcount);
preempt_enable();
}
void scoutfs_inode_set_data_version(struct inode *inode, u64 data_version)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
write_seqlock(&si->seqlock);
preempt_disable();
write_seqcount_begin(&si->seqcount);
si->data_version = data_version;
write_sequnlock(&si->seqlock);
write_seqcount_end(&si->seqcount);
preempt_enable();
}
void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
@@ -633,7 +616,8 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
if (inode && (on || off)) {
si = SCOUTFS_I(inode);
write_seqlock(&si->seqlock);
preempt_disable();
write_seqcount_begin(&si->seqcount);
/* inode and extents out of sync, bad callers */
if (((s64)si->online_blocks + on < 0) ||
@@ -654,7 +638,8 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
si->online_blocks,
si->offline_blocks);
write_sequnlock(&si->seqlock);
write_seqcount_end(&si->seqcount);
preempt_enable();
}
/* any time offline extents decreased we try and wake waiters */
@@ -662,16 +647,16 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
scoutfs_data_wait_changed(inode);
}
static u64 read_seqlock_u64(struct inode *inode, u64 *val)
static u64 read_seqcount_u64(struct inode *inode, u64 *val)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
unsigned seq;
unsigned int seq;
u64 v;
do {
seq = read_seqbegin(&si->seqlock);
seq = read_seqcount_begin(&si->seqcount);
v = *val;
} while (read_seqretry(&si->seqlock, seq));
} while (read_seqcount_retry(&si->seqcount, seq));
return v;
}
@@ -680,82 +665,33 @@ u64 scoutfs_inode_meta_seq(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
return read_seqlock_u64(inode, &si->meta_seq);
return read_seqcount_u64(inode, &si->meta_seq);
}
u64 scoutfs_inode_data_seq(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
return read_seqlock_u64(inode, &si->data_seq);
return read_seqcount_u64(inode, &si->data_seq);
}
u64 scoutfs_inode_data_version(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
return read_seqlock_u64(inode, &si->data_version);
return read_seqcount_u64(inode, &si->data_version);
}
void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
unsigned seq;
unsigned int seq;
do {
seq = read_seqbegin(&si->seqlock);
seq = read_seqcount_begin(&si->seqcount);
*on = SCOUTFS_I(inode)->online_blocks;
*off = SCOUTFS_I(inode)->offline_blocks;
} while (read_seqretry(&si->seqlock, seq));
}
/*
* Get our private scoutfs inode flags, not the vfs i_flags.
*/
u32 scoutfs_inode_get_flags(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
unsigned seq;
u32 flags;
do {
seq = read_seqbegin(&si->seqlock);
flags = si->flags;
} while (read_seqretry(&si->seqlock, seq));
return flags;
}
void scoutfs_inode_set_flags(struct inode *inode, u32 and, u32 or)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
write_seqlock(&si->seqlock);
si->flags = (si->flags & and) | or;
write_sequnlock(&si->seqlock);
}
u64 scoutfs_inode_get_proj(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
unsigned seq;
u64 proj;
do {
seq = read_seqbegin(&si->seqlock);
proj = si->proj;
} while (read_seqretry(&si->seqlock, seq));
return proj;
}
void scoutfs_inode_set_proj(struct inode *inode, u64 proj)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
write_seqlock(&si->seqlock);
si->proj = proj;
write_sequnlock(&si->seqlock);
} while (read_seqcount_retry(&si->seqcount, seq));
}
static int scoutfs_iget_test(struct inode *inode, void *arg)
@@ -867,7 +803,7 @@ out:
return inode;
}
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode, int inode_bytes)
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
u64 online_blocks;
@@ -903,7 +839,6 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode, int i
cinode->crtime.sec = cpu_to_le64(si->crtime.tv_sec);
cinode->crtime.nsec = cpu_to_le32(si->crtime.tv_nsec);
memset(cinode->crtime.__pad, 0, sizeof(cinode->crtime.__pad));
cinode->proj = cpu_to_le64(si->proj);
}
/*
@@ -927,18 +862,15 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode, int i
int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
{
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode sinode;
struct scoutfs_key key;
int inode_bytes;
int ret;
inode_bytes = scoutfs_inode_vers_bytes(sbi->fmt_vers);
store_inode(&sinode, inode, inode_bytes);
store_inode(&sinode, inode);
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
ret = scoutfs_item_update(sb, &key, &sinode, inode_bytes, lock);
ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
if (!ret)
trace_scoutfs_dirty_inode(inode);
return ret;
@@ -1140,11 +1072,9 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
const u64 ino = scoutfs_ino(inode);
struct scoutfs_inode sinode;
struct scoutfs_key key;
int inode_bytes;
struct scoutfs_inode sinode;
int ret;
int err;
@@ -1153,17 +1083,15 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
/* set the meta version once per trans for any inode updates */
scoutfs_inode_set_meta_seq(inode);
inode_bytes = scoutfs_inode_vers_bytes(sbi->fmt_vers);
/* only race with other inode field stores once */
store_inode(&sinode, inode, inode_bytes);
store_inode(&sinode, inode);
ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list, lock);
BUG_ON(ret);
scoutfs_inode_init_key(&key, ino);
err = scoutfs_item_update(sb, &key, &sinode, inode_bytes, lock);
err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
if (err) {
scoutfs_err(sb, "inode %llu update err %d", ino, err);
BUG_ON(err);
@@ -1531,12 +1459,10 @@ out:
int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev,
u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_inode_info *si;
struct scoutfs_inode sinode;
struct scoutfs_key key;
struct scoutfs_inode sinode;
struct inode *inode;
int inode_bytes;
int ret;
inode = new_inode(sb);
@@ -1552,7 +1478,6 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
si->offline_blocks = 0;
si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
si->next_xattr_id = 0;
si->proj = 0;
si->have_item = false;
atomic64_set(&si->last_refreshed, lock->refresh_gen);
scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
@@ -1568,16 +1493,14 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
inode->i_rdev = rdev;
set_inode_ops(inode);
inode_bytes = scoutfs_inode_vers_bytes(sbi->fmt_vers);
store_inode(&sinode, inode, inode_bytes);
store_inode(&sinode, inode);
scoutfs_inode_init_key(&key, scoutfs_ino(inode));
ret = scoutfs_omap_set(sb, ino);
if (ret < 0)
goto out;
ret = scoutfs_item_create(sb, &key, &sinode, inode_bytes, lock);
ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
if (ret < 0)
scoutfs_omap_clear(sb, ino);
out:
@@ -1831,7 +1754,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
}
scoutfs_inode_init_key(&key, ino);
ret = lookup_inode_item(sb, &key, &sinode, lock);
ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -2220,17 +2143,6 @@ out:
return ret;
}
/*
* Return an error if the inode has the retention flag set and cannot
* be modified. This mimics the errno returned by the vfs when an
* inode's immutable flag is set. The flag won't be set on older format
* versions so we don't check the mounted format version here.
*/
int scoutfs_inode_check_retention(struct inode *inode)
{
return (scoutfs_inode_get_flags(inode) & SCOUTFS_INO_FLAG_RETENTION) ? -EPERM : 0;
}
int scoutfs_inode_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

View File

@@ -21,7 +21,6 @@ struct scoutfs_inode_info {
u64 data_version;
u64 online_blocks;
u64 offline_blocks;
u64 proj;
u32 flags;
struct kc_timespec crtime;
@@ -48,7 +47,7 @@ struct scoutfs_inode_info {
atomic64_t last_refreshed;
/* initialized once for slab object */
seqlock_t seqlock;
seqcount_t seqcount;
bool staging; /* holder of i_mutex is staging */
struct scoutfs_per_task pt_data_lock;
struct scoutfs_data_waitq data_waitq;
@@ -121,15 +120,8 @@ u64 scoutfs_inode_meta_seq(struct inode *inode);
u64 scoutfs_inode_data_seq(struct inode *inode);
u64 scoutfs_inode_data_version(struct inode *inode);
void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off);
u32 scoutfs_inode_get_flags(struct inode *inode);
void scoutfs_inode_set_flags(struct inode *inode, u32 and, u32 or);
u64 scoutfs_inode_get_proj(struct inode *inode);
void scoutfs_inode_set_proj(struct inode *inode, u64 proj);
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock);
int scoutfs_inode_check_retention(struct inode *inode);
int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock);
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,

View File

@@ -42,10 +42,6 @@
#include "alloc.h"
#include "server.h"
#include "counters.h"
#include "attr_x.h"
#include "totl.h"
#include "wkic.h"
#include "quota.h"
#include "scoutfs_trace.h"
/*
@@ -549,41 +545,20 @@ out:
static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct scoutfs_ioctl_inode_attr_x *iax = NULL;
struct scoutfs_ioctl_stat_more *stm = NULL;
int ret;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_ioctl_stat_more stm;
iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
stm = kmalloc(sizeof(struct scoutfs_ioctl_stat_more), GFP_KERNEL);
if (!iax || !stm) {
ret = -ENOMEM;
goto out;
}
stm.meta_seq = scoutfs_inode_meta_seq(inode);
stm.data_seq = scoutfs_inode_data_seq(inode);
stm.data_version = scoutfs_inode_data_version(inode);
scoutfs_inode_get_onoff(inode, &stm.online_blocks, &stm.offline_blocks);
stm.crtime_sec = si->crtime.tv_sec;
stm.crtime_nsec = si->crtime.tv_nsec;
iax->x_mask = SCOUTFS_IOC_IAX_META_SEQ | SCOUTFS_IOC_IAX_DATA_SEQ |
SCOUTFS_IOC_IAX_DATA_VERSION | SCOUTFS_IOC_IAX_ONLINE_BLOCKS |
SCOUTFS_IOC_IAX_OFFLINE_BLOCKS | SCOUTFS_IOC_IAX_CRTIME;
iax->x_flags = 0;
ret = scoutfs_get_attr_x(inode, iax);
if (ret < 0)
goto out;
if (copy_to_user((void __user *)arg, &stm, sizeof(stm)))
return -EFAULT;
stm->meta_seq = iax->meta_seq;
stm->data_seq = iax->data_seq;
stm->data_version = iax->data_version;
stm->online_blocks = iax->online_blocks;
stm->offline_blocks = iax->offline_blocks;
stm->crtime_sec = iax->crtime_sec;
stm->crtime_nsec = iax->crtime_nsec;
if (copy_to_user((void __user *)arg, stm, sizeof(struct scoutfs_ioctl_stat_more)))
ret = -EFAULT;
else
ret = 0;
out:
kfree(iax);
kfree(stm);
return ret;
return 0;
}
static bool inc_wrapped(u64 *ino, u64 *iblock)
@@ -640,19 +615,24 @@ static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
* This is used when restoring files, it lets the caller set all the
* inode attributes which are otherwise unreachable. Changing the file
* size can only be done for regular files with a data_version of 0.
*
* We unconditionally fill the iax attributes from the sm set and let
* set_attr_x check them.
*/
static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct inode *inode = file->f_inode;
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_ioctl_setattr_more __user *usm = (void __user *)arg;
struct scoutfs_ioctl_inode_attr_x *iax = NULL;
struct scoutfs_ioctl_setattr_more sm;
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
bool set_data_seq;
int ret;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
if (!(file->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
@@ -663,38 +643,65 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
goto out;
}
if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_UNKNOWN) {
if ((sm.i_size > 0 && sm.data_version == 0) ||
((sm.flags & SCOUTFS_IOC_SETATTR_MORE_OFFLINE) && !sm.i_size) ||
(sm.flags & SCOUTFS_IOC_SETATTR_MORE_UNKNOWN)) {
ret = -EINVAL;
goto out;
}
iax = kzalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
if (!iax) {
ret = -ENOMEM;
ret = mnt_want_write_file(file);
if (ret)
goto out;
inode_lock(inode);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
if (ret)
goto unlock;
/* can only change size/dv on untouched regular files */
if ((sm.i_size != 0 || sm.data_version != 0) &&
((!S_ISREG(inode->i_mode) ||
scoutfs_inode_data_version(inode) != 0))) {
ret = -EINVAL;
goto unlock;
}
iax->x_mask = SCOUTFS_IOC_IAX_DATA_VERSION | SCOUTFS_IOC_IAX_CTIME |
SCOUTFS_IOC_IAX_CRTIME | SCOUTFS_IOC_IAX_SIZE;
iax->data_version = sm.data_version;
iax->ctime_sec = sm.ctime_sec;
iax->ctime_nsec = sm.ctime_nsec;
iax->crtime_sec = sm.crtime_sec;
iax->crtime_nsec = sm.crtime_nsec;
iax->size = sm.i_size;
/* create offline extents in potentially many transactions */
if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_OFFLINE) {
ret = scoutfs_data_init_offline_extent(inode, sm.i_size, lock);
if (ret)
goto unlock;
}
if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_OFFLINE)
iax->x_flags |= SCOUTFS_IOC_IAX_F_SIZE_OFFLINE;
/* setting only so we don't see 0 data seq with nonzero data_version */
set_data_seq = sm.data_version != 0 ? true : false;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
if (ret)
goto unlock;
ret = mnt_want_write_file(file);
if (ret < 0)
goto out;
if (sm.data_version)
scoutfs_inode_set_data_version(inode, sm.data_version);
if (sm.i_size)
i_size_write(inode, sm.i_size);
inode->i_ctime.tv_sec = sm.ctime_sec;
inode->i_ctime.tv_nsec = sm.ctime_nsec;
si->crtime.tv_sec = sm.crtime_sec;
si->crtime.tv_nsec = sm.crtime_nsec;
ret = scoutfs_set_attr_x(inode, iax);
scoutfs_update_inode_item(inode, lock, &ind_locks);
ret = 0;
scoutfs_release_trans(sb);
unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
inode_unlock(inode);
mnt_drop_write_file(file);
out:
kfree(iax);
return ret;
}
@@ -1028,32 +1035,124 @@ out:
return ret;
}
struct read_xattr_total_iter_cb_args {
struct scoutfs_ioctl_xattr_total *xt;
unsigned int copied;
unsigned int total;
struct xattr_total_entry {
struct rb_node node;
struct scoutfs_ioctl_xattr_total xt;
u64 fs_seq;
u64 fs_total;
u64 fs_count;
u64 fin_seq;
u64 fin_total;
s64 fin_count;
u64 log_seq;
u64 log_total;
s64 log_count;
};
/*
* This is called under an RCU read lock so it can't copy to userspace.
*/
static int read_xattr_total_iter_cb(struct scoutfs_key *key, void *val, unsigned int val_len,
void *cb_arg)
static int cmp_xt_entry_name(const struct xattr_total_entry *a,
const struct xattr_total_entry *b)
{
return scoutfs_cmp_u64s(a->xt.name[0], b->xt.name[0]) ?:
scoutfs_cmp_u64s(a->xt.name[1], b->xt.name[1]) ?:
scoutfs_cmp_u64s(a->xt.name[2], b->xt.name[2]);
}
/*
* Record the contribution of the three classes of logged items we can
* see: the item in the fs_root, items from finalized log btrees, and
* items from active log btrees. Once we have the full set the caller
* can decide which of the items contribute to the total it sends to the
* user.
*/
static int read_xattr_total_item(struct super_block *sb, struct scoutfs_key *key,
u64 seq, u8 flags, void *val, int val_len, int fic, void *arg)
{
struct read_xattr_total_iter_cb_args *cba = cb_arg;
struct scoutfs_xattr_totl_val *tval = val;
struct scoutfs_ioctl_xattr_total *xt = &cba->xt[cba->copied];
struct xattr_total_entry *ent;
struct xattr_total_entry rd;
struct rb_root *root = arg;
struct rb_node *parent;
struct rb_node **node;
int cmp;
xt->name[0] = le64_to_cpu(key->skxt_a);
xt->name[1] = le64_to_cpu(key->skxt_b);
xt->name[2] = le64_to_cpu(key->skxt_c);
xt->total = le64_to_cpu(tval->total);
xt->count = le64_to_cpu(tval->count);
rd.xt.name[0] = le64_to_cpu(key->skxt_a);
rd.xt.name[1] = le64_to_cpu(key->skxt_b);
rd.xt.name[2] = le64_to_cpu(key->skxt_c);
if (++cba->copied < cba->total)
return -EAGAIN;
else
return 0;
/* find entry matching name */
node = &root->rb_node;
parent = NULL;
cmp = -1;
while (*node) {
parent = *node;
ent = container_of(*node, struct xattr_total_entry, node);
/* sort merge items by key then newest to oldest */
cmp = cmp_xt_entry_name(&rd, ent);
if (cmp < 0)
node = &(*node)->rb_left;
else if (cmp > 0)
node = &(*node)->rb_right;
else
break;
}
/* allocate and insert new node if we need to */
if (cmp != 0) {
ent = kzalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
memcpy(&ent->xt.name, &rd.xt.name, sizeof(ent->xt.name));
rb_link_node(&ent->node, parent, node);
rb_insert_color(&ent->node, root);
}
if (fic & FIC_FS_ROOT) {
ent->fs_seq = seq;
ent->fs_total = le64_to_cpu(tval->total);
ent->fs_count = le64_to_cpu(tval->count);
} else if (fic & FIC_FINALIZED) {
ent->fin_seq = seq;
ent->fin_total += le64_to_cpu(tval->total);
ent->fin_count += le64_to_cpu(tval->count);
} else {
ent->log_seq = seq;
ent->log_total += le64_to_cpu(tval->total);
ent->log_count += le64_to_cpu(tval->count);
}
scoutfs_inc_counter(sb, totl_read_item);
return 0;
}
/* these are always _safe, node stores next */
#define for_each_xt_ent(ent, node, root) \
for (node = rb_first(root); \
node && (ent = rb_entry(node, struct xattr_total_entry, node), \
node = rb_next(node), 1); )
#define for_each_xt_ent_reverse(ent, node, root) \
for (node = rb_last(root); \
node && (ent = rb_entry(node, struct xattr_total_entry, node), \
node = rb_prev(node), 1); )
static void free_xt_ent(struct rb_root *root, struct xattr_total_entry *ent)
{
rb_erase(&ent->node, root);
kfree(ent);
}
static void free_all_xt_ents(struct rb_root *root)
{
struct xattr_total_entry *ent;
struct rb_node *node;
for_each_xt_ent(ent, node, root)
free_xt_ent(root, ent);
}
/*
@@ -1063,6 +1162,30 @@ static int read_xattr_total_iter_cb(struct scoutfs_key *key, void *val, unsigned
* have been committed. It doesn't use locking to force commits and
* block writers so it can be a little bit out of date with respect to
* dirty xattrs in memory across the system.
*
* Our reader has to be careful because the log btree merging code can
* write partial results to the fs_root. This means that a reader can
* see both cases where new finalized logs should be applied to the old
* fs items and where old finalized logs have already been applied to
* the partially merged fs items. Currently active logged items are
* always applied on top of all cases.
*
* These cases are differentiated with a combination of sequence numbers
* in items, the count of contributing xattrs, and a flag
* differentiating finalized and active logged items. This lets us
* recognize all cases, including when finalized logs were merged and
* deleted the fs item.
*
* We're allocating a tracking struct for each totl name we see while
* traversing the item btrees. The forest reader is providing the items
* it finds in leaf blocks that contain the search key. In the worst
* case all of these blocks are full and none of the items overlap. At
* most, figure order a thousand names per mount. But in practice many
of these factors fall away: leaf blocks aren't full, leaf items
* overlap, there aren't finalized log btrees, and not all mounts are
* actively changing totals. We're much more likely to only read a
* leaf block's worth of totals that have been long since merged into
* the fs_root.
*/
static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg)
{
@@ -1070,13 +1193,14 @@ static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg)
struct scoutfs_ioctl_read_xattr_totals __user *urxt = (void __user *)arg;
struct scoutfs_ioctl_read_xattr_totals rxt;
struct scoutfs_ioctl_xattr_total __user *uxt;
struct read_xattr_total_iter_cb_args cba = {NULL, };
struct scoutfs_key range_start;
struct scoutfs_key range_end;
struct xattr_total_entry *ent;
struct scoutfs_key key;
unsigned int copied = 0;
unsigned int total;
unsigned int ready;
struct scoutfs_key bloom_key;
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root = RB_ROOT;
struct rb_node *node;
int count = 0;
int ret;
if (!(file->f_mode & FMODE_READ)) {
@@ -1089,13 +1213,6 @@ static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg)
goto out;
}
cba.xt = (void *)__get_free_page(GFP_KERNEL);
if (!cba.xt) {
ret = -ENOMEM;
goto out;
}
cba.total = PAGE_SIZE / sizeof(struct scoutfs_ioctl_xattr_total);
if (copy_from_user(&rxt, urxt, sizeof(rxt))) {
ret = -EFAULT;
goto out;
@@ -1108,40 +1225,101 @@ static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg)
goto out;
}
total = div_u64(min_t(u64, rxt.totals_bytes, INT_MAX),
sizeof(struct scoutfs_ioctl_xattr_total));
scoutfs_key_set_zeros(&bloom_key);
bloom_key.sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
scoutfs_xattr_init_totl_key(&start, rxt.pos_name);
scoutfs_totl_set_range(&range_start, &range_end);
scoutfs_xattr_init_totl_key(&key, rxt.pos_name);
while (rxt.totals_bytes >= sizeof(struct scoutfs_ioctl_xattr_total)) {
while (copied < total) {
cba.copied = 0;
ret = scoutfs_wkic_iterate(sb, &key, &range_end, &range_start, &range_end,
read_xattr_total_iter_cb, &cba);
if (ret < 0)
goto out;
if (cba.copied == 0)
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
if (scoutfs_key_compare(&start, &end) > 0)
break;
ready = min(total - copied, cba.copied);
if (copy_to_user(&uxt[copied], cba.xt, ready * sizeof(cba.xt[0]))) {
ret = -EFAULT;
key = start;
ret = scoutfs_forest_read_items(sb, &key, &bloom_key, &start, &end,
read_xattr_total_item, &root);
if (ret < 0) {
if (ret == -ESTALE) {
free_all_xt_ents(&root);
continue;
}
goto out;
}
scoutfs_xattr_init_totl_key(&key, cba.xt[ready - 1].name);
scoutfs_key_inc(&key);
copied += ready;
if (RB_EMPTY_ROOT(&root))
break;
/* trim totals that fall outside of the consistent range */
for_each_xt_ent(ent, node, &root) {
scoutfs_xattr_init_totl_key(&key, ent->xt.name);
if (scoutfs_key_compare(&key, &start) < 0) {
free_xt_ent(&root, ent);
} else {
break;
}
}
for_each_xt_ent_reverse(ent, node, &root) {
scoutfs_xattr_init_totl_key(&key, ent->xt.name);
if (scoutfs_key_compare(&key, &end) > 0) {
free_xt_ent(&root, ent);
} else {
break;
}
}
/* copy resulting unique non-zero totals to userspace */
for_each_xt_ent(ent, node, &root) {
if (rxt.totals_bytes < sizeof(ent->xt))
break;
/* start with the fs item if we have it */
if (ent->fs_seq != 0) {
ent->xt.total = ent->fs_total;
ent->xt.count = ent->fs_count;
scoutfs_inc_counter(sb, totl_read_fs);
}
/* apply finalized logs if they're newer or creating */
if (((ent->fs_seq != 0) && (ent->fin_seq > ent->fs_seq)) ||
((ent->fs_seq == 0) && (ent->fin_count > 0))) {
ent->xt.total += ent->fin_total;
ent->xt.count += ent->fin_count;
scoutfs_inc_counter(sb, totl_read_finalized);
}
/* always apply active logs which must be newer than fs and finalized */
if (ent->log_seq > 0) {
ent->xt.total += ent->log_total;
ent->xt.count += ent->log_count;
scoutfs_inc_counter(sb, totl_read_logged);
}
if (ent->xt.total != 0 || ent->xt.count != 0) {
if (copy_to_user(uxt, &ent->xt, sizeof(ent->xt))) {
ret = -EFAULT;
goto out;
}
uxt++;
rxt.totals_bytes -= sizeof(ent->xt);
count++;
scoutfs_inc_counter(sb, totl_read_copied);
}
free_xt_ent(&root, ent);
}
/* continue after the last possible key read */
start = end;
scoutfs_key_inc(&start);
}
ret = 0;
out:
if (cba.xt)
free_page((long)cba.xt);
free_all_xt_ents(&root);
return ret ?: copied;
return ret ?: count;
}
static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
@@ -1326,265 +1504,6 @@ out:
return nr ?: ret;
}
static long scoutfs_ioc_get_attr_x(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct scoutfs_ioctl_inode_attr_x __user *uiax = (void __user *)arg;
struct scoutfs_ioctl_inode_attr_x *iax = NULL;
int ret;
iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
if (!iax) {
ret = -ENOMEM;
goto out;
}
ret = get_user(iax->x_mask, &uiax->x_mask) ?:
get_user(iax->x_flags, &uiax->x_flags);
if (ret < 0)
goto out;
ret = scoutfs_get_attr_x(inode, iax);
if (ret < 0)
goto out;
/* only copy results after dropping cluster locks (could fault) */
if (ret > 0 && copy_to_user(uiax, iax, ret) != 0)
ret = -EFAULT;
else
ret = 0;
out:
kfree(iax);
return ret;
}
static long scoutfs_ioc_set_attr_x(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct scoutfs_ioctl_inode_attr_x __user *uiax = (void __user *)arg;
struct scoutfs_ioctl_inode_attr_x *iax = NULL;
int ret;
iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
if (!iax) {
ret = -ENOMEM;
goto out;
}
if (copy_from_user(iax, uiax, sizeof(struct scoutfs_ioctl_inode_attr_x))) {
ret = -EFAULT;
goto out;
}
ret = mnt_want_write_file(file);
if (ret < 0)
goto out;
ret = scoutfs_set_attr_x(inode, iax);
mnt_drop_write_file(file);
out:
kfree(iax);
return ret;
}
static long scoutfs_ioc_get_quota_rules(struct file *file, unsigned long arg)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_get_quota_rules __user *ugqr = (void __user *)arg;
struct scoutfs_ioctl_get_quota_rules gqr;
struct scoutfs_ioctl_quota_rule __user *uirules;
struct scoutfs_ioctl_quota_rule *irules;
struct page *page = NULL;
int copied = 0;
int nr;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&gqr, ugqr, sizeof(gqr)))
return -EFAULT;
if (gqr.rules_nr == 0)
return 0;
uirules = (void __user *)gqr.rules_ptr;
/* limit rules copied per call */
gqr.rules_nr = min_t(u64, gqr.rules_nr, INT_MAX);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
ret = -ENOMEM;
goto out;
}
irules = page_address(page);
while (copied < gqr.rules_nr) {
nr = min_t(u64, gqr.rules_nr - copied,
PAGE_SIZE / sizeof(struct scoutfs_ioctl_quota_rule));
ret = scoutfs_quota_get_rules(sb, gqr.iterator, page_address(page), nr);
if (ret <= 0)
goto out;
if (copy_to_user(&uirules[copied], irules, ret * sizeof(irules[0]))) {
ret = -EFAULT;
goto out;
}
copied += ret;
}
ret = 0;
out:
if (page)
__free_page(page);
if (ret == 0 && copy_to_user(ugqr->iterator, gqr.iterator, sizeof(gqr.iterator)))
ret = -EFAULT;
return ret ?: copied;
}
static long scoutfs_ioc_mod_quota_rule(struct file *file, unsigned long arg, bool is_add)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_quota_rule __user *uirule = (void __user *)arg;
struct scoutfs_ioctl_quota_rule irule;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&irule, uirule, sizeof(irule)))
return -EFAULT;
return scoutfs_quota_mod_rule(sb, is_add, &irule);
}
struct read_index_buf {
int nr;
int size;
struct scoutfs_ioctl_xattr_index_entry ents[0];
};
#define READ_INDEX_BUF_MAX_ENTS \
((PAGE_SIZE - sizeof(struct read_index_buf)) / \
sizeof(struct scoutfs_ioctl_xattr_index_entry))
/*
* This doesn't filter out duplicates; the caller filters them out so
* that duplicates spanning iteration calls are also caught.
*/
static int read_index_cb(struct scoutfs_key *key, void *val, unsigned int val_len, void *cb_arg)
{
struct read_index_buf *rib = cb_arg;
struct scoutfs_ioctl_xattr_index_entry *ent = &rib->ents[rib->nr];
u64 xid;
if (val_len != 0)
return -EIO;
/* discard the xid, xids aren't exposed to ioctl callers */
scoutfs_xattr_get_indx_key(key, &ent->major, &ent->minor, &ent->ino, &xid);
if (++rib->nr == rib->size)
return rib->nr;
return -EAGAIN;
}
static long scoutfs_ioc_read_xattr_index(struct file *file, unsigned long arg)
{
struct super_block *sb = file_inode(file)->i_sb;
struct scoutfs_ioctl_read_xattr_index __user *urxi = (void __user *)arg;
struct scoutfs_ioctl_xattr_index_entry __user *uents;
struct scoutfs_ioctl_xattr_index_entry *ent;
struct scoutfs_ioctl_xattr_index_entry prev;
struct scoutfs_ioctl_read_xattr_index rxi;
struct read_index_buf *rib;
struct page *page = NULL;
struct scoutfs_key first;
struct scoutfs_key last;
struct scoutfs_key start;
struct scoutfs_key end;
int copied = 0;
int ret;
int i;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
if (copy_from_user(&rxi, urxi, sizeof(rxi))) {
ret = -EFAULT;
goto out;
}
uents = (void __user *)rxi.entries_ptr;
rxi.entries_nr = min_t(u64, rxi.entries_nr, INT_MAX);
page = alloc_page(GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
goto out;
}
rib = page_address(page);
scoutfs_xattr_init_indx_key(&first, rxi.first.major, rxi.first.minor, rxi.first.ino, 0);
scoutfs_xattr_init_indx_key(&last, rxi.last.major, rxi.last.minor, rxi.last.ino, U64_MAX);
scoutfs_xattr_indx_get_range(&start, &end);
if (scoutfs_key_compare(&first, &last) > 0) {
ret = -EINVAL;
goto out;
}
/* 0 ino doesn't exist, can't ever match entry to return */
memset(&prev, 0, sizeof(prev));
while (copied < rxi.entries_nr) {
rib->nr = 0;
rib->size = min_t(u64, rxi.entries_nr - copied, READ_INDEX_BUF_MAX_ENTS);
ret = scoutfs_wkic_iterate(sb, &first, &last, &start, &end,
read_index_cb, rib);
if (ret < 0)
goto out;
if (rib->nr == 0)
break;
/*
* Copy entries to userspace, skipping duplicate entries
* that can result from multiple xattrs indexing an
* inode at the same position and which can span
* multiple cache iterations. (Comparing in order of
* most likely to change to fail fast.)
*/
for (i = 0, ent = rib->ents; i < rib->nr; i++, ent++) {
if (ent->ino == prev.ino && ent->minor == prev.minor &&
ent->major == prev.major)
continue;
if (copy_to_user(&uents[copied], ent, sizeof(*ent))) {
ret = -EFAULT;
goto out;
}
prev = *ent;
copied++;
}
scoutfs_xattr_init_indx_key(&first, prev.major, prev.minor, prev.ino, U64_MAX);
scoutfs_key_inc(&first);
}
ret = copied;
out:
if (page)
__free_page(page);
return ret;
}
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1622,18 +1541,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return scoutfs_ioc_get_allocated_inos(file, arg);
case SCOUTFS_IOC_GET_REFERRING_ENTRIES:
return scoutfs_ioc_get_referring_entries(file, arg);
case SCOUTFS_IOC_GET_ATTR_X:
return scoutfs_ioc_get_attr_x(file, arg);
case SCOUTFS_IOC_SET_ATTR_X:
return scoutfs_ioc_set_attr_x(file, arg);
case SCOUTFS_IOC_GET_QUOTA_RULES:
return scoutfs_ioc_get_quota_rules(file, arg);
case SCOUTFS_IOC_ADD_QUOTA_RULE:
return scoutfs_ioc_mod_quota_rule(file, arg, true);
case SCOUTFS_IOC_DEL_QUOTA_RULE:
return scoutfs_ioc_mod_quota_rule(file, arg, false);
case SCOUTFS_IOC_READ_XATTR_INDEX:
return scoutfs_ioc_read_xattr_index(file, arg);
}
return -ENOTTY;

View File

@@ -673,174 +673,4 @@ struct scoutfs_ioctl_dirent {
#define SCOUTFS_IOC_GET_REFERRING_ENTRIES \
_IOW(SCOUTFS_IOCTL_MAGIC, 17, struct scoutfs_ioctl_get_referring_entries)
struct scoutfs_ioctl_inode_attr_x {
__u64 x_mask;
__u64 x_flags;
__u64 meta_seq;
__u64 data_seq;
__u64 data_version;
__u64 online_blocks;
__u64 offline_blocks;
__u64 ctime_sec;
__u32 ctime_nsec;
__u32 crtime_nsec;
__u64 crtime_sec;
__u64 size;
__u64 bits;
__u64 project_id;
};
/*
* Behavioral flags set in the x_flags field. These flags don't
* necessarily correspond to specific attributes, but instead change the
* behaviour of a _get_ or _set_ operation.
*
* @SCOUTFS_IOC_IAX_F_SIZE_OFFLINE: When setting i_size, also create
* extents which are marked offline for the region of the file from
* offset 0 to the new set size. This can only be set when setting the
* size and has no effect if setting the size fails.
*/
#define SCOUTFS_IOC_IAX_F_SIZE_OFFLINE (1ULL << 0)
#define SCOUTFS_IOC_IAX_F__UNKNOWN (U64_MAX << 1)
/*
* Single-bit attribute values stored in the @bits field, each
* indicating whether the attribute is set.  The main _IAX_ bits set in
* x_mask indicate whether each value bit is populated by _get or
* stored by _set.
*/
#define SCOUTFS_IOC_IAX_B_RETENTION (1ULL << 0)
/*
* x_mask bits which indicate which attributes of the inode to populate
* on return for _get or to set on the inode for _set. Each mask bit
* corresponds to the matching named field in the attr_x struct passed
* to the _get_ and _set_ calls.
*
* Each field can have different permissions or other attribute
* requirements which can cause calls to fail. If _set_ fails then no
* other attribute changes will have been made by the same call.
*
* @SCOUTFS_IOC_IAX_RETENTION: Mark a file for retention. When marked,
* no modification can be made to the file other than changing extended
* attributes outside the "user." prefix and clearing the retention
* mark. This can only be set on regular files and requires root (the
* CAP_SYS_ADMIN capability). Other attributes can be set with a
* set_attr_x call on a retention inode as long as that call also
* successfully clears the retention mark.
*/
#define SCOUTFS_IOC_IAX_META_SEQ (1ULL << 0)
#define SCOUTFS_IOC_IAX_DATA_SEQ (1ULL << 1)
#define SCOUTFS_IOC_IAX_DATA_VERSION (1ULL << 2)
#define SCOUTFS_IOC_IAX_ONLINE_BLOCKS (1ULL << 3)
#define SCOUTFS_IOC_IAX_OFFLINE_BLOCKS (1ULL << 4)
#define SCOUTFS_IOC_IAX_CTIME (1ULL << 5)
#define SCOUTFS_IOC_IAX_CRTIME (1ULL << 6)
#define SCOUTFS_IOC_IAX_SIZE (1ULL << 7)
#define SCOUTFS_IOC_IAX_RETENTION (1ULL << 8)
#define SCOUTFS_IOC_IAX_PROJECT_ID (1ULL << 9)
/* single bit attributes that are packed in the bits field as _B_ */
#define SCOUTFS_IOC_IAX__BITS (SCOUTFS_IOC_IAX_RETENTION)
/* inverse of all the bits we understand */
#define SCOUTFS_IOC_IAX__UNKNOWN (U64_MAX << 10)
#define SCOUTFS_IOC_GET_ATTR_X \
_IOW(SCOUTFS_IOCTL_MAGIC, 18, struct scoutfs_ioctl_inode_attr_x)
#define SCOUTFS_IOC_SET_ATTR_X \
_IOW(SCOUTFS_IOCTL_MAGIC, 19, struct scoutfs_ioctl_inode_attr_x)
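To make the mask and bit semantics concrete, here is a hedged userspace sketch (not from the tree; the "ioctl.h" include path and the helper name are illustrative assumptions) that sets the retention bit on a regular file and reads it back:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"

static int set_and_check_retention(const char *path)
{
	struct scoutfs_ioctl_inode_attr_x iax;
	int ret;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	/* set only the retention bit; needs CAP_SYS_ADMIN and a regular file */
	memset(&iax, 0, sizeof(iax));
	iax.x_mask = SCOUTFS_IOC_IAX_RETENTION;
	iax.bits = SCOUTFS_IOC_IAX_B_RETENTION;
	ret = ioctl(fd, SCOUTFS_IOC_SET_ATTR_X, &iax);

	if (ret == 0) {
		/* ask _get to populate the same bit and verify it stuck */
		memset(&iax, 0, sizeof(iax));
		iax.x_mask = SCOUTFS_IOC_IAX_RETENTION;
		ret = ioctl(fd, SCOUTFS_IOC_GET_ATTR_X, &iax);
		if (ret == 0)
			ret = !(iax.bits & SCOUTFS_IOC_IAX_B_RETENTION);
	}

	close(fd);
	return ret;
}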
/*
* (These fields are documented in the order that they're displayed by
* the scoutfs cli utility which matches the sort order of the rules.)
*
* @prio: The priority of the rule. Rules are sorted by their fields
* with prio at the highest magnitude.  When multiple rules match, the
* rule with the highest sort order is enforced.  The priority field
* lets rules override the default field sort order.
*
* @name_val[3]: The three 64bit values that make up the name of the
* totl xattr whose total will be checked against the rule's limit to
* see if the quota rule has been exceeded. The behavior of the values
* can be changed by their corresponding name_source and name_flags.
*
* @name_source[3]: The SQ_NS_ enums that control where the value comes
* from. _LITERAL uses the value from name_val. Inode attribute
* sources (_PROJ, _UID, _GID) are taken from the inode of the operation
* that is being checked against the rule.
*
* @name_flags[3]: The SQ_NF_ enums that alter the name values. _SELECT
* makes the rule only match if the inode attribute of the operation
* matches the attribute value stored in name_val. This lets rules
* match a specific value of an attribute rather than mapping all
* attribute values to totl names.
*
* @op: The SQ_OP_ enums which specify the operation that can't exceed
* the rule's limit. _INODE checks inode creation and the inode
* attributes are taken from the inode that would be created. _DATA
* checks file data block allocation and the inode fields come from the
* inode that is allocating the blocks.
*
* @limit: The 64bit value that is checked against the totl value
* described by the rule.  If the totl value is greater than or equal
* to the matching rule's limit then the operation will return
* -EDQUOT.
*
* @rule_flags: SQ_RF_TOTL_COUNT indicates that the rule's limit should
* be checked against the number of xattrs contributing to a totl value
* instead of the sum of the xattrs.
*/
struct scoutfs_ioctl_quota_rule {
__u64 name_val[3];
__u64 limit;
__u8 prio;
__u8 op;
__u8 rule_flags;
__u8 name_source[3];
__u8 name_flags[3];
__u8 _pad[7];
};
struct scoutfs_ioctl_get_quota_rules {
__u64 iterator[2];
__u64 rules_ptr;
__u64 rules_nr;
};
/*
* Rules are uniquely identified by their non-padded fields. Addition will fail
* with -EEXIST if the specified rule already exists, and deletion must
* find a rule with all matching fields to delete.
*/
#define SCOUTFS_IOC_GET_QUOTA_RULES \
_IOR(SCOUTFS_IOCTL_MAGIC, 20, struct scoutfs_ioctl_get_quota_rules)
#define SCOUTFS_IOC_ADD_QUOTA_RULE \
_IOW(SCOUTFS_IOCTL_MAGIC, 21, struct scoutfs_ioctl_quota_rule)
#define SCOUTFS_IOC_DEL_QUOTA_RULE \
_IOW(SCOUTFS_IOCTL_MAGIC, 22, struct scoutfs_ioctl_quota_rule)
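To illustrate how the name sources, selection flags, and op combine, here is a hypothetical sketch of installing a rule; the exact SQ_NS_/SQ_NF_/SQ_OP_ enum spellings are assumed from the field comments above and the helper itself is not part of the tree:

#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"

/* cap inode creation for one project ID against a literal totl name */
static int add_project_inode_limit(int fs_fd, __u64 proj_id, __u64 limit)
{
	struct scoutfs_ioctl_quota_rule rule;

	memset(&rule, 0, sizeof(rule));
	rule.op = SQ_OP_INODE;		/* checked as inodes are created */
	rule.limit = limit;

	/* name[0] makes the rule match only inodes with this project ID */
	rule.name_val[0] = proj_id;
	rule.name_source[0] = SQ_NS_PROJ;
	rule.name_flags[0] = SQ_NF_SELECT;

	/* the remaining names are literal components of the totl name */
	rule.name_source[1] = SQ_NS_LITERAL;
	rule.name_source[2] = SQ_NS_LITERAL;

	/* fs_fd is any open fd in the volume; requires CAP_SYS_ADMIN */
	return ioctl(fs_fd, SCOUTFS_IOC_ADD_QUOTA_RULE, &rule);
}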
/*
* Inodes can be indexed in a global key space at a position determined
* by a .indx. tagged xattr. The xattr name specifies the two index
* position values, with major having the more significant comparison
* order.
*/
struct scoutfs_ioctl_xattr_index_entry {
__u64 minor;
__u64 ino;
__u8 major;
__u8 _pad[7];
};
struct scoutfs_ioctl_read_xattr_index {
__u64 flags;
struct scoutfs_ioctl_xattr_index_entry first;
struct scoutfs_ioctl_xattr_index_entry last;
__u64 entries_ptr;
__u64 entries_nr;
};
#define SCOUTFS_IOC_READ_XATTR_INDEX \
_IOR(SCOUTFS_IOCTL_MAGIC, 23, struct scoutfs_ioctl_read_xattr_index)
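A hedged sketch of walking the index for one major value follows; the batching and resume step mirror the kernel iteration above, while the helper name and the simplified carry handling are assumptions:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"

static int print_index_major(int fs_fd, __u8 major)
{
	struct scoutfs_ioctl_xattr_index_entry ents[64];
	struct scoutfs_ioctl_read_xattr_index rxi;
	int nr;
	int i;

	memset(&rxi, 0, sizeof(rxi));
	rxi.first.major = major;
	rxi.last.major = major;
	rxi.last.minor = ~0ULL;
	rxi.last.ino = ~0ULL;
	rxi.entries_ptr = (__u64)(unsigned long)ents;
	rxi.entries_nr = 64;

	/* the ioctl returns the number of entries copied, 0 when done */
	while ((nr = ioctl(fs_fd, SCOUTFS_IOC_READ_XATTR_INDEX, &rxi)) > 0) {
		for (i = 0; i < nr; i++)
			printf("%u.%llu ino %llu\n", ents[i].major,
			       (unsigned long long)ents[i].minor,
			       (unsigned long long)ents[i].ino);

		/* resume just past the last returned position; carrying
		 * an ino wrap into minor is shown, further carry elided */
		rxi.first = ents[nr - 1];
		if (++rxi.first.ino == 0)
			rxi.first.minor++;
	}

	return nr < 0 ? -1 : 0;
}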
#endif

View File

@@ -24,7 +24,6 @@
#include "item.h"
#include "forest.h"
#include "block.h"
#include "msg.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
@@ -1671,24 +1670,13 @@ out:
return ret;
}
static int lock_safe(struct super_block *sb, struct scoutfs_lock *lock, struct scoutfs_key *key,
static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key,
int mode)
{
bool prot = scoutfs_lock_protected(lock, key, mode);
if (!prot) {
static bool once = false;
if (!once) {
scoutfs_err(sb, "lock (start "SK_FMT" end "SK_FMT" mode 0x%x) does not protect operation (key "SK_FMT" mode 0x%x)",
SK_ARG(&lock->start), SK_ARG(&lock->end), lock->mode,
SK_ARG(key), mode);
dump_stack();
once = true;
}
if (WARN_ON_ONCE(!scoutfs_lock_protected(lock, key, mode)))
return -EINVAL;
}
return 0;
else
return 0;
}
static int optional_lock_mode_match(struct scoutfs_lock *lock, int mode)
@@ -1720,8 +1708,8 @@ static int copy_val(void *dst, int dst_len, void *src, int src_len)
* The number of bytes copied is returned; it can be 0, and it can be
* truncated if the caller's buffer isn't big enough.
*/
static int item_lookup(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, int len_limit, struct scoutfs_lock *lock)
int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock)
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
struct cached_item *item;
@@ -1730,7 +1718,7 @@ static int item_lookup(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_lookup);
if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_READ)))
if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ)))
goto out;
ret = get_cached_page(sb, cinf, lock, key, false, false, 0, &pg);
@@ -1741,8 +1729,6 @@ static int item_lookup(struct super_block *sb, struct scoutfs_key *key,
item = item_rbtree_walk(&pg->item_root, key, NULL, NULL, NULL);
if (!item || item->deletion)
ret = -ENOENT;
else if (len_limit > 0 && item->val_len > len_limit)
ret = -EIO;
else
ret = copy_val(val, val_len, item->val, item->val_len);
@@ -1751,38 +1737,13 @@ out:
return ret;
}
int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock)
{
return item_lookup(sb, key, val, val_len, 0, lock);
}
/*
* Copy an item's value into the caller's buffer. If the item's value
* is larger than the caller's buffer then -EIO is returned. If the
* item is smaller then the bytes from the end of the copied value to
* the end of the buffer are zeroed. The number of value bytes copied
* is returned, and 0 can be returned for an item with no value.
*/
int scoutfs_item_lookup_smaller_zero(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock)
{
int ret;
ret = item_lookup(sb, key, val, val_len, val_len, lock);
if (ret >= 0 && ret < val_len)
memset(val + ret, 0, val_len - ret);
return ret;
}
int scoutfs_item_lookup_exact(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len,
struct scoutfs_lock *lock)
{
int ret;
ret = item_lookup(sb, key, val, val_len, 0, lock);
ret = scoutfs_item_lookup(sb, key, val, val_len, lock);
if (ret == val_len)
ret = 0;
else if (ret >= 0)
@@ -1832,7 +1793,7 @@ int scoutfs_item_next(struct super_block *sb, struct scoutfs_key *key,
goto out;
}
if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_READ)))
if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ)))
goto out;
pos = *key;
@@ -1913,7 +1874,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_dirty);
if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE)))
if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE)))
goto out;
ret = scoutfs_forest_set_bloom_bits(sb, lock);
@@ -1959,7 +1920,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_create);
if ((ret = lock_safe(sb, lock, key, mode)) ||
if ((ret = lock_safe(lock, key, mode)) ||
(ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE)))
goto out;
@@ -2002,7 +1963,7 @@ int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock)
{
return item_create(sb, key, val, val_len, lock, NULL,
SCOUTFS_LOCK_WRITE, false);
SCOUTFS_LOCK_READ, false);
}
int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key,
@@ -2033,7 +1994,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_update);
if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE)))
if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE)))
goto out;
ret = scoutfs_forest_set_bloom_bits(sb, lock);
@@ -2101,7 +2062,7 @@ int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_delta);
if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE_ONLY)))
if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE_ONLY)))
goto out;
ret = scoutfs_forest_set_bloom_bits(sb, lock);
@@ -2174,7 +2135,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
scoutfs_inc_counter(sb, item_delete);
if ((ret = lock_safe(sb, lock, key, mode)) ||
if ((ret = lock_safe(lock, key, mode)) ||
(ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE)))
goto out;

View File

@@ -3,8 +3,6 @@
int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock);
int scoutfs_item_lookup_smaller_zero(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len, struct scoutfs_lock *lock);
int scoutfs_item_lookup_exact(struct super_block *sb, struct scoutfs_key *key,
void *val, int val_len,
struct scoutfs_lock *lock);

View File

@@ -36,8 +36,6 @@
#include "item.h"
#include "omap.h"
#include "util.h"
#include "totl.h"
#include "quota.h"
/*
* scoutfs uses a lock service to manage item cache consistency between
@@ -187,9 +185,6 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
return ret;
}
if (lock->start.sk_zone == SCOUTFS_QUOTA_ZONE && !lock_mode_can_read(mode))
scoutfs_quota_invalidate(sb);
/* have to invalidate if we're not in the only usable case */
if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
retry:
@@ -1249,29 +1244,10 @@ int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode,
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_totl_set_range(&start, &end);
return lock_key_range(sb, mode, flags, &start, &end, lock);
}
int scoutfs_lock_xattr_indx(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_xattr_indx_get_range(&start, &end);
return lock_key_range(sb, mode, flags, &start, &end, lock);
}
int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;
scoutfs_quota_get_lock_range(&start, &end);
scoutfs_key_set_zeros(&start);
start.sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
return lock_key_range(sb, mode, flags, &start, &end, lock);
}

View File

@@ -86,10 +86,6 @@ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int
u64 ino, struct scoutfs_lock **lock);
int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_xattr_indx(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);

File diff suppressed because it is too large

View File

@@ -1,48 +0,0 @@
#ifndef _SCOUTFS_QUOTA_H_
#define _SCOUTFS_QUOTA_H_
#include "ioctl.h"
/*
* Each rule's name can be in the ruleset's rbtree associated with the
* source attr that it selects.  This lets checks test only the rules
* that the inputs could match.  The 'i' field indicates which name is
* in the tree so we can find the containing rule.
*
* This is mostly private to quota.c but we expose it for tracing.
*/
struct squota_rule {
u64 limit;
u8 prio;
u8 op;
u8 rule_flags;
struct squota_rule_name {
struct rb_node node;
u64 val;
u8 source;
u8 flags;
u8 i;
} names[3];
};
/* private to quota.c, only here for tracing */
struct squota_input {
u64 attrs[SQ_NS__NR_SELECT];
u8 op;
};
int scoutfs_quota_check_inode(struct super_block *sb, struct inode *dir);
int scoutfs_quota_check_data(struct super_block *sb, struct inode *inode);
int scoutfs_quota_get_rules(struct super_block *sb, u64 *iterator,
struct scoutfs_ioctl_quota_rule *irules, int nr);
int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
struct scoutfs_ioctl_quota_rule *irule);
void scoutfs_quota_get_lock_range(struct scoutfs_key *start, struct scoutfs_key *end);
void scoutfs_quota_invalidate(struct super_block *sb);
int scoutfs_quota_setup(struct super_block *sb);
void scoutfs_quota_destroy(struct super_block *sb);
#endif

View File

@@ -37,10 +37,6 @@
#include "net.h"
#include "data.h"
#include "ext.h"
#include "quota.h"
#include "trace/quota.h"
#include "trace/wkic.h"
struct lock_info;
@@ -2399,44 +2395,6 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
__entry->block_blkno, __entry->block_seq)
);
TRACE_EVENT(scoutfs_block_stale,
TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
struct scoutfs_block_header *hdr, u32 magic, u32 crc),
TP_ARGS(sb, ref, hdr, magic, crc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ref_blkno)
__field(__u64, ref_seq)
__field(__u32, hdr_crc)
__field(__u32, hdr_magic)
__field(__u64, hdr_fsid)
__field(__u64, hdr_seq)
__field(__u64, hdr_blkno)
__field(__u32, magic)
__field(__u32, crc)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ref_blkno = le64_to_cpu(ref->blkno);
__entry->ref_seq = le64_to_cpu(ref->seq);
__entry->hdr_crc = le32_to_cpu(hdr->crc);
__entry->hdr_magic = le32_to_cpu(hdr->magic);
__entry->hdr_fsid = le64_to_cpu(hdr->fsid);
__entry->hdr_seq = le64_to_cpu(hdr->seq);
__entry->hdr_blkno = le64_to_cpu(hdr->blkno);
__entry->magic = magic;
__entry->crc = crc;
),
TP_printk(SCSBF" ref_blkno %llu ref_seq %016llx hdr_crc %08x hdr_magic %08x hdr_fsid %016llx hdr_seq %016llx hdr_blkno %llu magic %08x crc %08x",
SCSB_TRACE_ARGS, __entry->ref_blkno, __entry->ref_seq, __entry->hdr_crc,
__entry->hdr_magic, __entry->hdr_fsid, __entry->hdr_seq, __entry->hdr_blkno,
__entry->magic, __entry->crc)
);
DECLARE_EVENT_CLASS(scoutfs_block_class,
TP_PROTO(struct super_block *sb, void *bp, u64 blkno, int refcount, int io_count,
unsigned long bits, __u64 accessed),

View File

@@ -49,8 +49,6 @@
#include "volopt.h"
#include "fence.h"
#include "xattr.h"
#include "wkic.h"
#include "quota.h"
#include "scoutfs_trace.h"
static struct dentry *scoutfs_debugfs_root;
@@ -196,9 +194,7 @@ static void scoutfs_put_super(struct super_block *sb)
scoutfs_shutdown_trans(sb);
scoutfs_volopt_destroy(sb);
scoutfs_client_destroy(sb);
scoutfs_quota_destroy(sb);
scoutfs_inode_destroy(sb);
scoutfs_wkic_destroy(sb);
scoutfs_item_destroy(sb);
scoutfs_forest_destroy(sb);
scoutfs_data_destroy(sb);
@@ -548,9 +544,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_block_setup(sb) ?:
scoutfs_forest_setup(sb) ?:
scoutfs_item_setup(sb) ?:
scoutfs_wkic_setup(sb) ?:
scoutfs_inode_setup(sb) ?:
scoutfs_quota_setup(sb) ?:
scoutfs_data_setup(sb) ?:
scoutfs_setup_trans(sb) ?:
scoutfs_omap_setup(sb) ?:

View File

@@ -30,8 +30,6 @@ struct recov_info;
struct omap_info;
struct volopt_info;
struct fence_info;
struct wkic_info;
struct squota_info;
struct scoutfs_sb_info {
struct super_block *sb;
@@ -57,8 +55,6 @@ struct scoutfs_sb_info {
struct omap_info *omap_info;
struct volopt_info *volopt_info;
struct item_cache_info *item_cache_info;
struct wkic_info *wkic_info;
struct squota_info *squota_info;
struct fence_info *fence_info;
/* tracks tasks waiting for data extents */
@@ -160,17 +156,4 @@ int scoutfs_write_super(struct super_block *sb,
/* to keep this out of the ioctl.h public interface definition */
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/*
* Returns 0 when supported, non-zero -errno when unsupported.
*/
static inline int scoutfs_fmt_vers_unsupported(struct super_block *sb, u64 vers)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
if (sbi && (sbi->fmt_vers < vers))
return -EOPNOTSUPP;
else
return 0;
}
#endif

View File

@@ -1,90 +0,0 @@
/*
* Copyright (C) 2023 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/string.h>
#include "format.h"
#include "forest.h"
#include "totl.h"
void scoutfs_totl_set_range(struct scoutfs_key *start, struct scoutfs_key *end)
{
scoutfs_key_set_zeros(start);
start->sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
scoutfs_key_set_ones(end);
end->sk_zone = SCOUTFS_XATTR_TOTL_ZONE;
}
void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg)
{
memset(merg, 0, sizeof(struct scoutfs_totl_merging));
}
void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
u64 seq, u8 flags, void *val, int val_len, int fic)
{
struct scoutfs_xattr_totl_val *tval = val;
if (fic & FIC_FS_ROOT) {
merg->fs_seq = seq;
merg->fs_total = le64_to_cpu(tval->total);
merg->fs_count = le64_to_cpu(tval->count);
} else if (fic & FIC_FINALIZED) {
merg->fin_seq = seq;
merg->fin_total += le64_to_cpu(tval->total);
merg->fin_count += le64_to_cpu(tval->count);
} else {
merg->log_seq = seq;
merg->log_total += le64_to_cpu(tval->total);
merg->log_count += le64_to_cpu(tval->count);
}
}
/*
* .totl. item merging has to be careful because the log btree merging
* code can write partial results to the fs_root. This means that a
* reader can see both cases where new finalized logs should be applied
* to the old fs items and where old finalized logs have already been
* applied to the partially merged fs items. Currently active logged
* items are always applied on top of all cases.
*
* These cases are differentiated with a combination of sequence numbers
* in items, the count of contributing xattrs, and a flag
* differentiating finalized and active logged items. This lets us
* recognize all cases, including when finalized logs were merged and
* deleted the fs item.
*/
void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count)
{
*total = 0;
*count = 0;
/* start with the fs item if we have it */
if (merg->fs_seq != 0) {
*total = merg->fs_total;
*count = merg->fs_count;
}
/* apply finalized logs if they're newer or creating */
if (((merg->fs_seq != 0) && (merg->fin_seq > merg->fs_seq)) ||
((merg->fs_seq == 0) && (merg->fin_count > 0))) {
*total += merg->fin_total;
*count += merg->fin_count;
}
/* always apply active logs which must be newer than fs and finalized */
if (merg->log_seq > 0) {
*total += merg->log_total;
*count += merg->log_count;
}
}
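A hedged sketch of the intended calling pattern (not from the tree): the sequence numbers are invented, the FIC_ classification flags come from the calls above, and the comment just restates the resolve rules:

static void totl_merge_example(__u64 *total, __u64 *count)
{
	struct scoutfs_xattr_totl_val fs = {
		.total = cpu_to_le64(100), .count = cpu_to_le64(10),
	};
	struct scoutfs_xattr_totl_val fin = {
		.total = cpu_to_le64(7), .count = cpu_to_le64(1),
	};
	struct scoutfs_xattr_totl_val log = {
		.total = cpu_to_le64(3), .count = cpu_to_le64(0),
	};
	struct scoutfs_totl_merging merg;

	scoutfs_totl_merge_init(&merg);
	scoutfs_totl_merge_contribute(&merg, 5, 0, &fs, sizeof(fs), FIC_FS_ROOT);
	scoutfs_totl_merge_contribute(&merg, 6, 0, &fin, sizeof(fin), FIC_FINALIZED);
	scoutfs_totl_merge_contribute(&merg, 7, 0, &log, sizeof(log), 0);

	/* fin_seq 6 > fs_seq 5 so the finalized delta applies, then the
	 * active log always applies: total = 100 + 7 + 3 = 110 and
	 * count = 10 + 1 + 0 = 11 */
	scoutfs_totl_merge_resolve(&merg, total, count);
}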

View File

@@ -1,24 +0,0 @@
#ifndef _SCOUTFS_TOTL_H_
#define _SCOUTFS_TOTL_H_
#include "key.h"
struct scoutfs_totl_merging {
u64 fs_seq;
u64 fs_total;
u64 fs_count;
u64 fin_seq;
u64 fin_total;
s64 fin_count;
u64 log_seq;
u64 log_total;
s64 log_count;
};
void scoutfs_totl_set_range(struct scoutfs_key *start, struct scoutfs_key *end);
void scoutfs_totl_merge_init(struct scoutfs_totl_merging *merg);
void scoutfs_totl_merge_contribute(struct scoutfs_totl_merging *merg,
u64 seq, u8 flags, void *val, int val_len, int fic);
void scoutfs_totl_merge_resolve(struct scoutfs_totl_merging *merg, __u64 *total, __u64 *count);
#endif

View File

@@ -1,143 +0,0 @@
/*
* Tracing squota_input
*/
#define SQI_FMT "[%u %llu %llu %llu]"
#define SQI_ARGS(i) \
(i)->op, (i)->attrs[0], (i)->attrs[1], (i)->attrs[2]
#define SQI_FIELDS(pref) \
__array(__u64, pref##_attrs, SQ_NS__NR_SELECT) \
__field(__u8, pref##_op)
#define SQI_ASSIGN(pref, i) \
__entry->pref##_attrs[0] = (i)->attrs[0]; \
__entry->pref##_attrs[1] = (i)->attrs[1]; \
__entry->pref##_attrs[2] = (i)->attrs[2]; \
__entry->pref##_op = (i)->op;
#define SQI_ENTRY_ARGS(pref) \
__entry->pref##_op, __entry->pref##_attrs[0], \
__entry->pref##_attrs[1], __entry->pref##_attrs[2]
/*
* Tracing squota_rule
*/
#define SQR_FMT "[%u %llu,%u,%x %llu,%u,%x %llu,%u,%x %u %llu]"
#define SQR_ARGS(r) \
(r)->prio, \
(r)->name_val[0], (r)->name_source[0], (r)->name_flags[0], \
(r)->name_val[1], (r)->name_source[1], (r)->name_flags[1], \
(r)->name_val[2], (r)->name_source[2], (r)->name_flags[2], \
(r)->op, (r)->limit \
#define SQR_FIELDS(pref) \
__array(__u64, pref##_name_val, 3) \
__field(__u64, pref##_limit) \
__array(__u8, pref##_name_source, 3) \
__array(__u8, pref##_name_flags, 3) \
__field(__u8, pref##_prio) \
__field(__u8, pref##_op)
#define SQR_ASSIGN(pref, r) \
__entry->pref##_name_val[0] = (r)->names[0].val; \
__entry->pref##_name_val[1] = (r)->names[1].val; \
__entry->pref##_name_val[2] = (r)->names[2].val; \
__entry->pref##_limit = (r)->limit; \
__entry->pref##_name_source[0] = (r)->names[0].source; \
__entry->pref##_name_source[1] = (r)->names[1].source; \
__entry->pref##_name_source[2] = (r)->names[2].source; \
__entry->pref##_name_flags[0] = (r)->names[0].flags; \
__entry->pref##_name_flags[1] = (r)->names[1].flags; \
__entry->pref##_name_flags[2] = (r)->names[2].flags; \
__entry->pref##_prio = (r)->prio; \
__entry->pref##_op = (r)->op;
#define SQR_ENTRY_ARGS(pref) \
__entry->pref##_prio, __entry->pref##_name_val[0], \
__entry->pref##_name_source[0], __entry->pref##_name_flags[0], \
__entry->pref##_name_val[1], __entry->pref##_name_source[1], \
__entry->pref##_name_flags[1], __entry->pref##_name_val[2], \
__entry->pref##_name_source[2], __entry->pref##_name_flags[2], \
__entry->pref##_op, __entry->pref##_limit
TRACE_EVENT(scoutfs_quota_check,
TP_PROTO(struct super_block *sb, long rs_ptr, struct squota_input *inp, int ret),
TP_ARGS(sb, rs_ptr, inp, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(long, rs_ptr)
SQI_FIELDS(i)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->rs_ptr = rs_ptr;
SQI_ASSIGN(i, inp);
__entry->ret = ret;
),
TP_printk(SCSBF" rs_ptr %ld ret %d inp "SQI_FMT,
SCSB_TRACE_ARGS, __entry->rs_ptr, __entry->ret, SQI_ENTRY_ARGS(i))
);
DECLARE_EVENT_CLASS(scoutfs_quota_rule_op_class,
TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
TP_ARGS(sb, rule, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
SQR_FIELDS(r)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
SQR_ASSIGN(r, rule);
__entry->ret = ret;
),
TP_printk(SCSBF" "SQR_FMT" ret %d",
SCSB_TRACE_ARGS, SQR_ENTRY_ARGS(r), __entry->ret)
);
DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_add_rule,
TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
TP_ARGS(sb, rule, ret)
);
DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_del_rule,
TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
TP_ARGS(sb, rule, ret)
);
TRACE_EVENT(scoutfs_quota_totl_check,
TP_PROTO(struct super_block *sb, struct squota_input *inp, struct scoutfs_key *key,
u64 limit, int ret),
TP_ARGS(sb, inp, key, limit, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
SQI_FIELDS(i)
sk_trace_define(k)
__field(__u64, limit)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
SQI_ASSIGN(i, inp);
sk_trace_assign(k, key);
__entry->limit = limit;
__entry->ret = ret;
),
TP_printk(SCSBF" inp "SQI_FMT" key "SK_FMT" limit %llu ret %d",
SCSB_TRACE_ARGS, SQI_ENTRY_ARGS(i), sk_trace_args(k), __entry->limit,
__entry->ret)
);

View File

@@ -1,112 +0,0 @@
DECLARE_EVENT_CLASS(scoutfs_wkic_wpage_class,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, ptr)
__field(int, which)
__field(bool, n0l)
__field(bool, n1l)
sk_trace_define(start)
sk_trace_define(end)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ptr = ptr;
__entry->which = which;
__entry->n0l = n0l;
__entry->n1l = n1l;
sk_trace_assign(start, start);
sk_trace_assign(end, end);
),
TP_printk(SCSBF" ptr %p wh %d nl %u,%u start "SK_FMT " end "SK_FMT, SCSB_TRACE_ARGS,
__entry->ptr, __entry->which, __entry->n0l, __entry->n1l,
sk_trace_args(start), sk_trace_args(end))
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_alloced,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_freeing,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_found,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_trimmed,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_erased,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_inserting,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_inserted,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_shrinking,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_dropping,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_replaying,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
DEFINE_EVENT(scoutfs_wkic_wpage_class, scoutfs_wkic_wpage_filled,
TP_PROTO(struct super_block *sb, void *ptr, int which, bool n0l, bool n1l,
struct scoutfs_key *start, struct scoutfs_key *end),
TP_ARGS(sb, ptr, which, n0l, n1l, start, end)
);
TRACE_EVENT(scoutfs_wkic_read_items,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_key *start,
struct scoutfs_key *end),
TP_ARGS(sb, key, start, end),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(key)
sk_trace_define(start)
sk_trace_define(end)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(key, key);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
),
TP_printk(SCSBF" key "SK_FMT" start "SK_FMT " end "SK_FMT, SCSB_TRACE_ARGS,
sk_trace_args(key), sk_trace_args(start), sk_trace_args(end))
);

File diff suppressed because it is too large

View File

@@ -1,19 +0,0 @@
#ifndef _SCOUTFS_WKIC_H_
#define _SCOUTFS_WKIC_H_
#include "format.h"
typedef int (*wkic_iter_cb_t)(struct scoutfs_key *key, void *val, unsigned int val_len,
void *cb_arg);
int scoutfs_wkic_iterate(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_key *last,
struct scoutfs_key *range_start, struct scoutfs_key *range_end,
wkic_iter_cb_t cb, void *cb_arg);
int scoutfs_wkic_iterate_stable(struct super_block *sb, struct scoutfs_key *key,
struct scoutfs_key *last, struct scoutfs_key *range_start,
struct scoutfs_key *range_end, wkic_iter_cb_t cb, void *cb_arg);
int scoutfs_wkic_setup(struct super_block *sb);
void scoutfs_wkic_destroy(struct super_block *sb);
#endif

View File

@@ -81,20 +81,7 @@ static void init_xattr_key(struct scoutfs_key *key, u64 ino, u32 name_hash,
#define SCOUTFS_XATTR_PREFIX "scoutfs."
#define SCOUTFS_XATTR_PREFIX_LEN (sizeof(SCOUTFS_XATTR_PREFIX) - 1)
/*
* We could have hidden the logic that needs this in a user-prefix
* specific .set handler, but I wanted to make sure that we always
* applied that logic from any call chain to _xattr_set.  The
* additional strncmp isn't so expensive given all the rest of the work
* we're doing in here.
*/
static inline bool is_user(const char *name)
{
return !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
#define HIDE_TAG "hide."
#define INDX_TAG "indx."
#define SRCH_TAG "srch."
#define TOTL_TAG "totl."
#define TAG_LEN (sizeof(HIDE_TAG) - 1)
@@ -116,9 +103,6 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
if (!strncmp(name, HIDE_TAG, TAG_LEN)) {
if (++tgs->hide == 0)
return -EINVAL;
} else if (!strncmp(name, INDX_TAG, TAG_LEN)) {
if (++tgs->indx == 0)
return -EINVAL;
} else if (!strncmp(name, SRCH_TAG, TAG_LEN)) {
if (++tgs->srch == 0)
return -EINVAL;
@@ -556,57 +540,47 @@ static int parse_totl_u64(const char *s, int len, u64 *res)
}
/*
* non-destructive relatively quick parse of the final dotted u64s in
* an xattr name.  If the required number of values is found then we
* return the number of bytes in the name that precede the final dotted
* u64s and their dots.  -EINVAL is returned if we didn't find the
* required number of values.
* non-destructive relatively quick parse of the last 3 dotted u64s that
* make up the name of the xattr total (e.g. the trailing "1.2.3" of
* "scoutfs.totl.1.2.3").  -EINVAL is returned if there is anything but
* 3 valid u64 encodings between single dots at the end of the name.
*/
static int parse_dotted_u64s(u64 *u64s, int nr, const char *name, int name_len)
static int parse_totl_key(struct scoutfs_key *key, const char *name, int name_len)
{
u64 tot_name[3];
int end = name_len;
int nr = 0;
int len;
int ret;
int i;
int u;
/* parse name elements in reverse order from the end of the xattr name string */
for (u = nr - 1, i = name_len - 1; u >= 0 && i >= 0; i--) {
for (i = name_len - 1; i >= 0 && nr < ARRAY_SIZE(tot_name); i--) {
if (name[i] != '.')
continue;
len = end - (i + 1);
ret = parse_totl_u64(&name[i + 1], len, &u64s[u]);
ret = parse_totl_u64(&name[i + 1], len, &tot_name[nr]);
if (ret < 0)
goto out;
end = i;
u--;
nr++;
}
if (u == -1)
ret = end;
else
if (nr == ARRAY_SIZE(tot_name)) {
/* swap to account for parsing in reverse */
swap(tot_name[0], tot_name[2]);
scoutfs_xattr_init_totl_key(key, tot_name);
ret = 0;
} else {
ret = -EINVAL;
}
out:
return ret;
}
static int parse_totl_key(struct scoutfs_key *key, const char *name, int name_len)
{
u64 u64s[3];
int ret;
ret = parse_dotted_u64s(u64s, ARRAY_SIZE(u64s), name, name_len);
if (ret >= 0) {
scoutfs_xattr_init_totl_key(key, u64s);
ret = 0;
}
return ret;
}
static int apply_totl_delta(struct super_block *sb, struct scoutfs_key *key,
struct scoutfs_xattr_totl_val *tval, struct scoutfs_lock *lock)
{
@@ -633,72 +607,6 @@ int scoutfs_xattr_combine_totl(void *dst, int dst_len, void *src, int src_len)
return SCOUTFS_DELTA_COMBINED;
}
void scoutfs_xattr_indx_get_range(struct scoutfs_key *start, struct scoutfs_key *end)
{
scoutfs_key_set_zeros(start);
start->sk_zone = SCOUTFS_XATTR_INDX_ZONE;
scoutfs_key_set_ones(end);
end->sk_zone = SCOUTFS_XATTR_INDX_ZONE;
}
/*
* .indx. keys are a bit funny because we're iterating over index keys
* by major:minor:inode:xattr_id. That doesn't map nicely to the
* comparison precedence of the key fields. We have to mess around a
* little bit to get the major into the most significant key bits and
* the low bits of xattr id into the least significant key bits.
*/
void scoutfs_xattr_init_indx_key(struct scoutfs_key *key, u8 major, u64 minor, u64 ino, u64 xid)
{
scoutfs_key_set_zeros(key);
key->sk_zone = SCOUTFS_XATTR_INDX_ZONE;
key->_sk_first = cpu_to_le64(((u64)major << 56) | (minor >> 8));
key->_sk_second = cpu_to_le64((minor << 56) | (ino >> 8));
key->_sk_third = cpu_to_le64((ino << 56) | (xid >> 8));
key->_sk_fourth = xid & 0xff;
}
void scoutfs_xattr_get_indx_key(struct scoutfs_key *key, u8 *major, u64 *minor, u64 *ino, u64 *xid)
{
*major = le64_to_cpu(key->_sk_first) >> 56;
*minor = (le64_to_cpu(key->_sk_first) << 8) | (le64_to_cpu(key->_sk_second) >> 56);
*ino = (le64_to_cpu(key->_sk_second) << 8) | (le64_to_cpu(key->_sk_third) >> 56);
*xid = (le64_to_cpu(key->_sk_third) << 8) | key->_sk_fourth;
}
void scoutfs_xattr_set_indx_key_xid(struct scoutfs_key *key, u64 xid)
{
u8 major;
u64 minor;
u64 ino;
u64 dummy;
scoutfs_xattr_get_indx_key(key, &major, &minor, &ino, &dummy);
scoutfs_xattr_init_indx_key(key, major, minor, ino, xid);
}
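As a hedged sanity check of the packing (not in the tree), the two helpers above are exact inverses, which a quick round trip demonstrates:

static void indx_key_roundtrip_check(void)
{
	struct scoutfs_key key;
	u64 minor;
	u64 ino;
	u64 xid;
	u8 major;

	scoutfs_xattr_init_indx_key(&key, 0xab, 0x0123456789abcdefULL,
				    0xfedcba9876543210ULL, 42);
	scoutfs_xattr_get_indx_key(&key, &major, &minor, &ino, &xid);

	/* every field shifts back out of the four key words intact */
	WARN_ON(major != 0xab || minor != 0x0123456789abcdefULL ||
		ino != 0xfedcba9876543210ULL || xid != 42);
}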
/*
* This initial parsing of the name doesn't yet have access to an xattr
* id to put in the key. That's added later as the existing xattr is
* found or a new xattr's id is allocated.
*/
static int parse_indx_key(struct scoutfs_key *key, const char *name, int name_len, u64 ino)
{
u64 u64s[2];
int ret;
ret = parse_dotted_u64s(u64s, ARRAY_SIZE(u64s), name, name_len);
if (ret < 0)
return ret;
if (u64s[0] > U8_MAX)
return -EINVAL;
scoutfs_xattr_init_indx_key(key, u64s[0], u64s[1], ino, 0);
return 0;
}
/*
* The confusing swiss army knife of creating, modifying, and deleting
* xattrs.
@@ -719,7 +627,7 @@ static int parse_indx_key(struct scoutfs_key *key, const char *name, int name_le
int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,
const void *value, size_t size, int flags,
const struct scoutfs_xattr_prefix_tags *tgs,
struct scoutfs_lock *lck, struct scoutfs_lock *tag_lock,
struct scoutfs_lock *lck, struct scoutfs_lock *totl_lock,
struct list_head *ind_locks)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
@@ -727,11 +635,10 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
const u64 ino = scoutfs_ino(inode);
struct scoutfs_xattr_totl_val tval = {0,};
struct scoutfs_xattr *xat = NULL;
struct scoutfs_key tag_key;
struct scoutfs_key totl_key;
struct scoutfs_key key;
bool undo_srch = false;
bool undo_totl = false;
bool undo_indx = false;
u8 found_parts;
unsigned int xat_bytes_totl;
unsigned int xat_bytes;
@@ -744,8 +651,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))
if (WARN_ON_ONCE(tgs->totl && !totl_lock))
return -EINVAL;
/* mirror the syscall's errors for large names and values */
@@ -758,22 +664,10 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
(flags & ~(XATTR_CREATE | XATTR_REPLACE)))
return -EINVAL;
if ((tgs->hide | tgs->indx | tgs->srch | tgs->totl) && !capable(CAP_SYS_ADMIN))
if ((tgs->hide | tgs->srch | tgs->totl) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (tgs->totl && ((ret = parse_totl_key(&tag_key, name, name_len)) != 0))
return ret;
if (tgs->indx &&
(ret = scoutfs_fmt_vers_unsupported(sb, SCOUTFS_FORMAT_VERSION_FEAT_INDX_TAG)))
return ret;
if (tgs->indx && ((ret = parse_indx_key(&tag_key, name, name_len, ino)) != 0))
return ret;
/* retention blocks user. xattr modification, all else allowed */
ret = scoutfs_inode_check_retention(inode);
if (ret < 0 && is_user(name))
if (tgs->totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0))
return ret;
/* allocate enough to always read an existing xattr's totl */
@@ -814,12 +708,6 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
/* found fields in key will also be used */
found_parts = ret >= 0 ? xattr_nr_parts(xat) : 0;
/* use existing xattr's id or allocate new when creating */
if (found_parts)
id = le64_to_cpu(key.skx_id);
else if (value)
id = si->next_xattr_id++;
if (found_parts && tgs->totl) {
/* parse old totl value before we clobber xat buf */
val_len = ret - offsetof(struct scoutfs_xattr, name[xat->name_len]);
@@ -830,25 +718,12 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
le64_add_cpu(&tval.total, -total);
}
/*
* indx xattrs don't have a value.  After returning an error for a
* non-zero value length, and short-circuiting a modification that
* stores the same zero length, all we're left with is creating or
* deleting the xattr.
*/
if (tgs->indx) {
if (size != 0) {
ret = -EINVAL;
goto out;
}
if (found_parts && value) {
ret = 0;
goto out;
}
}
/* prepare the xattr header, name, and start of value in first item */
if (value) {
if (found_parts)
id = le64_to_cpu(key.skx_id);
else
id = si->next_xattr_id++;
xat->name_len = name_len;
xat->val_len = cpu_to_le16(size);
memset(xat->__pad, 0, sizeof(xat->__pad));
@@ -866,18 +741,9 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
le64_add_cpu(&tval.total, total);
}
if (tgs->indx) {
scoutfs_xattr_set_indx_key_xid(&tag_key, id);
if (value)
ret = scoutfs_item_create_force(sb, &tag_key, NULL, 0, tag_lock, NULL);
else
ret = scoutfs_item_delete_force(sb, &tag_key, tag_lock, NULL);
if (ret < 0)
goto out;
undo_indx = true;
}
if (tgs->srch && !(found_parts && value)) {
if (found_parts)
id = le64_to_cpu(key.skx_id);
hash = scoutfs_hash64(name, name_len);
ret = scoutfs_forest_srch_add(sb, hash, ino, id);
if (ret < 0)
@@ -886,7 +752,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
}
if (tgs->totl) {
ret = apply_totl_delta(sb, &tag_key, &tval, tag_lock);
ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
if (ret < 0)
goto out;
undo_totl = true;
@@ -911,13 +777,6 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
ret = 0;
out:
if (ret < 0 && undo_indx) {
if (value)
err = scoutfs_item_delete_force(sb, &tag_key, tag_lock, NULL);
else
err = scoutfs_item_create_force(sb, &tag_key, NULL, 0, tag_lock, NULL);
BUG_ON(err); /* inconsistent */
}
if (ret < 0 && undo_srch) {
err = scoutfs_forest_srch_add(sb, hash, ino, id);
BUG_ON(err);
@@ -926,7 +785,7 @@ out:
/* _delta() on dirty items shouldn't fail */
tval.total = cpu_to_le64(-le64_to_cpu(tval.total));
tval.count = cpu_to_le64(-le64_to_cpu(tval.count));
err = apply_totl_delta(sb, &tag_key, &tval, tag_lock);
err = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
BUG_ON(err);
}
@@ -942,7 +801,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, const void
struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_xattr_prefix_tags tgs;
struct scoutfs_lock *tag_lock = NULL;
struct scoutfs_lock *totl_lock = NULL;
struct scoutfs_lock *lck = NULL;
size_t name_len = strlen(name);
LIST_HEAD(ind_locks);
@@ -957,11 +816,8 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, const void
if (ret)
goto unlock;
if (tgs.totl || tgs.indx) {
if (tgs.totl)
ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &tag_lock);
else
ret = scoutfs_lock_xattr_indx(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &tag_lock);
if (tgs.totl) {
ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &totl_lock);
if (ret)
goto unlock;
}
@@ -980,7 +836,7 @@ retry:
goto release;
ret = scoutfs_xattr_set_locked(dentry->d_inode, name, name_len, value, size, flags, &tgs,
lck, tag_lock, &ind_locks);
lck, totl_lock, &ind_locks);
if (ret == 0)
scoutfs_update_inode_item(inode, lck, &ind_locks);
@@ -989,7 +845,7 @@ release:
scoutfs_inode_index_unlock(sb, &ind_locks);
unlock:
scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, tag_lock, SCOUTFS_LOCK_WRITE_ONLY);
scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
@@ -1199,15 +1055,14 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
{
struct scoutfs_xattr_prefix_tags tgs;
struct scoutfs_xattr *xat = NULL;
struct scoutfs_lock *tag_lock = NULL;
struct scoutfs_lock *totl_lock = NULL;
struct scoutfs_xattr_totl_val tval;
struct scoutfs_key tag_key;
struct scoutfs_key totl_key;
struct scoutfs_key last;
struct scoutfs_key key;
bool release = false;
unsigned int bytes;
unsigned int val_len;
u8 locked_zone = 0;
void *value;
u64 total;
u64 hash;
@@ -1253,32 +1108,16 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
goto out;
}
ret = parse_totl_key(&tag_key, xat->name, xat->name_len) ?:
ret = parse_totl_key(&totl_key, xat->name, xat->name_len) ?:
parse_totl_u64(value, val_len, &total);
if (ret < 0)
break;
}
if (tgs.indx) {
ret = parse_indx_key(&tag_key, xat->name, xat->name_len, ino);
if (ret < 0)
goto out;
}
if ((tgs.totl || tgs.indx) && locked_zone != tag_key.sk_zone) {
if (tag_lock) {
scoutfs_unlock(sb, tag_lock, SCOUTFS_LOCK_WRITE_ONLY);
tag_lock = NULL;
}
if (tgs.totl)
ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
&tag_lock);
else
ret = scoutfs_lock_xattr_indx(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
&tag_lock);
if (tgs.totl && totl_lock == NULL) {
ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &totl_lock);
if (ret < 0)
break;
locked_zone = tag_key.sk_zone;
}
ret = scoutfs_hold_trans(sb, false);
@@ -1301,13 +1140,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
if (tgs.totl) {
tval.total = cpu_to_le64(-total);
tval.count = cpu_to_le64(-1LL);
ret = apply_totl_delta(sb, &tag_key, &tval, tag_lock);
if (ret < 0)
break;
}
if (tgs.indx) {
ret = scoutfs_item_delete_force(sb, &tag_key, tag_lock, NULL);
ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock);
if (ret < 0)
break;
}
@@ -1320,7 +1153,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
if (release)
scoutfs_release_trans(sb);
scoutfs_unlock(sb, tag_lock, SCOUTFS_LOCK_WRITE_ONLY);
scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY);
kfree(xat);
out:
return ret;

View File

@@ -3,7 +3,6 @@
struct scoutfs_xattr_prefix_tags {
unsigned long hide:1,
indx:1,
srch:1,
totl:1;
};
@@ -31,9 +30,4 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
void scoutfs_xattr_init_totl_key(struct scoutfs_key *key, u64 *name);
int scoutfs_xattr_combine_totl(void *dst, int dst_len, void *src, int src_len);
void scoutfs_xattr_indx_get_range(struct scoutfs_key *start, struct scoutfs_key *end);
void scoutfs_xattr_init_indx_key(struct scoutfs_key *key, u8 major, u64 minor, u64 ino, u64 xid);
void scoutfs_xattr_get_indx_key(struct scoutfs_key *key, u8 *major, u64 *minor, u64 *ino, u64 *xid);
void scoutfs_xattr_set_indx_key_xid(struct scoutfs_key *key, u64 xid);
#endif

tests/.gitignore
View File

@@ -9,4 +9,3 @@ src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop
src/o_tmpfile_umask
src/o_tmpfile_linkat

View File

@@ -12,8 +12,7 @@ BIN := src/createmany \
src/find_xattrs \
src/create_xattr_loop \
src/fragmented_data_extents \
src/o_tmpfile_umask \
src/o_tmpfile_linkat
src/o_tmpfile_umask
DEPS := $(wildcard src/*.d)

View File

@@ -113,7 +113,6 @@ used during the test.
| T\_EX\_META\_DEV | scratch meta bdev | -f | /dev/vdd |
| T\_EX\_DATA\_DEV | scratch data bdev | -e | /dev/vdc |
| T\_M[0-9] | mount paths | mounted per run | /mnt/test.[0-9]/ |
| T\_MODULE | built kernel module | created per run | ../kmod/src/..ko |
| T\_NR\_MOUNTS | number of mounts | -n | 3 |
| T\_O[0-9] | mount options | created per run | -o server\_addr= |
| T\_QUORUM | quorum count | -q | 2 |

View File

@@ -147,10 +147,6 @@ t_filter_dmesg()
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
# format vers back/compat tries bad mounts
re="$re|scoutfs .* error.*outside of supported version.*"
re="$re|scoutfs .* error.*could not get .*super.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

View File

@@ -29,12 +29,13 @@ t_mount_rid()
}
#
# Output the "f.$fsid.r.$rid" identifier string for the given path
# in a mounted scoutfs volume.
# Output the "f.$fsid.r.$rid" identifier string for the given mount
# number, 0 is used by default if none is specified.
#
t_ident_from_mnt()
t_ident()
{
local mnt="$1"
local nr="${1:-0}"
local mnt="$(eval echo \$T_M$nr)"
local fsid
local rid
@@ -44,38 +45,6 @@ t_ident_from_mnt()
echo "f.${fsid:0:6}.r.${rid:0:6}"
}
#
# Output the "f.$fsid.r.$rid" identifier string for the given mount
# number, 0 is used by default if none is specified.
#
t_ident()
{
local nr="${1:-0}"
local mnt="$(eval echo \$T_M$nr)"
t_ident_from_mnt "$mnt"
}
#
# Output the sysfs path for the given "f.$fsid.r.$rid" ident string.
#
t_sysfs_path_from_ident()
{
local ident="$1"
echo "/sys/fs/scoutfs/$ident"
}
#
# Output the sysfs path for the given path in a mounted scoutfs volume.
#
t_sysfs_path_from_mnt()
{
local mnt="$1"
t_sysfs_path_from_ident $(t_ident_from_mnt $mnt)
}
#
# Output the mount's sysfs path, defaulting to mount 0 if none is
# specified.
@@ -84,7 +53,7 @@ t_sysfs_path()
{
local nr="$1"
t_sysfs_path_from_ident $(t_ident $nr)
echo "/sys/fs/scoutfs/$(t_ident $nr)"
}
#

View File

@@ -1,4 +0,0 @@
== ensuring utils and module for old versions
== unmounting test fs and removing test module
== testing combinations of old and new format versions
== restoring test module and mount

View File

@@ -1,24 +0,0 @@
== default new files don't have project
0
== set new project on files and dirs
8675309
8675309
== non-root can see id
8675309
== can use IDs around long width limits
2147483647
2147483648
4294967295
9223372036854775807
9223372036854775808
18446744073709551615
== created files and dirs inherit project id
8675309
8675309
== inheritance continues
8675309
== clearing project id stops inheritance
0
0
== o_tmpfile creations inherit dir
8675309

View File

@@ -1,40 +0,0 @@
== prepare dir with write perm for test ids
== test assumes starting with no rules, empty list
== add rule
7 13,L,- 15,L,- 17,L,- I 33 -
== list is empty again after delete
== can change limits without deleting
1 1,L,- 1,L,- 1,L,- I 100 -
1 1,L,- 1,L,- 1,L,- I 101 -
1 1,L,- 1,L,- 1,L,- I 99 -
== wipe and restore rules in bulk
7 15,L,- 0,L,- 0,L,- I 33 -
7 14,L,- 0,L,- 0,L,- I 33 -
7 13,L,- 0,L,- 0,L,- I 33 -
7 12,L,- 0,L,- 0,L,- I 33 -
7 11,L,- 0,L,- 0,L,- I 33 -
7 10,L,- 0,L,- 0,L,- I 33 -
7 15,L,- 0,L,- 0,L,- I 33 -
7 14,L,- 0,L,- 0,L,- I 33 -
7 13,L,- 0,L,- 0,L,- I 33 -
7 12,L,- 0,L,- 0,L,- I 33 -
7 11,L,- 0,L,- 0,L,- I 33 -
7 10,L,- 0,L,- 0,L,- I 33 -
== default rule prevents file creation
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
== decreasing totl allows file creation again
== attr selecting rules prevent creation
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
== multi attr selecting doesn't prevent partial
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
== op differentiates
== higher priority rule applies
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded
== data rules with total and count prevent write and fallocate
dd: error writing '/mnt/test/test/quota/dir/file': Disk quota exceeded
fallocate: fallocate failed: Disk quota exceeded
dd: error writing '/mnt/test/test/quota/dir/file': Disk quota exceeded
fallocate: fallocate failed: Disk quota exceeded
== added rules work after bulk restore
touch: cannot touch '/mnt/test/test/quota/dir/file': Disk quota exceeded

View File

@@ -1,28 +0,0 @@
== setting retention on dir fails
attr_x ioctl failed on '/mnt/test/test/retention-basic': Invalid argument (22)
scoutfs: set-attr-x failed: Invalid argument (22)
== set retention
== get-attr-x shows retention
1
== unpriv can't clear retention
attr_x ioctl failed on '/mnt/test/test/retention-basic/file-1': Operation not permitted (1)
scoutfs: set-attr-x failed: Operation not permitted (1)
== can set hidden scoutfs xattr in retention
== setting user. xattr fails in retention
setfattr: /mnt/test/test/retention-basic/file-1: Operation not permitted
== file deletion fails in retention
rm: cannot remove '/mnt/test/test/retention-basic/file-1': Operation not permitted
== file rename fails in retention
mv: cannot move '/mnt/test/test/retention-basic/file-1' to '/mnt/test/test/retention-basic/file-2': Operation not permitted
== file write fails in retention
date: write error: Operation not permitted
== file truncate fails in retention
truncate: failed to truncate '/mnt/test/test/retention-basic/file-1' at 0 bytes: Operation not permitted
== setattr fails in retention
touch: setting times of '/mnt/test/test/retention-basic/file-1': Operation not permitted
== clear retention
== file write
== file rename
== setattr
== xattr deletion
== cleanup

View File

@@ -73,7 +73,6 @@ $(basename $0) options:
-t | Enabled trace events that match the given glob argument.
| Multiple options enable multiple globbed events.
-T <nr> | Multiply the original trace buffer size by nr during the run.
-V <nr> | Set mkfs device format version.
-X | xfstests git repo. Used by tests/xfstests.sh.
-x | xfstests git branch to checkout and track.
-y | xfstests ./check additional args
@@ -177,11 +176,6 @@ while true; do
T_TRACE_MULT="$2"
shift
;;
-V)
test -n "$2" || die "-V must have a format version argument"
T_MKFS_FORMAT_VERSION="-V $2"
shift
;;
-X)
test -n "$2" || die "-X requires xfstests git repo dir argument"
T_XFSTESTS_REPO="$2"
@@ -350,7 +344,7 @@ if [ -n "$T_MKFS" ]; then
done
msg "making new filesystem with $T_QUORUM quorum members"
cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS $T_MKFS_FORMAT_VERSION \
cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS \
"$T_META_DEVICE" "$T_DATA_DEVICE"
fi
@@ -358,8 +352,7 @@ if [ -n "$T_INSMOD" ]; then
msg "removing and reinserting scoutfs module"
test -e /sys/module/scoutfs && cmd rmmod scoutfs
cmd modprobe libcrc32c
T_MODULE="$T_KMOD/src/scoutfs.ko"
cmd insmod "$T_MODULE"
cmd insmod "$T_KMOD/src/scoutfs.ko"
fi
if [ -n "$T_TRACE_MULT" ]; then

View File

@@ -12,16 +12,12 @@ data-prealloc.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
projects.sh
large-fragmented-free.sh
format-version-forward-back.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
retention-basic.sh
totl-xattr-tag.sh
quota.sh
lock-refleak.sh
lock-shrink-consistency.sh
lock-pr-cw-conflict.sh

View File

@@ -1,71 +0,0 @@
/*
* Copyright (C) 2023 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
#include <assert.h>
#include <limits.h>
static void linkat_tmpfile(char *dir, char *lpath)
{
char proc_self[PATH_MAX];
int ret;
int fd;
fd = open(dir, O_RDWR | O_TMPFILE, 0777);
if (fd < 0) {
perror("open(O_TMPFILE)");
exit(1);
}
snprintf(proc_self, sizeof(proc_self), "/proc/self/fd/%d", fd);
ret = linkat(AT_FDCWD, proc_self, AT_FDCWD, lpath, AT_SYMLINK_FOLLOW);
if (ret < 0) {
perror("linkat");
exit(1);
}
close(fd);
}
/*
* Use O_TMPFILE and linkat to create a new visible file, used to test
* the O_TMPFILE creation path by inspecting the created file.
*/
int main(int argc, char **argv)
{
char *lpath;
char *dir;
if (argc < 3) {
printf("%s <open_dir> <linkat_path>\n", argv[0]);
return 1;
}
dir = argv[1];
lpath = argv[2];
linkat_tmpfile(dir, lpath);
return 0;
}

View File

@@ -1,179 +0,0 @@
#
# Test our basic ability to work with different format versions.
#
# The current code being tested has a range of supported format
# versions. For each of the older supported format versions we have a
# git hash of the commit before the next greater version was introduced.
# We build versions of the scoutfs utility and kernel module for the
# last commit in tree that had a lesser supported version as its max
# supported version. We use those binaries to test forward and back
# compat as new and old code works with a persistent volume with a given
# format version.
#
mount_has_format_version()
{
local mnt="$1"
local vers="$2"
local sysfs_fmt_vers="$(t_sysfs_path_from_mnt $mnt)/format_version"
test "$(cat $sysfs_fmt_vers)" == "$vers"
}
SCR="/mnt/scoutfs.scratch"
MIN=$(modinfo $T_MODULE | awk '($1 == "scoutfs_format_version_min:"){print $2}')
MAX=$(modinfo $T_MODULE | awk '($1 == "scoutfs_format_version_max:"){print $2}')
echo "min: $MIN max: $MAX" > "$T_TMP.log"
test "$MIN" -gt 0 -a "$MAX" -gt 0 -a "$MIN" -le "$MAX" || \
t_fail "parsed bad versions, min: $MIN max: $MAX"
test "$MIN" == "$MAX" && \
t_skip "only one supported format version: $MIN"
# prepare dir and wipe any weird old partial state
builds="$T_RESULTS/format_version_builds"
mkdir -p "$builds"
echo "== ensuring utils and module for old versions"
declare -A commits
commits[1]=c3c4b080
for vers in $(seq $MIN $((MAX - 1))); do
dir="$builds/$vers"
platform=$(uname -rp)
buildmark="$dir/buildmark"
commit="${commits[$vers]}"
test -n "$commit" || \
t_fail "no commit for vers $vers"
# have our files for this version
test "$(cat $buildmark 2>&1)" == "$platform" && \
continue
# build as one big sequence of commands that can return failure
(
set -o pipefail
rm -rf $dir &&
mkdir -p $dir/building &&
cd "$T_TESTS/.." &&
git archive --format=tar "$commit" | tar -C "$dir/building" -xf - &&
cd - &&
find $dir &&
make -C "$dir/building" &&
mv $dir/building/utils/src/scoutfs $dir &&
mv $dir/building/kmod/src/scoutfs.ko $dir &&
rm -rf $dir/building &&
echo "$platform" > $buildmark &&
find $dir &&
cat $buildmark
) >> "$T_TMP.log" 2>&1 || t_fail "version $vers build failed"
done
echo "== unmounting test fs and removing test module"
t_quiet t_umount_all
t_quiet rmmod scoutfs
echo "== testing combinations of old and new format versions"
mkdir -p "$SCR"
for vers in $(seq $MIN $((MAX - 1))); do
old_scoutfs="$builds/$vers/scoutfs"
old_module="$builds/$vers/scoutfs.ko"
echo "mkfs $vers" >> "$T_TMP.log"
t_quiet $old_scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" \
|| t_fail "mkfs $vers failed"
echo "mount $vers with $vers" >> "$T_TMP.log"
t_quiet insmod $old_module
t_quiet mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_quiet mount_has_format_version "$SCR" "$vers"
echo "creating files in $vers" >> "$T_TMP.log"
t_quiet touch "$SCR/file-"{1,2,3}
stat "$SCR"/file-* > "$T_TMP.stat" || \
t_fail "stat in $vers failed"
echo "remounting $vers fs with $MAX" >> "$T_TMP.log"
t_quiet umount "$SCR"
rmmod scoutfs
insmod "$T_MODULE"
t_quiet mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_quiet mount_has_format_version "$SCR" "$vers"
echo "verifying stat in $vers with $MAX" >> "$T_TMP.log"
diff -u "$T_TMP.stat" <(stat "$SCR"/file-*)
echo "keep/update/del existing, create new in $vers" >> "$T_TMP.log"
t_quiet touch "$SCR/file-2"
t_quiet rm -f "$SCR/file-3"
t_quiet touch "$SCR/file-4"
stat "$SCR"/file-* > "$T_TMP.stat" || \
t_fail "stat in $vers failed"
echo "remounting $vers fs with $vers" >> "$T_TMP.log"
t_quiet umount "$SCR"
rmmod scoutfs
insmod "$old_module"
t_quiet mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_quiet mount_has_format_version "$SCR" "$vers"
echo "verifying stat in $vers with $vers" >> "$T_TMP.log"
diff -u "$T_TMP.stat" <(stat "$SCR"/file-*)
echo "changing format vers to $MAX" >> "$T_TMP.log"
t_quiet umount "$SCR"
rmmod scoutfs
t_quiet scoutfs change-format-version -F -V $MAX $T_EX_META_DEV "$T_EX_DATA_DEV"
echo "mount fs $MAX with old $vers should fail" >> "$T_TMP.log"
insmod "$old_module"
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR" >> "$T_TMP.log" 2>&1
if [ "$?" == "0" ]; then
umount "$SCR"
t_fail "old code ver $vers able to mount new ver $MAX"
fi
echo "remounting $MAX fs with $MAX" >> "$T_TMP.log"
rmmod scoutfs
insmod "$T_MODULE"
t_quiet mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_quiet mount_has_format_version "$SCR" "$MAX"
echo "verifying stat in $MAX with $MAX" >> "$T_TMP.log"
diff -u "$T_TMP.stat" <(stat "$SCR"/file-*)
echo "keep/update/del existing, create new in $MAX" >> "$T_TMP.log"
t_quiet touch "$SCR/file-2"
t_quiet rm -f "$SCR/file-4"
t_quiet touch "$SCR/file-5"
stat "$SCR"/file-* > "$T_TMP.stat" || \
t_fail "stat in $MAX failed"
echo "remounting $MAX fs with $MAX again" >> "$T_TMP.log"
t_quiet umount "$SCR"
t_quiet mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
"$T_EX_DATA_DEV" "$SCR"
t_quiet mount_has_format_version "$SCR" "$MAX"
echo "verifying stat in $MAX with $MAX again" >> "$T_TMP.log"
diff -u "$T_TMP.stat" <(stat "$SCR"/file-*)
echo "done with old vers $vers" >> "$T_TMP.log"
t_quiet umount "$SCR"
rmmod scoutfs
done
echo "== restoring test module and mount"
insmod "$T_MODULE"
t_mount_all
t_pass

View File

@@ -1,52 +0,0 @@
# notable id to recognize in output
ID=8675309
echo "== default new files don't have project"
touch "$T_D0/file"
scoutfs get-attr-x -p "$T_D0/file"
echo "== set new project on files and dirs"
mkdir "$T_D0/dir"
scoutfs set-attr-x -p $ID "$T_D0/file"
scoutfs set-attr-x -p $ID "$T_D0/dir"
scoutfs get-attr-x -p "$T_D0/file"
scoutfs get-attr-x -p "$T_D0/dir"
echo "== non-root can see id"
chmod 644 "$T_D0/file"
setpriv --ruid=12345 --euid=12345 scoutfs get-attr-x -p "$T_D0/file"
echo "== can use IDs around long width limits"
touch "$T_D0/ids"
for id in 0x7FFFFFFF 0x80000000 0xFFFFFFFF \
0x7FFFFFFFFFFFFFFF 0x8000000000000000 0xFFFFFFFFFFFFFFFF; do
scoutfs set-attr-x -p $id "$T_D0/ids"
scoutfs get-attr-x -p "$T_D0/ids"
done
echo "== created files and dirs inherit project id"
touch "$T_D0/dir/file"
mkdir "$T_D0/dir/sub"
scoutfs get-attr-x -p "$T_D0/dir/file"
scoutfs get-attr-x -p "$T_D0/dir/sub"
echo "== inheritance continues"
mkdir "$T_D0/dir/sub/more"
scoutfs get-attr-x -p "$T_D0/dir/sub/more"
# .. just inherits 0 :)
echo "== clearing project id stops inheritance"
scoutfs set-attr-x -p 0 "$T_D0/dir"
touch "$T_D0/dir/another-file"
mkdir "$T_D0/dir/another-sub"
scoutfs get-attr-x -p "$T_D0/dir/another-file"
scoutfs get-attr-x -p "$T_D0/dir/another-sub"
echo "== o_tmpfile creations inherit dir"
scoutfs set-attr-x -p $ID "$T_D0/dir"
o_tmpfile_linkat "$T_D0/dir" "$T_D0/dir/tmpfile"
scoutfs get-attr-x -p "$T_D0/dir/tmpfile"
t_pass

View File

@@ -1,150 +0,0 @@
TEST_UID=22222
TEST_GID=44444
# sys_setreuid() sets fs[ug]id to e[ug]id
SET_UID="--ruid=$TEST_UID --euid=$TEST_UID"
SET_GID="--rgid=$TEST_GID --egid=$TEST_GID --clear-groups"
FILE="$T_D0/dir/file"
sync_and_drop()
{
sync
echo 1 > $(t_debugfs_path)/drop_weak_item_cache
echo 1 > $(t_debugfs_path)/drop_quota_check_cache
}
reset_all()
{
rm -f "$FILE"
scoutfs quota-wipe -p "$T_M0"
getfattr --absolute-names -d -m - "$T_D0" | \
grep "^scoutfs.totl." | \
cut -d '=' -f 1 | \
xargs -n 1 -I'{}' setfattr -x '{}' "$T_D0"
}
echo "== prepare dir with write perm for test ids"
mkdir "$T_D0/dir"
chown --quiet $TEST_UID "$T_D0/dir"
chgrp --quiet $TEST_GID "$T_D0/dir"
echo "== test assumes starting with no rules, empty list"
scoutfs quota-list -p "$T_M0"
echo "== add rule"
scoutfs quota-add -p "$T_M0" -r "7 13,L,- 15,L,- 17,L,- I 33 -"
scoutfs quota-list -p "$T_M0"
echo "== list is empty again after delete"
scoutfs quota-del -p "$T_M0" -r "7 13,L,- 15,L,- 17,L,- I 33 -"
scoutfs quota-list -p "$T_M0"
echo "== can change limits without deleting"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 100 -"
scoutfs quota-list -p "$T_M0"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 101 -"
scoutfs quota-del -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 100 -"
scoutfs quota-list -p "$T_M0"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 99 -"
scoutfs quota-del -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 101 -"
scoutfs quota-list -p "$T_M0"
scoutfs quota-del -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 99 -"
reset_all
echo "== wipe and restore rules in bulk"
for a in $(seq 10 15); do
scoutfs quota-add -p "$T_M0" -r "7 $a,L,- 0,L,- 0,L,- I 33 -"
done
scoutfs quota-list -p "$T_M0"
scoutfs quota-list -p "$T_M0" > "$T_TMP.list"
scoutfs quota-wipe -p "$T_M0"
scoutfs quota-list -p "$T_M0"
scoutfs quota-restore -p "$T_M0" < "$T_TMP.list"
scoutfs quota-list -p "$T_M0"
reset_all
echo "== default rule prevents file creation"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.1.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
echo "== decreasing totl allows file creation again"
setfattr -x scoutfs.totl.test.1.1.1 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE"
reset_all
echo "== attr selecting rules prevent creation"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S 1,L,- 1,L,- I 1 -"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_GID,G,S 1,L,- 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.1.1 -v 2 "$T_D0"
setfattr -n scoutfs.totl.test.$TEST_GID.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
setpriv $SET_GID touch "$FILE" 2>&1 | t_filter_fs
reset_all
echo "== multi attr selecting doesn't prevent partial"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S $TEST_GID,G,S 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.$TEST_GID.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE"
rm -f "$FILE"
setpriv $SET_GID touch "$FILE"
rm -f "$FILE"
setpriv $SET_UID $SET_GID touch "$FILE" 2>&1 | t_filter_fs
reset_all
echo "== op differentiates"
# inode ops succeed in presence of data rule
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S 1,L,- 1,L,- D 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
reset_all
# data ops succeed in presence of inode rule
touch "$FILE"
chown --quiet $TEST_UID "$FILE"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S 1,L,- 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID fallocate -l 4096 "$FILE" 2>&1 | t_filter_fs
reset_all
echo "== higher priority rule applies"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S 1,L,- 1,L,- I 1000 -"
scoutfs quota-add -p "$T_M0" -r "2 $TEST_UID,U,S 1,L,- 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
reset_all
echo "== data rules with total and count prevent write and fallocate"
touch "$FILE"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- D 1 -"
setfattr -n scoutfs.totl.test.1.1.1 -v 2 "$T_D0"
sync_and_drop
dd if=/dev/zero of="$FILE" bs=4096 count=1 conv=notrunc status=none 2>&1 | t_filter_fs
fallocate -l 4096 "$FILE" 2>&1 | t_filter_fs
scoutfs quota-del -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- D 1 -"
scoutfs quota-add -p "$T_M0" -r "1 1,L,- 1,L,- 1,L,- D 0 C"
sync_and_drop
dd if=/dev/zero of="$FILE" bs=4096 count=1 conv=notrunc status=none 2>&1 | t_filter_fs
fallocate -l 4096 "$FILE" 2>&1 | t_filter_fs
reset_all
echo "== added rules work after bulk restore"
seq -f " 1 %.0f,U,S 1,L,- 1,L,- I 1 -" 9000050000 -1 9000000000 > "$T_TMP.lots"
scoutfs quota-restore -p "$T_M0" < "$T_TMP.lots"
scoutfs quota-list -p "$T_M0" > "$T_TMP.list"
diff -u "$T_TMP.lots" "$T_TMP.list"
scoutfs quota-add -p "$T_M0" -r "1 $TEST_UID,U,S 1,L,- 1,L,- I 1 -"
setfattr -n scoutfs.totl.test.$TEST_UID.1.1 -v 2 "$T_D0"
sync_and_drop
setpriv $SET_UID touch "$FILE" 2>&1 | t_filter_fs
reset_all
t_pass

View File

@@ -1,57 +0,0 @@
t_require_commands scoutfs touch rm setfattr
touch "$T_D0/file-1"
echo "== setting retention on dir fails"
scoutfs set-attr-x -t 1 "$T_D0" 2>&1 | t_filter_fs
echo "== set retention"
scoutfs set-attr-x -t 1 "$T_D0/file-1"
echo "== get-attr-x shows retention"
scoutfs get-attr-x -t "$T_D0/file-1"
echo "== unpriv can't clear retention"
setpriv --ruid=12345 --euid=12345 scoutfs set-attr-x -t 0 "$T_D0/file-1" 2>&1 | t_filter_fs
echo "== can set hidden scoutfs xattr in retention"
setfattr -n scoutfs.hide.srch.retention_test -v val "$T_D0/file-1"
echo "== setting user. xattr fails in retention"
setfattr -n user.retention_test -v val "$T_D0/file-1" 2>&1 | t_filter_fs
echo "== file deletion fails in retention"
rm -f "$T_D0/file-1" 2>&1 | t_filter_fs
echo "== file rename fails in retention"
mv $T_D0/file-1 $T_D0/file-2 2>&1 | t_filter_fs
echo "== file write fails in retention"
date >> $T_D0/file-1
echo "== file truncate fails in retention"
truncate -s 0 $T_D0/file-1 2>&1 | t_filter_fs
echo "== setattr fails in retention"
touch $T_D0/file-1 2>&1 | t_filter_fs
echo "== clear retention"
scoutfs set-attr-x -t 0 "$T_D0/file-1"
echo "== file write"
date >> $T_D0/file-1
echo "== file rename"
mv $T_D0/file-1 $T_D0/file-2
mv $T_D0/file-2 $T_D0/file-1
echo "== setattr"
touch $T_D0/file-1 2>&1 | t_filter_fs
echo "== xattr deletion"
setfattr -x scoutfs.hide.srch.retention_test "$T_D0/file-1"
echo "== cleanup"
rm -f "$T_D0/file-1"
t_pass

View File

@@ -3,7 +3,6 @@ t_require_commands touch rm setfattr scoutfs find_xattrs
read_xattr_totals()
{
sync
echo 1 > $(t_debugfs_path)/drop_weak_item_cache
scoutfs read-xattr-totals -p "$T_M0"
}
@@ -113,6 +112,7 @@ for phase in create update remove; do
echo "$k.0.0 = ${totals[$k]}, ${counts[$k]}"
done ) | grep -v "= 0, 0$" | sort -n >> $T_TMP.check_arr
sync
read_xattr_totals | sort -n >> $T_TMP.check_read
diff -u $T_TMP.check_arr $T_TMP.check_read || \

View File

@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-fno-strict-aliasing \
-I src/ -fno-strict-aliasing \
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,8 +15,9 @@ CFLAGS += -I../kmod/src
endif
BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
OBJ_DIRS := src src/check
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
all: $(BIN)

View File

@@ -270,21 +270,6 @@ metadata that is bound to a specific volume and should not be
transferred with the file by tools that read extended attributes, like
.BR tar(1) .
.TP
.B .indx.
Attributes with the .indx. tag add the inode containing the attribute to
a filesystem-wide index. The name of the extended attribute must end
with strings representing two values separated by dots. The first value
is an unsigned 8-bit value and the second is an unsigned 64-bit value.
These attributes can only be modified with root privileges and the
attributes cannot have a value.
.sp
The inodes in the index are stored in increasing sort order of the
values, with the first u8 value being most significant. An inode can be
at many positions, as tracked by many extended attributes, and its
positions follow the creation, renaming, or deletion of the attributes.
The index can be read with the read-xattr-index command which uses the
underlying READ_XATTR_INDEX ioctl.
.TP
.B .srch.
Attributes with the .srch. tag are indexed so that they can be
found by the
@@ -310,36 +295,6 @@ with the
ioctl.
.RE
.SH FILE RETENTION MODE
A file can be set to retention mode by setting the
.IB RETENTION
attribute with the
.IB SET_ATTR_X
ioctl. This flag can only be set on regular files and requires root
permission (the
.IB CAP_SYS_ADMIN
capability).
.sp
Once in retention mode all modifications of the file will fail. The
only exceptions are that system extended attributes (all those without
the "user." prefix) may be modified. The retention bit may be cleared
with sufficient privileges to remove the retention restrictions on
other modifications.
.RE
.SH PROJECT IDs
All inodes have a project ID attribute that can be set via the
SET_ATTR_X ioctl and displayed with the GET_ATTR_X ioctl. Project IDs
are an unsigned 64-bit value and the value of 0 is reserved to indicate
that no project ID is assigned. If a project ID is set on a directory
then all inodes created with it as the initial parent inherit that ID,
for all file types. This includes files initially unlinked from the
namespace when created with O_TMPFILE. Project IDs are only
automatically inherited from the parent dir on initial creation.
They are not changed as directory entry links to the inode are created
or renamed.
.RE
.SH FORMAT VERSION
The format version defines the layout and use of structures stored on
devices and passed over the network. The version is incremented for
@@ -418,19 +373,6 @@ The version that a mount is using is shown in the
file in the mount's sysfs directory, typically
.I /sys/fs/scoutfs/f.FSID.r.RID/
.RE
.sp
The defined format versions are:
.RS
.TP
.sp
.B 1
Initial format version.
.TP
.B 2
Added retention mode by setting the retention attribute. Added the
project ID inode attribute. Added quota rules and enforcement. Added
the .indx. extended attribute tag.
.RE
.SH CORRUPTION DETECTION
A

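The PROJECT IDs section above pairs with the GET_ATTR_X/SET_ATTR_X ioctls implemented by attr_x.c later in this compare. As a minimal sketch, assuming the struct and mask names shown there; the helper itself is hypothetical and error handling is abbreviated:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"

/* hypothetical helper: set a project ID via SET_ATTR_X */
static int set_project_id(const char *path, unsigned long long proj)
{
	struct scoutfs_ioctl_inode_attr_x iax = {
		.x_mask = SCOUTFS_IOC_IAX_PROJECT_ID,
		.project_id = proj,
	};
	int fd;
	int ret;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	/* a value of 0 clears the ID and stops inheritance, per above */
	ret = ioctl(fd, SCOUTFS_IOC_SET_ATTR_X, &iax);
	close(fd);
	return ret < 0 ? -1 : 0;
}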
View File

@@ -209,16 +209,6 @@ A path within a ScoutFS filesystem.
.RE
.PD
.TP
.BI "get-attr-x FILE"
.sp
Display ScoutFS-specific attributes from a file. If no options are
given then all the attributes that the command supports will be
displayed. If attributes are specified with options then only those
attributes are displayed. If only one attribute is specified then it
will not have a label prefix in the display output. The --help option
will list the attributes that the command supports. The file system may
support a different set of attributes.
.TP
.BI "get-referring-entries [-p|--path PATH] INO"
.sp
@@ -516,15 +506,6 @@ A path within a ScoutFS filesystem.
.RE
.PD
.TP
.BI "set-attr-x FILE"
.sp
Set ScoutFS-specific attributes on a file. Only the attributes that are
specified by options will be set. The --help option will list the
attributes that the command understands. The file system may support a
different set of attributes.
.PD
.TP
.BI "setattr FILE [-d, --data-version=VERSION [-s, --size=SIZE [-o, --offline]]] [-t, --ctime=TIMESPEC]"
.sp

View File

@@ -1,304 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <argp.h>
#include <stdbool.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "parse.h"
#include "cmd.h"
struct attr_x_args {
bool set;
char *filename;
struct scoutfs_ioctl_inode_attr_x iax;
};
#define pr(iax, name, label, fmt, args...) \
do { \
if ((iax->x_mask & SCOUTFS_IOC_IAX_##name)) { \
if (__builtin_popcount(iax->x_mask) > 1) \
printf(label ": " fmt "\n", ##args); \
else \
printf(fmt "\n", ##args); \
} \
} while (0)
#define prb(iax, name, label) \
pr(iax, name, label, "%u", !!((iax)->bits & SCOUTFS_IOC_IAX_B_##name))
static int do_attr_x(struct attr_x_args *args)
{
struct scoutfs_ioctl_inode_attr_x *iax = &args->iax;
int fd = -1;
int ret;
int op;
if (args->set) {
/* nothing to do if not setting */
if (iax->x_mask == 0)
return 0;
op = SCOUTFS_IOC_SET_ATTR_X;
} else {
/* get all known if none specified */
if (iax->x_mask == 0)
iax->x_mask = ~SCOUTFS_IOC_IAX__UNKNOWN;
op = SCOUTFS_IOC_GET_ATTR_X;
}
fd = open(args->filename, O_RDONLY);
if (fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open '%s': %s (%d)\n",
args->filename, strerror(errno), errno);
goto out;
}
ret = ioctl(fd, op, iax);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "attr_x ioctl failed on '%s': "
"%s (%d)\n", args->filename, strerror(errno), errno);
goto out;
}
if (!args->set) {
pr(iax, META_SEQ, "meta_seq", "%llu", iax->meta_seq);
pr(iax, DATA_SEQ, "data_seq", "%llu", iax->data_seq);
pr(iax, DATA_VERSION, "data_version", "%llu", iax->data_version);
pr(iax, ONLINE_BLOCKS, "online_blocks", "%llu", iax->online_blocks);
pr(iax, OFFLINE_BLOCKS, "offline_blocks", "%llu", iax->offline_blocks);
pr(iax, CTIME, "ctime", "%llu.%u", iax->ctime_sec, iax->ctime_nsec);
pr(iax, CRTIME, "crtime", "%llu.%u", iax->crtime_sec, iax->crtime_nsec);
pr(iax, SIZE, "size", "%llu", iax->size);
prb(iax, RETENTION, "retention");
pr(iax, PROJECT_ID, "project_id", "%llu", iax->project_id);
}
ret = 0;
out:
if (fd >= 0)
close(fd);
return ret;
}
/*
* This is called for both get and set. The get calls won't have
* arguments and are only setting the mask. The set calls parse the
* value to set. We could have defaults by making set option arguments
* optional, like setting the current time for timestamps, but that
* hasn't been needed.
*
* Option value parsing places no constraints on the attributes or
* values themselves once parsed. This lets us use the set command to
* test the kernel's testing for invalid attribute combinations and
* values.
*/
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct attr_x_args *args = state->input;
struct timespec ts;
int ret;
u64 x;
switch (key) {
case 'm':
args->iax.x_mask |= SCOUTFS_IOC_IAX_META_SEQ;
if (arg) {
ret = parse_u64(arg, &args->iax.meta_seq);
if (ret)
return ret;
}
break;
case 'd':
args->iax.x_mask |= SCOUTFS_IOC_IAX_DATA_SEQ;
if (arg) {
ret = parse_u64(arg, &args->iax.data_seq);
if (ret)
return ret;
}
break;
case 'v':
args->iax.x_mask |= SCOUTFS_IOC_IAX_DATA_VERSION;
if (arg) {
ret = parse_u64(arg, &args->iax.data_version);
if (ret)
return ret;
if (args->iax.data_version == 0)
argp_error(state, "data version must not be 0");
}
break;
case 'n':
args->iax.x_mask |= SCOUTFS_IOC_IAX_ONLINE_BLOCKS;
if (arg) {
ret = parse_u64(arg, &args->iax.online_blocks);
if (ret)
return ret;
}
break;
case 'f':
args->iax.x_mask |= SCOUTFS_IOC_IAX_OFFLINE_BLOCKS;
if (arg) {
ret = parse_u64(arg, &args->iax.offline_blocks);
if (ret)
return ret;
}
break;
case 'c':
args->iax.x_mask |= SCOUTFS_IOC_IAX_CTIME;
if (arg) {
ret = parse_timespec(arg, &ts);
if (ret)
return ret;
args->iax.ctime_sec = ts.tv_sec;
args->iax.ctime_nsec = ts.tv_nsec;
}
break;
case 'r':
args->iax.x_mask |= SCOUTFS_IOC_IAX_CRTIME;
if (arg) {
ret = parse_timespec(arg, &ts);
if (ret)
return ret;
args->iax.crtime_sec = ts.tv_sec;
args->iax.crtime_nsec = ts.tv_nsec;
}
break;
case 's':
args->iax.x_mask |= SCOUTFS_IOC_IAX_SIZE;
if (arg) {
ret = parse_u64(arg, &args->iax.size);
if (ret)
return ret;
}
break;
case 't':
args->iax.x_mask |= SCOUTFS_IOC_IAX_RETENTION;
if (arg) {
ret = parse_u64(arg, &x);
if (ret)
return ret;
if (x)
args->iax.bits |= SCOUTFS_IOC_IAX_B_RETENTION;
}
break;
case 'p':
args->iax.x_mask |= SCOUTFS_IOC_IAX_PROJECT_ID;
if (arg) {
ret = parse_u64(arg, &args->iax.project_id);
if (ret)
return ret;
}
break;
case ARGP_KEY_ARG:
if (!args->filename)
args->filename = strdup_or_error(state, arg);
else
argp_error(state, "more than one argument given");
break;
case ARGP_KEY_FINI:
if (!args->filename)
argp_error(state, "no filename given");
break;
default:
break;
}
return 0;
}
/*
* The get options are derived from these by copying the struct and
* modifying fields.
*/
static struct argp_option set_options[] = {
{ "meta_seq", 'm', "SEQ", 0, "Inode Metadata change index sequence number"},
{ "data_seq", 'd', "SEQ", 0, "File Data change index sequence number"},
{ "data_version", 'v', "VERSION", 0, "File Data contents version"},
{ "online_blocks", 'n', "COUNT", 0, "Online data block count"},
{ "offline_blocks", 'f', "COUNT", 0, "Offline data block count"},
{ "ctime", 'c', "SECS.NSECS", 0, "Inode change time (posix ctime)"},
{ "crtime", 'r', "SECS.NSECS", 0, "ScoutFS creation time"},
{ "size", 's', "SIZE", 0, "Inode i_size field"},
{ "retention", 't', "0|1", 0, "Retention flag"},
{ "project_id", 'p', "PROJECT_ID", 0, "Project ID"},
{ NULL }
};
static struct argp get_argp = {
NULL, /* dynamically built */
parse_opt,
"FILE",
"get extensible file attributes"
};
static int get_attr_x_cmd(int argc, char **argv)
{
struct attr_x_args args = {0,};
int ret;
ret = argp_parse(&get_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_attr_x(&args);
}
/*
* The get options match the set options but don't take argument
* values.
*/
static void build_get_options(void)
{
struct argp_option **opts = (struct argp_option **)&get_argp.options;
int i;
*opts = calloc(array_size(set_options), sizeof(set_options[0]));
assert(*opts);
memcpy(*opts, set_options, array_size(set_options) * sizeof(set_options[0]));
for (i = 0; i < array_size(set_options) - 1; i++)
(*opts)[i].arg = NULL;
}
static void __attribute__((constructor)) get_ctor(void)
{
build_get_options();
cmd_register_argp("get-attr-x", &get_argp, GROUP_AGENT, get_attr_x_cmd);
}
static struct argp set_argp = {
set_options,
parse_opt,
"FILE",
"Set extensible file attributes"
};
static int set_attr_x_cmd(int argc, char **argv)
{
struct attr_x_args args = {.set = true,};
int ret;
ret = argp_parse(&set_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_attr_x(&args);
}
static void __attribute__((constructor)) set_ctor(void)
{
cmd_register_argp("set-attr-x", &set_argp, GROUP_AGENT, set_attr_x_cmd);
}

View File

@@ -10,6 +10,11 @@
* Just a quick simple native bitmap.
*/
int test_bit(unsigned long *bits, u64 nr)
{
return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
}
void set_bit(unsigned long *bits, u64 nr)
{
bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));

View File

@@ -1,6 +1,7 @@
#ifndef _BITMAP_H_
#define _BITMAP_H_
int test_bit(unsigned long *bits, u64 nr);
void set_bit(unsigned long *bits, u64 nr);
void clear_bit(unsigned long *bits, u64 nr);
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);

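A quick hedged example of the bitmap helpers above; it assumes BITS_PER_LONG and DIV_ROUND_UP come from the shared utils headers, as bitmap.c and block.c use them, and the function name is illustrative:

#include <stdlib.h>
#include "sparse.h"
#include "util.h"
#include "bitmap.h"

/* hypothetical example: size a bitmap, then set and test a bit */
static int bitmap_example(u64 nbits)
{
	unsigned long *bits;
	int was_set;

	bits = calloc(DIV_ROUND_UP(nbits, BITS_PER_LONG), sizeof(unsigned long));
	if (!bits)
		return -1;
	set_bit(bits, 42);
	was_set = test_bit(bits, 42);	/* 1 */
	clear_bit(bits, 42);
	free(bits);
	return was_set;
}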
View File

@@ -119,16 +119,6 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args)
goto out;
}
if (le64_to_cpu(meta_super->fmt_vers) > args->fmt_vers ||
le64_to_cpu(data_super->fmt_vers) > args->fmt_vers) {
ret = -EPERM;
printf("Downgrade of Meta Format Version: %llu and Data Format Version: %llu to Format Version: %llu is not allowed\n",
le64_to_cpu(meta_super->fmt_vers),
le64_to_cpu(data_super->fmt_vers),
args->fmt_vers);
goto out;
}
if (le64_to_cpu(meta_super->fmt_vers) != args->fmt_vers) {
meta_super->fmt_vers = cpu_to_le64(args->fmt_vers);

159
utils/src/check/alloc.c Normal file
View File

@@ -0,0 +1,159 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
/*
* We check the list blocks serially.
*
* XXX:
* - compare ref seqs
* - detect cycles?
*/
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ref = lblk->next;
block_put(&blk);
}
ret = 0;
out:
return ret;
}
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
return btree_meta_iter(&root->root, cb, cb_arg);
}
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
sns_push("alloc_list_block", blkno, 0);
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ret = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
ret = cb(blkno, 1, cb_arg);
if (ret < 0)
break;
}
ref = lblk->next;
block_put(&blk);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
ret = 0;
out:
return ret;
}
static bool valid_free_extent_key(struct scoutfs_key *key)
{
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
(!key->_sk_fourth && !key->sk_type &&
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
}
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct extent_cb_arg_t *ecba = cb_arg;
u64 start;
u64 len;
/* XXX not sure these eios are what we want */
if (val_len != 0)
return -EIO;
if (!valid_free_extent_key(key))
return -EIO;
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
return -ECHECK_ITER_DONE;
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
len = le64_to_cpu(key->skfb_len);
return ecba->cb(start, len, ecba->cb_arg);
}
/*
* Call the callback with each of the primary BLKNO free extents stored
* in items in the given alloc root. It doesn't visit the secondary
* ORDER extents.
*/
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&root->root, free_item_cb, &ecba);
}

12
utils/src/check/alloc.h Normal file
View File

@@ -0,0 +1,12 @@
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
#include "extent.h"
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
#endif

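To show how these iterators are consumed, a hedged sketch that totals the free extents under an allocator root; it assumes extent_cb_t in extent.h is int (*)(u64 start, u64 len, void *cb_arg), matching how alloc.c above invokes its callbacks, and both helper names are hypothetical:

/* hypothetical callback: accumulate extent lengths into cb_arg */
static int sum_extent_cb(u64 start, u64 len, void *cb_arg)
{
	u64 *total = cb_arg;

	*total += len;
	return 0;
}

static int sum_free_extents(struct scoutfs_alloc_root *root, u64 *total)
{
	*total = 0;
	return alloc_root_extent_iter(root, sum_extent_cb, total);
}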
564
utils/src/check/block.c Normal file
View File

@@ -0,0 +1,564 @@
#define _ISOC11_SOURCE /* aligned_alloc */
#define _DEFAULT_SOURCE /* syscall() */
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "list.h"
#include "cmp.h"
#include "hash.h"
#include "block.h"
#include "debug.h"
#include "eno.h"
static struct block_data {
struct list_head *hash_lists;
size_t hash_nr;
struct list_head active_head;
struct list_head inactive_head;
struct list_head dirty_list;
size_t nr_active;
size_t nr_inactive;
size_t nr_dirty;
int meta_fd;
size_t max_cached;
size_t nr_events;
aio_context_t ctx;
struct iocb *iocbs;
struct iocb **iocbps;
struct io_event *events;
} global_bdat;
struct block {
struct list_head hash_head;
struct list_head lru_head;
struct list_head dirty_head;
struct list_head submit_head;
unsigned long refcount;
unsigned long uptodate:1,
active:1;
u64 blkno;
void *buf;
size_t size;
};
#define BLK_FMT \
"blkno %llu rc %lu d %u a %u"
#define BLK_ARG(blk) \
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), (blk)->active
#define debug_blk(blk, fmt, args...) \
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
/*
* This just allocates and initializes the block. The caller is
* responsible for putting it on the appropriate initial lists and
* managing refcounts.
*/
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
{
struct block *blk;
blk = calloc(1, sizeof(struct block));
if (blk) {
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
if (!blk->buf) {
free(blk);
blk = NULL;
} else {
INIT_LIST_HEAD(&blk->hash_head);
INIT_LIST_HEAD(&blk->lru_head);
INIT_LIST_HEAD(&blk->dirty_head);
INIT_LIST_HEAD(&blk->submit_head);
blk->blkno = blkno;
blk->size = size;
}
}
return blk;
}
static void free_block(struct block_data *bdat, struct block *blk)
{
debug_blk(blk, "free");
if (!list_empty(&blk->lru_head)) {
if (blk->active)
bdat->nr_active--;
else
bdat->nr_inactive--;
list_del(&blk->lru_head);
}
if (!list_empty(&blk->dirty_head)) {
bdat->nr_dirty--;
list_del(&blk->dirty_head);
}
if (!list_empty(&blk->hash_head))
list_del(&blk->hash_head);
if (!list_empty(&blk->submit_head))
list_del(&blk->submit_head);
free(blk->buf);
free(blk);
}
static bool blk_is_dirty(struct block *blk)
{
return !list_empty(&blk->dirty_head);
}
/*
* Rebalance the cache.
*
* First we shrink the cache to limit it to max_cached blocks.
* Logically, we walk from oldest to newest in the inactive list and
* then in the active list. Since these lists are physically one
* list_head list we achieve this with a reverse walk starting from the
* active head.
*
* Then we rebalance the size of the two lists. The constraint is that
* we don't let the active list grow larger than the inactive list. We
* move blocks from the oldest tail of the active list to the newest
* head of the inactive list.
*
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
*/
static void rebalance_cache(struct block_data *bdat)
{
struct block *blk;
struct block *blk_;
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
break;
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
blk_is_dirty(blk))
continue;
free_block(bdat, blk);
}
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
break;
list_move(&blk->lru_head, &bdat->inactive_head);
blk->active = 0;
bdat->nr_active--;
bdat->nr_inactive++;
}
}
static void make_active(struct block_data *bdat, struct block *blk)
{
if (!blk->active) {
if (!list_empty(&blk->lru_head)) {
list_move(&blk->lru_head, &bdat->active_head);
bdat->nr_inactive--;
} else {
list_add(&blk->lru_head, &bdat->active_head);
}
blk->active = 1;
bdat->nr_active++;
}
}
static int compar_iocbp(const void *A, const void *B)
{
struct iocb *a = *(struct iocb **)A;
struct iocb *b = *(struct iocb **)B;
return scoutfs_cmp(a->aio_offset, b->aio_offset);
}
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
{
struct io_event *event;
struct iocb *iocb;
struct block *blk;
int ret;
int err;
int nr;
int i;
err = 0;
nr = 0;
list_for_each_entry(blk, list, submit_head) {
iocb = &bdat->iocbs[nr];
bdat->iocbps[nr] = iocb;
memset(iocb, 0, sizeof(struct iocb));
iocb->aio_data = (intptr_t)blk;
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
iocb->aio_fildes = bdat->meta_fd;
iocb->aio_buf = (intptr_t)blk->buf;
iocb->aio_nbytes = blk->size;
iocb->aio_offset = blk->blkno * blk->size;
nr++;
debug_blk(blk, "submit");
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
continue;
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error submitting async IO: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error getting IO events: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = 0;
for (i = 0; i < nr; i++) {
event = &bdat->events[i];
iocb = (struct iocb *)(intptr_t)event->obj;
blk = (struct block *)(intptr_t)event->data;
debug_blk(blk, "complete res %lld", (long long)event->res);
if (event->res >= 0 && event->res != blk->size)
event->res = -EIO;
/* io errors are fatal */
if (event->res < 0) {
ret = event->res;
goto out;
}
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
blk->uptodate = 1;
} else {
list_del_init(&blk->dirty_head);
bdat->nr_dirty--;
}
}
nr = 0;
}
ret = 0;
out:
return ret ?: err;
}
static void inc_refcount(struct block *blk)
{
blk->refcount++;
}
void block_put(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
struct block *blk = *blkp;
if (blk) {
blk->refcount--;
*blkp = NULL;
rebalance_cache(bdat);
}
}
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
{
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
return &bdat->hash_lists[hash % bdat->hash_nr];
}
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
{
struct list_head *bucket = hash_bucket(bdat, blkno);
struct block *search;
struct block *blk;
size_t size;
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
blk = NULL;
list_for_each_entry(search, bucket, hash_head) {
if (search->blkno == blkno && search->size == size) {
blk = search;
break;
}
}
if (!blk) {
blk = alloc_block(bdat, blkno, size);
if (blk) {
list_add(&blk->hash_head, bucket);
list_add(&blk->lru_head, &bdat->inactive_head);
bdat->nr_inactive++;
}
}
if (blk)
inc_refcount(blk);
return blk;
}
/*
* Get a block.
*
* The caller holds a refcount to the block while it's in use that
* prevents it from being removed from the cache. It must be dropped
* with block_put();
*/
int block_get(struct block **blk_ret, u64 blkno, int bf)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
LIST_HEAD(list);
int ret;
blk = get_or_alloc(bdat, blkno, bf);
if (!blk) {
ret = -ENOMEM;
goto out;
}
if ((bf & BF_ZERO)) {
memset(blk->buf, 0, blk->size);
blk->uptodate = 1;
}
if (bf & BF_OVERWRITE)
blk->uptodate = 1;
if (!blk->uptodate) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
if (ret < 0)
goto out;
}
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
list_add_tail(&blk->dirty_head, &bdat->dirty_list);
bdat->nr_dirty++;
}
make_active(bdat, blk);
rebalance_cache(bdat);
ret = 0;
out:
if (ret < 0)
block_put(&blk);
*blk_ret = blk;
return ret;
}
void *block_buf(struct block *blk)
{
return blk->buf;
}
size_t block_size(struct block *blk)
{
return blk->size;
}
/*
* Drop the block from the cache, regardless of whether it was dirty.
* This is used to avoid writing blocks which were dirtied but then
* later freed.
*
* The block is immediately freed and can't be referenced after this
* returns.
*/
void block_drop(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
free_block(bdat, *blkp);
*blkp = NULL;
rebalance_cache(bdat);
}
/*
* This doesn't quite work for mixing large and small blocks, but that's
* fine, we never do that.
*/
static int compar_u64(const void *A, const void *B)
{
u64 a = *((u64 *)A);
u64 b = *((u64 *)B);
return scoutfs_cmp(a, b);
}
/*
* This read-ahead is synchronous and errors are ignored. If any of the
* blknos aren't present in the cache then we issue concurrent reads for
* them and wait. Any existing cached blocks will be left as is.
*
* We might be trying to read a lot more than the number of events so we
* sort the caller's blknos before iterating over them rather than
* relying on submission sorting the blocks in each submitted set.
*/
void block_readahead(u64 *blknos, size_t nr)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
size_t i;
if (nr == 0)
return;
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
for (i = 0; i < nr; i++) {
blk = get_or_alloc(bdat, blknos[i], 0);
if (blk) {
if (!blk->uptodate)
list_add_tail(&blk->submit_head, &list);
else
block_put(&blk);
}
}
(void)submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
rebalance_cache(bdat);
}
/*
* The caller's block changes form a consistent transaction. If the number of dirty
* blocks is large enough, or the caller forces it, we write them out and commit.
*/
int block_try_commit(bool force)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
int ret;
if (!force && bdat->nr_dirty < bdat->nr_events)
return 0;
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
list_add_tail(&blk->submit_head, &list);
inc_refcount(blk);
}
ret = submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0) {
printf("error writing dirty transaction blocks\n");
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
if (ret == 0) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0)
printf("error writing super block to commit transaction\n");
out:
rebalance_cache(bdat);
return ret;
}
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
{
struct block_data *bdat = &global_bdat;
size_t i;
int ret;
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->hash_nr = bdat->max_cached / 4;
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&bdat->active_head);
INIT_LIST_HEAD(&bdat->inactive_head);
INIT_LIST_HEAD(&bdat->dirty_list);
bdat->meta_fd = meta_fd;
list_add(&bdat->inactive_head, &bdat->active_head);
for (i = 0; i < bdat->hash_nr; i++)
INIT_LIST_HEAD(&bdat->hash_lists[i]);
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
out:
if (ret < 0) {
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}
return ret;
}
void block_shutdown(void)
{
struct block_data *bdat = &global_bdat;
syscall(SYS_io_destroy, bdat->ctx);
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}

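Putting the pieces together, a hedged lifecycle sketch in the shape of do_check() in check.c below: set up the cache over a metadata device fd, dirty a block read-modify-write style, and force a commit before shutdown. The cache and dirty-limit sizes are the ones check.c happens to pass; the helper and blkno are illustrative:

static int example_dirty_and_commit(int meta_fd, u64 blkno)
{
	struct block *blk = NULL;
	int ret;

	ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
	if (ret < 0)
		return ret;
	ret = block_get(&blk, blkno, BF_DIRTY);	/* read, then mark dirty */
	if (ret == 0) {
		/* ... modify block_buf(blk) contents here ... */
		block_put(&blk);
		ret = block_try_commit(true);	/* write dirty blocks and super */
	}
	block_shutdown();
	return ret;
}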
32
utils/src/check/block.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
#include <unistd.h>
#include <stdbool.h>
struct block;
#include "sparse.h"
/* block flags passed to block_get() */
enum {
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
BF_DIRTY = (1 << 1), /* block will be written with transaction */
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
};
int block_get(struct block **blk_ret, u64 blkno, int bf);
void block_put(struct block **blkp);
void *block_buf(struct block *blk);
size_t block_size(struct block *blk);
void block_drop(struct block **blkp);
void block_readahead(u64 *blknos, size_t nr);
int block_try_commit(bool force);
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
void block_shutdown(void);
#endif

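The calling convention this header implies, sketched under the same assumptions as the check code in this compare: every block_get() reference must be dropped with block_put() before the block can be reclaimed. The function name is hypothetical:

/* hypothetical example of the refcounted get/use/put pattern */
static int visit_block(u64 blkno)
{
	struct block *blk = NULL;
	int ret;

	ret = block_get(&blk, blkno, 0);	/* reads if not cached */
	if (ret < 0)
		return ret;
	/* block_buf(blk) is valid for block_size(blk) bytes while held */
	block_put(&blk);
	return 0;
}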
209
utils/src/check/btree.c Normal file
View File

@@ -0,0 +1,209 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "avl.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "meta.h"
#include "problem.h"
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
{
return (void *)bt + le16_to_cpu(item->val_off);
}
static void readahead_refs(struct scoutfs_btree_block *bt)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_block_ref *ref;
u64 *blknos;
u64 blkno;
u16 valid = 0;
u16 nr = le16_to_cpu(bt->nr_items);
int i;
blknos = calloc(nr, sizeof(blknos[0]));
if (!blknos)
return;
node = avl_first(&bt->item_root);
for (i = 0; i < nr; i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
blkno = le64_to_cpu(ref->blkno);
if (valid_meta_blkno(blkno))
blknos[valid++] = blkno;
node = avl_next(&bt->item_root, &item->node);
}
if (valid > 0)
block_readahead(blknos, valid);
free(blknos);
}
/*
* Call the callback on the referenced block. Then if the block
* contains references, read it and recurse into all its references.
*/
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
return xlate_iter_errno(ret);
}
if (level == 0)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
sns_push("btree_parent", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead last level of parents */
if (level == 2)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
if (ret < 0)
goto out;
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
}
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
btree_item_cb_t cb, void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
if (level)
sns_push("btree_parent", blkno, 0);
else
sns_push("btree_leaf", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead leaves that contain items */
if (level == 1)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
if (level) {
ref = item_val(bt, item);
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
} else {
ret = cb(&item->key, item_val(bt, item),
le16_to_cpu(item->val_len), cb_arg);
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
}
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
}

14
utils/src/check/btree.h Normal file
View File

@@ -0,0 +1,14 @@
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
#include "util.h"
#include "format.h"
#include "extent.h"
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
#endif

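A hedged sketch of driving btree_item_iter(): count the leaf items under a root. Callbacks stop the walk by returning a negative value; free_item_cb() in alloc.c above uses -ECHECK_ITER_DONE to end a walk early without reporting an error. Both helper names are hypothetical:

/* hypothetical callback: count every leaf item */
static int count_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
	u64 *count = cb_arg;

	(*count)++;
	return 0;
}

static int count_items(struct scoutfs_btree_root *root, u64 *count)
{
	*count = 0;
	return btree_item_iter(root, count_item_cb, count);
}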
149
utils/src/check/check.c Normal file
View File

@@ -0,0 +1,149 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "debug.h"
#include "meta.h"
#include "super.h"
struct check_args {
char *meta_device;
char *data_device;
char *debug_path;
};
static int do_check(struct check_args *args)
{
int debug_fd = -1;
int meta_fd = -1;
int data_fd = -1;
int ret;
if (args->debug_path) {
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (debug_fd < 0) {
ret = -errno;
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
args->debug_path, strerror(errno), errno);
goto out;
}
debug_enable(debug_fd);
}
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
if (meta_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
if (data_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
args->data_device, strerror(errno), errno);
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
if (ret < 0)
goto out;
ret = check_supers() ?:
check_meta_alloc();
out:
/* and tear it all down */
block_shutdown();
super_shutdown();
debug_disable();
if (meta_fd >= 0)
close(meta_fd);
if (data_fd >= 0)
close(data_fd);
if (debug_fd >= 0)
close(debug_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct check_args *args = state->input;
switch (key) {
case 'd':
args->debug_path = strdup_or_error(state, arg);
break;
case 'e':
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else if (!args->data_device)
args->data_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
if (!args->data_device)
argp_error(state, "no data device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"META-DEVICE DATA-DEVICE",
"Check filesystem consistency"
};
static int check_cmd(int argc, char **argv)
{
struct check_args check_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
if (ret)
return ret;
return do_check(&check_args);
}
static void __attribute__((constructor)) check_ctor(void)
{
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
}

16
utils/src/check/debug.c Normal file
View File

@@ -0,0 +1,16 @@
#include <stdlib.h>
#include "debug.h"
int debug_fd = -1;
void debug_enable(int fd)
{
debug_fd = fd;
}
void debug_disable(void)
{
if (debug_fd >= 0)
debug_fd = -1;
}

17
utils/src/check/debug.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
#include <stdio.h>
#define debug(fmt, args...) \
do { \
if (debug_fd >= 0) \
dprintf(debug_fd, fmt"\n", ##args); \
} while (0)
extern int debug_fd;
void debug_enable(int fd);
void debug_disable(void);
#endif

9
utils/src/check/eno.h Normal file
View File

@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
#define _SCOUTFS_UTILS_CHECK_ENO_H_
#include <errno.h>
#define ENO_FMT "%d (%s)"
#define ENO_ARG(eno) eno, strerror(eno)
#endif

312
utils/src/check/extent.c Normal file
View File

@@ -0,0 +1,312 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "util.h"
#include "lk_rbtree_wrapper.h"
#include "debug.h"
#include "extent.h"
/*
* In-memory extent management in rbtree nodes.
*/
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
u64 a_end = a_start + a_len;
u64 b_end = b_start + b_len;
return !((a_end <= b_start) || (b_end <= a_start));
}
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
{
return ext->start <= start && ext->start + ext->len >= start + len;
}
/*
* True if the given extent is bisected by the given range; there's
* leftover containing extents on both the left and right sides of the
* range in the extent.
*/
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
{
return ext->start < start && ext->start + ext->len > start + len;
}
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
{
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
}
static struct extent_node *next_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
}
static struct extent_node *prev_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
}
struct walk_results {
unsigned bisect_to_leaf:1;
struct extent_node *found;
struct extent_node *next;
struct rb_node *parent;
struct rb_node **node;
};
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
{
struct rb_node **node = &root->rbroot.rb_node;
struct extent_node *ext;
u64 end = start + len;
int cmp;
wlk->found = NULL;
wlk->next = NULL;
wlk->parent = NULL;
while (*node) {
wlk->parent = *node;
ext = ext_from_rbnode(*node);
cmp = end <= ext->start ? -1 :
start >= ext->start + ext->len ? 1 : 0;
if (cmp < 0) {
node = &ext->rbnode.rb_left;
wlk->next = ext;
} else if (cmp > 0) {
node = &ext->rbnode.rb_right;
} else {
wlk->found = ext;
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
break;
/* walk right so we can insert the greater remainder from bisection */
node = &ext->rbnode.rb_right;
}
}
wlk->node = node;
}
/*
* Return an extent that overlaps with the given range.
*/
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
{
struct walk_results wlk = { 0, };
int ret;
walk_extents(root, start, len, &wlk);
if (wlk.found) {
memset(found, 0, sizeof(struct extent_node));
found->start = wlk.found->start;
found->len = wlk.found->len;
ret = 0;
} else {
ret = -ENOENT;
}
return ret;
}
/*
* Callers can iterate through direct node references and are entirely
* responsible for consistency when doing so.
*/
struct extent_node *extent_first(struct extent_root *root)
{
struct walk_results wlk = { 0, };
walk_extents(root, 0, 1, &wlk);
return wlk.found ?: wlk.next;
}
struct extent_node *extent_next(struct extent_node *ext)
{
return next_ext(ext);
}
struct extent_node *extent_prev(struct extent_node *ext)
{
return prev_ext(ext);
}
/*
* Insert a new extent into the tree. We can extend existing nodes,
* merge with neighbours, or remove existing extents entirely if we
* insert a range that fully spans existing nodes.
*/
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
{
struct walk_results wlk = { 0, };
struct extent_node *ext;
struct extent_node *nei;
int ret;
walk_extents(root, start, len, &wlk);
ext = wlk.found;
if (ext && found_err) {
ret = found_err;
goto out;
}
if (!ext) {
ext = malloc(sizeof(struct extent_node));
if (!ext) {
ret = -ENOMEM;
goto out;
}
ext->start = start;
ext->len = len;
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ext->rbnode, &root->rbroot);
}
/* start by expanding an existing extent if our range is larger */
if (start < ext->start) {
ext->len += ext->start - start;
ext->start = start;
}
if (ext->start + ext->len < start + len)
ext->len += (start + len) - (ext->start + ext->len);
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
while ((nei = prev_ext(ext))) {
if (nei->start + nei->len < ext->start)
break;
if (nei->start < ext->start) {
ext->len += ext->start - nei->start;
ext->start = nei->start;
}
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
while ((nei = next_ext(ext))) {
if (ext->start + ext->len < nei->start)
break;
if (ext->start + ext->len < nei->start + nei->len)
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
/*
* Insert a new extent. The specified extent must not overlap with any
* existing extents or -EEXIST is returned.
*/
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, -EEXIST);
}
/*
* Insert an extent, extending any existing extents that may overlap.
*/
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, false);
}
/*
* Remove the specified extent from an existing node. The given extent must be fully
* contained in a single node or -ENOENT is returned.
*/
int extent_remove(struct extent_root *root, u64 start, u64 len)
{
struct extent_node *ext;
struct extent_node *ins;
struct walk_results wlk = {
.bisect_to_leaf = 1,
};
int ret;
walk_extents(root, start, len, &wlk);
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
ret = -ENOENT;
goto out;
}
if (ext_bisected(ext, start, len)) {
debug("found bisected start %llu len %llu", ext->start, ext->len);
ins = malloc(sizeof(struct extent_node));
if (!ins) {
ret = -ENOMEM;
goto out;
}
ins->start = start + len;
ins->len = (ext->start + ext->len) - ins->start;
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ins->rbnode, &root->rbroot);
}
if (start > ext->start) {
ext->len = start - ext->start;
} else if (len < ext->len) {
ext->start += len;
ext->len -= len;
} else {
rb_erase(&ext->rbnode, &root->rbroot);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
void extent_root_init(struct extent_root *root)
{
root->rbroot = RB_ROOT;
root->total = 0;
}
void extent_root_free(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
rb_erase(&ext->rbnode, &root->rbroot);
free(ext);
}
}
void extent_root_print(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
debug(" start %llu len %llu", ext->start, ext->len);
}
}

utils/src/check/extent.h Normal file

@@ -0,0 +1,38 @@
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
#include "lk_rbtree_wrapper.h"
struct extent_root {
struct rb_root rbroot;
u64 total;
};
struct extent_node {
struct rb_node rbnode;
u64 start;
u64 len;
};
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
struct extent_cb_arg_t {
extent_cb_t cb;
void *cb_arg;
};
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
struct extent_node *extent_first(struct extent_root *root);
struct extent_node *extent_next(struct extent_node *ext);
struct extent_node *extent_prev(struct extent_node *ext);
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
int extent_remove(struct extent_root *root, u64 start, u64 len);
void extent_root_init(struct extent_root *root);
void extent_root_free(struct extent_root *root);
void extent_root_print(struct extent_root *root);
#endif
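A minimal usage sketch of the extent API declared above (hypothetical demo function, assuming stdio.h and extent.h are included; error handling omitted):

	static void extent_demo(void)
	{
		struct extent_root root;
		struct extent_node found;

		extent_root_init(&root);
		extent_insert_extend(&root, 10, 5);	/* tree holds [10,15) */
		extent_insert_extend(&root, 15, 5);	/* merges into [10,20) */
		extent_remove(&root, 12, 2);		/* bisects into [10,12) and [14,20) */
		if (extent_lookup(&root, 14, 1, &found) == 0)
			printf("start %llu len %llu\n", found.start, found.len); /* 14, 6 */
		extent_root_free(&root);
	}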

utils/src/check/image.c Normal file

@@ -0,0 +1,540 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "bitmap.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "log_trees.h"
#include "super.h"
/* the largest positive value an off_t can hold */
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
struct scoutfs_meta_image_header {
__le64 magic;
__le64 total_bytes;
__le32 version;
} __packed;
struct scoutfs_meta_image_block_header {
__le64 magic;
__le64 offset;
__le32 size;
__le32 crc;
} __packed;
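/*
* The image stream is a single scoutfs_meta_image_header followed by a
* (scoutfs_meta_image_block_header, block contents) pair for each
* referenced metadata block; total_bytes covers the entire stream.
*/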
struct image_args {
char *meta_device;
bool is_read;
bool show_header;
u64 ra_window;
};
struct block_bitmaps {
unsigned long *bits;
u64 size;
u64 count;
};
#define errf(fmt, args...) \
dprintf(STDERR_FILENO, fmt, ##args)
static int set_meta_bit(u64 start, u64 len, void *arg)
{
struct block_bitmaps *bm = arg;
int ret;
if (len != 1) {
ret = -EINVAL;
} else {
if (!test_bit(bm->bits, start)) {
set_bit(bm->bits, start);
bm->count++;
}
ret = 0;
}
return ret;
}
static int get_ref_bits(struct block_bitmaps *bm)
{
struct scoutfs_super_block *super = global_super;
int ret;
u64 i;
/*
* There are almost no small blocks we need to read, so we read
* them as the large blocks that contain them to simplify the
* block reading process.
*/
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
log_trees_meta_iter(set_meta_bit, bm);
return ret;
}
/*
* Note that this temporarily modifies the header that it's given.
*/
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
{
__le32 saved = bh->crc;
u32 crc = ~0;
bh->crc = 0;
crc = crc32c(crc, bh, sizeof(*bh));
crc = crc32c(crc, buf, size);
bh->crc = saved;
return cpu_to_le32(crc);
}
static void printf_header(struct scoutfs_meta_image_header *hdr)
{
errf("magic: 0x%016llx\n"
"total_bytes: %llu\n"
"version: %u\n",
le64_to_cpu(hdr->magic),
le64_to_cpu(hdr->total_bytes),
le32_to_cpu(hdr->version));
}
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
{
return read(fd, buf, count);
}
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
{
return pread(fd, buf, count, offset);
}
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
{
return write(fd, buf, count);
}
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
{
return pwrite(fd, buf, count, offset);
}
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
{
ssize_t sret;
while (count > 0) {
sret = func(fd, buf, count, offset);
if (sret <= 0 || sret > count) {
if (sret < 0)
return -errno;
else
return -EIO;
}
if (tot)
*tot += sret;
buf += sret;
count -= sret;
}
return 0;
}
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
u64 opening;
void *buf;
off_t off;
u64 bit;
u64 ra;
int ret;
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
if (!buf) {
ret = -ENOMEM;
goto out;
}
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
hdr.version = cpu_to_le32(1);
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
opening = args->ra_window;
ra = 0;
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
/* readahead to open the full window, then a block at a time */
do {
ra = find_next_set_bit(bm->bits, ra, bm->size);
/* stop, rather than spin, once there's nothing left to read ahead */
if (ra >= bm->size)
break;
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
ra++;
if (opening)
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
} while (opening > 0);
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
if (ret < 0)
goto out;
/*
* Might as well try to drop the pages we've used to
* reduce memory pressure on our read-ahead pages that
* are waiting.
*/
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
bh.offset = cpu_to_le64(off);
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
if (ret < 0)
goto out;
}
out:
free(buf);
return ret;
}
static int invalid_header(struct scoutfs_meta_image_header *hdr)
{
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
} else if (le32_to_cpu(hdr->version) != 1) {
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
} else {
return 0;
}
return -EIO;
}
/*
* Doesn't catch offset+size overflowing, presumes pwrite() will return
* an error.
*/
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
{
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
} else if (le32_to_cpu(bh->size) == 0) {
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
errf("block header size %u too large for size_t (> %zu)\n",
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
errf("block header offset %llu too large for off_t (> %llu)\n",
le64_to_cpu(bh->offset), (u64)OFF_MAX);
} else {
return 0;
}
return -EIO;
}
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
size_t writeback_batch = (2 * 1024 * 1024);
size_t buf_size;
size_t dirty;
size_t size;
off_t first;
off_t last;
off_t off;
__le32 calc;
void *buf = NULL;
u64 tot;
int ret;
tot = 0;
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = invalid_header(&hdr);
if (ret < 0)
goto out;
dirty = 0;
first = OFF_MAX;
last = 0;
buf = NULL;
buf_size = 0;
while (tot < le64_to_cpu(hdr.total_bytes)) {
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
if (ret < 0)
goto out;
ret = invalid_block_header(&bh);
if (ret < 0)
goto out;
size = le32_to_cpu(bh.size);
if (buf_size < size) {
buf = realloc(buf, size);
if (!buf) {
ret = -ENOMEM;
goto out;
}
buf_size = size;
}
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
if (ret < 0)
goto out;
calc = calc_crc(&bh, buf, size);
if (calc != bh.crc) {
errf("crc err");
ret = -EIO;
goto out;
}
off = le64_to_cpu(bh.offset);
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
if (ret < 0)
goto out;
dirty += size;
first = min(first, off);
last = max(last, off + (off_t)size);
if (dirty >= writeback_batch) {
posix_fadvise(fd, first, last - first, POSIX_FADV_DONTNEED);
dirty = 0;
first = OFF_MAX;
last = 0;
}
}
ret = fsync(fd);
if (ret < 0) {
ret = -errno;
goto out;
}
out:
free(buf);
return ret;
}
static int do_image(struct image_args *args)
{
struct block_bitmaps bm = { .bits = NULL };
int meta_fd = -1;
u64 dev_size;
int flags;
int ret;
flags = args->is_read ? O_RDONLY : O_RDWR;
meta_fd = open(args->meta_device, flags);
if (meta_fd < 0) {
ret = -errno;
errf("failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
if (args->is_read) {
ret = flush_device(meta_fd);
if (ret < 0)
goto out;
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
if (ret < 0)
goto out;
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
if (!bm.bits) {
ret = -ENOMEM;
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
check_supers() ?:
get_ref_bits(&bm) ?:
read_image(args, meta_fd, &bm);
block_shutdown();
} else {
ret = write_image(args, meta_fd, &bm);
}
out:
free(bm.bits);
if (meta_fd >= 0)
close(meta_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct image_args *args = state->input;
int ret;
switch (key) {
case 'h':
args->show_header = true;
break;
case 'r':
ret = parse_u64(arg, &args->ra_window);
if (ret)
argp_error(state, "readahead winddoe parse error");
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
{ NULL }
};
static struct argp read_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Read metadata image stream from metadata device file"
};
#define DEFAULT_RA_WINDOW (512 * 1024)
static int read_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = true,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static struct argp write_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Write metadata image stream to metadata device file"
};
static int write_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = false,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static void __attribute__((constructor)) image_ctor(void)
{
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
}
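Taken together these give a pair of streaming commands: "scoutfs read-metadata-image META-DEVICE > img" captures the referenced metadata blocks to stdout, and "scoutfs write-metadata-image META-DEVICE < img" restores them from stdin (hypothetical invocations, assuming the scoutfs utility binary); --show-header prints the image header and exits without processing the stream.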

utils/src/check/iter.h Normal file

@@ -0,0 +1,15 @@
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
#define _SCOUTFS_UTILS_CHECK_ITER_H_
/*
* Callbacks can return a weird -errno that we'd never otherwise use to
* indicate that iteration should stop early; xlate_iter_errno() maps it
* back to 0 so the caller sees success.
*/
#define ECHECK_ITER_DONE EL2HLT
static inline int xlate_iter_errno(int ret)
{
return ret == -ECHECK_ITER_DONE ? 0 : ret;
}
#endif
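A sketch of a callback that stops iteration early under this convention (hypothetical callback, using the extent_cb_t signature from extent.h):

	/* hypothetical: stop once iteration reaches a limit blkno */
	static int stop_at_limit(u64 start, u64 len, void *arg)
	{
		u64 *limit = arg;

		if (start >= *limit)
			return -ECHECK_ITER_DONE;
		return 0;
	}

The iterating caller then returns xlate_iter_errno(ret) so early termination is reported as success.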

utils/src/check/log_trees.c Normal file

@@ -0,0 +1,98 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "log_trees.h"
#include "super.h"
struct iter_args {
extent_cb_t cb;
void *cb_arg;
};
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct iter_args *ia = cb_arg;
struct scoutfs_log_trees *lt;
int ret;
if (val_len != sizeof(struct scoutfs_log_trees))
; /* XXX */
lt = val;
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
sns_push("meta_avail", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_freed", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("item_root", 0, 0);
ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
if (lt->bloom_ref.blkno) {
sns_push("bloom_ref", 0, 0);
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
sns_push("data_avail", 0, 0);
ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_freed", 0, 0);
ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
ret = 0;
out:
sns_pop();
return ret;
}
/*
* Call the caller's callback with the extents of all the metadata block
* references contained in the log btrees. We walk the logs_root btree
* items and then walk all the metadata structures they reference.
*/
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
{
struct scoutfs_super_block *super = global_super;
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
}

utils/src/check/log_trees.h Normal file

@@ -0,0 +1,8 @@
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#include "extent.h"
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
#endif

utils/src/check/meta.c Normal file

@@ -0,0 +1,367 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "sns.h"
#include "log_trees.h"
#include "meta.h"
#include "problem.h"
#include "super.h"
static struct meta_data {
struct extent_root meta_refed;
struct extent_root meta_free;
struct {
u64 ref_blocks;
u64 free_extents;
u64 free_blocks;
} stats;
} global_mdat;
bool valid_meta_blkno(u64 blkno)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
}
static bool valid_meta_extent(u64 start, u64 len)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
bool valid;
valid = len > 0 &&
start >= SCOUTFS_META_DEV_START_BLKNO &&
start < tot &&
len <= tot &&
((start + len) <= tot) &&
((start + len) > start);
debug("start %llu len %llu valid %u", start, len, !!valid);
if (!valid)
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
return valid;
}
/*
* Track references to individual metadata blocks. This uses the extent
* callback type but is only ever called for single block references.
* Any reference to a block that has already been referenced is
* considered invalid and is ignored. Later repair will resolve
* duplicate references.
*/
static int insert_meta_ref(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
/* this is tracking single metadata block references */
if (len != 1) {
ret = -EINVAL;
goto out;
}
if (valid_meta_blkno(start)) {
ret = extent_insert_new(root, start, len);
if (ret == 0)
mdat->stats.ref_blocks++;
else if (ret == -EEXIST)
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
}
out:
return ret;
}
static int insert_meta_free(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
if (valid_meta_extent(start, len)) {
ret = extent_insert_new(root, start, len);
if (ret == 0) {
mdat->stats.free_extents++;
mdat->stats.free_blocks += len;
} else if (ret == -EEXIST) {
problem(PB_META_FREE_OVERLAPS_EXISTING,
"start %llu llen %llu", start, len);
}
}
return ret;
}
/*
* Walk all metadata references in the system. This walk doesn't need
* to read metadata that doesn't contain any metadata references so it
* can skip the bulk of metadata blocks. This gives us the set of
* referenced metadata blocks which we can then use to repair metadata
* allocator structures.
*/
static int get_meta_refs(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_refed);
/* XXX record reserved blocks around super as referenced */
sns_push("meta_alloc", 0, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("fs_root", 0, 0);
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("logs_root", 0, 0);
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("log_merge", 0, 0);
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("mounted_clients", 0, 0);
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("srch_root", 0, 0);
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
if (ret < 0)
goto out;
printf("found %llu referenced metadata blocks\n", mdat->stats.ref_blocks);
ret = 0;
out:
return ret;
}
static int get_meta_free(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_free);
sns_push("meta_alloc", 0, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
printf("found %llu free metadata blocks in %llu extents\n",
mdat->stats.free_blocks, mdat->stats.free_extents);
ret = 0;
out:
return ret;
}
/*
* All the space between referenced blocks must be recorded in the free
* extents. The free extent walk didn't check whether its extents
* overlapped with references, so we do that here. Remember that the
* single-block metadata references were merged into extents as they
* were inserted, so the refed extents aren't necessarily one block long.
*/
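/*
* For example, if blkno 11 is reachable from a btree ref and is also
* covered by a free extent [11,15), the overlapping block is reported
* and trimmed from the free set; a region covered by neither set, say
* a gap before the next extent, is reported as a missing free extent.
*/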
static int compare_refs_and_free(void)
{
struct meta_data *mdat = &global_mdat;
struct extent_node *ref;
struct extent_node *free;
struct extent_node *next;
struct extent_node *prev;
u64 expect;
u64 start;
u64 end;
expect = 0;
ref = extent_first(&mdat->meta_refed);
free = extent_first(&mdat->meta_free);
while (ref || free) {
printf("exp %llu ref %llu.%llu free %llu.%llu\n",
expect, ref ? ref->start : 0, ref ? ref->len : 0,
free ? free->start : 0, free ? free->len : 0);
/* referenced marked free, remove ref from free and continue from same point */
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
printf("ref extent %llu.%llu overlaps free %llu %llu\n",
ref->start, ref->len, free->start, free->len);
start = max(ref->start, free->start);
end = min(ref->start + ref->len, free->start + free->len);
prev = extent_prev(free);
extent_remove(&mdat->meta_free, start, end - start);
if (prev)
free = extent_next(prev);
else
free = extent_first(&mdat->meta_free);
continue;
}
/* see which extent starts earlier */
if (!free || (ref && ref->start <= free->start))
next = ref;
else
next = free;
/* untracked region before next extent */
if (expect < next->start) {
printf("missing free extent %llu.%llu\n", expect, next->start - expect);
expect = next->start;
continue;
}
/* didn't overlap, advance past next extent */
expect = next->start + next->len;
if (next == ref)
ref = extent_next(ref);
else
free = extent_next(free);
}
return 0;
}
/*
* Check the metadata allocators by comparing the set of referenced
* blocks with the set of free blocks that are stored in free btree
* items and alloc list blocks.
*/
int check_meta_alloc(void)
{
int ret;
ret = get_meta_refs();
if (ret < 0)
goto out;
ret = get_meta_free();
if (ret < 0)
goto out;
ret = compare_refs_and_free();
if (ret < 0)
goto out;
ret = 0;
out:
return ret;
}

utils/src/check/meta.h Normal file

@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
#define _SCOUTFS_UTILS_CHECK_META_H_
bool valid_meta_blkno(u64 blkno);
int check_meta_alloc(void);
#endif

utils/src/check/padding.c Normal file

@@ -0,0 +1,23 @@
#include <string.h>
#include <stdbool.h>
#include "util.h"
#include "padding.h"
bool padding_is_zeros(const void *data, size_t sz)
{
static char zeros[32] = {0,};
const size_t batch = array_size(zeros);
while (sz >= batch) {
if (memcmp(data, zeros, batch))
return false;
data += batch;
sz -= batch;
}
if (sz > 0 && memcmp(data, zeros, sz))
return false;
return true;
}
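A typical use is verifying that a structure's reserved bytes are still zero (hypothetical __pad field):

	if (!padding_is_zeros(hdr->__pad, sizeof(hdr->__pad)))
		return -EINVAL;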

utils/src/check/padding.h Normal file

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
bool padding_is_zeros(const void *data, size_t sz);
#endif

utils/src/check/problem.c Normal file

@@ -0,0 +1,23 @@
#include <stdio.h>
#include <stdint.h>
#include "problem.h"
#if 0
#define PROB_STR(pb) [pb] = #pb
static char *prob_strs[] = {
PROB_STR(PB_META_EXTENT_INVALID),
PROB_STR(PB_META_REF_OVERLAPS_EXISTING),
PROB_STR(PB_META_FREE_OVERLAPS_EXISTING),
};
#endif
static struct problem_data {
uint64_t counts[PB__NR];
} global_pdat;
void problem_record(prob_t pb)
{
struct problem_data *pdat = &global_pdat;
pdat->counts[pb]++;
}

utils/src/check/problem.h Normal file

@@ -0,0 +1,23 @@
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#include "debug.h"
#include "sns.h"
typedef enum {
PB_META_EXTENT_INVALID,
PB_META_REF_OVERLAPS_EXISTING,
PB_META_FREE_OVERLAPS_EXISTING,
PB_BTREE_BLOCK_BAD_LEVEL,
PB__NR,
} prob_t;
#define problem(pb, fmt, ...) \
do { \
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
problem_record(pb); \
} while (0)
void problem_record(prob_t pb);
#endif

utils/src/check/sns.c Normal file

@@ -0,0 +1,118 @@
#include <stdlib.h>
#include <string.h>
#include "sns.h"
/*
* This "str num stack" is used to describe our location in metadata at
* any given time.
*
* As we descend into structures we push a string describing them,
* perhaps with associated numbers, and pop it on the way back out.
* Pushing and popping is very cheap and only rarely do we format the
* stack into a string, as an arbitrary example:
* "super.fs_root.btree_parent:1231.btree_leaf:3231"
*/
#define SNS_MAX_DEPTH 1000
/* each entry: '.' separator, string, and up to two ":<hex u64>" numbers */
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 2 * (1 + 16) + 1))
static struct sns_data {
unsigned int depth;
struct sns_entry {
char *str;
size_t len;
u64 a;
u64 b;
} ents[SNS_MAX_DEPTH];
char str[SNS_STR_SIZE];
} global_lsdat;
void _sns_push(char *str, size_t len, u64 a, u64 b)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth < SNS_MAX_DEPTH) {
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
.str = str,
.len = len,
.a = a,
.b = b,
};
}
}
void sns_pop(void)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth > 0)
lsdat->depth--;
}
static char *append_str(char *pos, char *str, size_t len)
{
memcpy(pos, str, len);
return pos + len;
}
/*
* This is not called for x = 0 so we don't need to emit an initial 0.
* We could handle that by using do {} while instead of while {}.
* Digits are gathered least-significant first and then emitted in the
* usual most-significant-first order.
*/
static char *append_u64x(char *pos, u64 x)
{
static char hex[] = "0123456789abcdef";
char digits[16];
int nr = 0;
while (x) {
digits[nr++] = hex[x & 0xf];
x >>= 4;
}
while (nr > 0)
*pos++ = digits[--nr];
return pos;
}
static char *append_char(char *pos, char c)
{
*(pos++) = c;
return pos;
}
/*
* Return a pointer to a null terminated string that describes the
* current location stack. The string buffer is global.
*/
char *sns_str(void)
{
struct sns_data *lsdat = &global_lsdat;
struct sns_entry *ent;
char *pos;
int i;
pos = lsdat->str;
for (i = 0; i < lsdat->depth; i++) {
ent = &lsdat->ents[i];
if (i)
pos = append_char(pos, '.');
pos = append_str(pos, ent->str, ent->len);
if (ent->a) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->a);
}
if (ent->b) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->b);
}
}
*pos = '\0';
return lsdat->str;
}
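A sketch of how pushes map to the formatted string (hypothetical sequence; numbers render as hex with no 0x prefix, and zero values are omitted):

	sns_push("super", 0, 0);
	sns_push("fs_root", 0, 0);
	sns_push("btree_parent", 0x1231, 0);
	printf("%s\n", sns_str());	/* prints "super.fs_root.btree_parent:1231" */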

utils/src/check/sns.h Normal file

@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
#define _SCOUTFS_UTILS_CHECK_SNS_H_
#include <assert.h>
#include "sparse.h"
#define SNS_MAX_STR_LEN 20
#define sns_push(str, a, b) \
do { \
build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \
_sns_push((str), sizeof(str) - 1, a, b); \
} while (0)
void _sns_push(char *str, size_t len, u64 a, u64 b);
void sns_pop(void);
char *sns_str(void);
#endif

utils/src/check/super.c Normal file

@@ -0,0 +1,57 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "block.h"
#include "super.h"
/*
* After we check the super blocks we provide a global buffer to track
* the current super block. It is referenced to get static information
* about the system and is also modified and written as part of
* transactions.
*/
struct scoutfs_super_block *global_super;
/*
* Read the super block and save a copy of it in global_super for the
* other modules to use.
*/
int check_supers(void)
{
struct scoutfs_super_block *super = NULL;
struct block *blk = NULL;
int ret;
global_super = malloc(sizeof(struct scoutfs_super_block));
if (!global_super) {
printf("error allocating super block buffer\n");
ret = -ENOMEM;
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
if (ret < 0) {
printf("error reading super block\n");
goto out;
}
super = block_buf(blk);
memcpy(global_super, super, sizeof(struct scoutfs_super_block));
ret = 0;
out:
block_put(&blk);
return ret;
}
void super_shutdown(void)
{
free(global_super);
}

utils/src/check/super.h Normal file

@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
#define _SCOUTFS_UTILS_CHECK_SUPER_H_
extern struct scoutfs_super_block *global_super;
int check_supers(void);
void super_shutdown(void);
#endif


@@ -141,13 +141,4 @@ static inline void scoutfs_key_dec(struct scoutfs_key *key)
key->sk_zone--;
}
static inline void scoutfs_xattr_get_indx_key(struct scoutfs_key *key, u8 *major, u64 *minor,
u64 *ino, u64 *xid)
{
*major = le64_to_cpu(key->_sk_first) >> 56;
*minor = (le64_to_cpu(key->_sk_first) << 8) | (le64_to_cpu(key->_sk_second) >> 56);
*ino = (le64_to_cpu(key->_sk_second) << 8) | (le64_to_cpu(key->_sk_third) >> 56);
*xid = (le64_to_cpu(key->_sk_third) << 8) | key->_sk_fourth;
}
#endif


@@ -156,6 +156,16 @@ static inline void list_move_tail(struct list_head *list,
list_add_tail(list, head);
}
/**
* list_is_head - tests whether @list is the list @head
* @list: the entry to test
* @head: the head of the list
*/
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
return list == head;
}
/**
* list_empty - tests whether a list is empty
* @head: the list to test.
@@ -242,6 +252,15 @@ static inline void list_splice_init(struct list_head *list,
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
/**
* list_entry_is_head - test if the entry points to the head of the list
* @pos: the type * to cursor
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_entry_is_head(pos, head, member) \
(&pos->member == (head))
/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop counter.
@@ -307,4 +326,28 @@ static inline void list_splice_init(struct list_head *list,
#define list_next_entry(pos, member) \
list_entry((pos)->member.next, typeof(*(pos)), member)
/**
* list_prev_entry - get the prev element in list
* @pos: the type * to cursor
* @member: the name of the list_head within the struct.
*/
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate backwards over list of given type, safe against removal
* of list entry.
*/
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_prev_entry(n, member))
#endif
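A minimal sketch of the reverse-safe iterator imported above (hypothetical item struct; assumes list_del() from the same imported list.h and stdlib's free()):

	struct item {
		struct list_head head;
	};

	static void free_all_reverse(struct list_head *items)
	{
		struct item *it;
		struct item *tmp;

		list_for_each_entry_safe_reverse(it, tmp, items, head) {
			list_del(&it->head);
			free(it);
		}
	}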

utils/src/lk_rbtree_wrapper.h Normal file

@@ -0,0 +1,24 @@
#ifndef _LK_RBTREE_WRAPPER_H_
#define _LK_RBTREE_WRAPPER_H_
/*
* We're using this lame hack to build and use the kernel's rbtree in
* userspace. We drop the kernel's rbtree*[ch] implementation in and
* use them with this wrapper. We only have to remove the kernel
* includes from the imported files.
*/
#include <stdbool.h>
#include "util.h"
#define rcu_assign_pointer(a, b) do { a = b; } while (0)
#define READ_ONCE(a) ({ a; })
#define WRITE_ONCE(a, b) do { a = b; } while (0)
#define unlikely(a) ({ a; })
#define EXPORT_SYMBOL(a) /* nop */
#include "rbtree_types.h"
#include "rbtree.h"
#include "rbtree_augmented.h"
#endif
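extent.c above is the first userspace consumer of this wrapper: it links nodes with rb_link_node() and rb_insert_color(), erases them with rb_erase(), and iterates with rb_first()/rb_next()/rb_prev().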


@@ -274,9 +274,7 @@ static int do_mkfs(struct mkfs_args *args)
inode.ctime.nsec = inode.atime.nsec;
inode.mtime.sec = inode.atime.sec;
inode.mtime.nsec = inode.atime.nsec;
inode.crtime.sec = inode.atime.sec;
inode.crtime.nsec = inode.atime.nsec;
btree_append_item(bt, &key, &inode, scoutfs_inode_vers_bytes(args->fmt_vers));
btree_append_item(bt, &key, &inode, sizeof(inode));
ret = write_block(meta_fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, 1, blkno,
SCOUTFS_BLOCK_LG_SHIFT, &bt->hdr);


@@ -49,7 +49,7 @@ static void print_inode(struct scoutfs_key *key, void *val, int val_len)
{
struct scoutfs_inode *inode = val;
printf(" inode: ino %llu size %llu version %llu proj %llu nlink %u\n"
printf(" inode: ino %llu size %llu version %llu nlink %u\n"
" uid %u gid %u mode 0%o rdev 0x%x flags 0x%x\n"
" next_readdir_pos %llu meta_seq %llu data_seq %llu data_version %llu\n"
" atime %llu.%08u ctime %llu.%08u\n"
@@ -57,7 +57,6 @@ static void print_inode(struct scoutfs_key *key, void *val, int val_len)
le64_to_cpu(key->ski_ino),
le64_to_cpu(inode->size),
le64_to_cpu(inode->version),
le64_to_cpu(inode->proj),
le32_to_cpu(inode->nlink), le32_to_cpu(inode->uid),
le32_to_cpu(inode->gid), le32_to_cpu(inode->mode),
le32_to_cpu(inode->rdev),
@@ -80,24 +79,6 @@ static void print_orphan(struct scoutfs_key *key, void *val, int val_len)
}
#define SQR_FMT "[%u %llu,%u,%x %llu,%u,%x %llu,%u,%x %u %llu %x]"
#define SQR_ARGS(r) \
(r)->prio, \
le64_to_cpu((r)->name_val[0]), (r)->name_source[0], (r)->name_flags[0], \
le64_to_cpu((r)->name_val[1]), (r)->name_source[1], (r)->name_flags[1], \
le64_to_cpu((r)->name_val[2]), (r)->name_source[2], (r)->name_flags[2], \
(r)->op, le64_to_cpu((r)->limit), (r)->rule_flags
static void print_quota(struct scoutfs_key *key, void *val, int val_len)
{
struct scoutfs_quota_rule_val *rv = val;
printf(" quota rule: hash 0x%016llx coll_nr %llu\n"
" "SQR_FMT"\n",
le64_to_cpu(key->skqr_hash), le64_to_cpu(key->skqr_coll_nr), SQR_ARGS(rv));
}
static void print_xattr_totl(struct scoutfs_key *key, void *val, int val_len)
{
struct scoutfs_xattr_totl_val *tval = val;
@@ -108,17 +89,6 @@ static void print_xattr_totl(struct scoutfs_key *key, void *val, int val_len)
le64_to_cpu(tval->count));
}
static void print_xattr_indx(struct scoutfs_key *key, void *val, int val_len)
{
u64 minor;
u64 ino;
u64 xid;
u8 major;
scoutfs_xattr_get_indx_key(key, &major, &minor, &ino, &xid);
printf(" xattr indx: major %u minor %llu ino %llu xid %llu", major, minor, ino, xid);
}
static u8 *global_printable_name(u8 *name, int name_len)
{
static u8 name_buf[SCOUTFS_NAME_LEN + 1];
@@ -207,15 +177,9 @@ static print_func_t find_printer(u8 zone, u8 type)
return print_orphan;
}
if (zone == SCOUTFS_QUOTA_ZONE)
return print_quota;
if (zone == SCOUTFS_XATTR_TOTL_ZONE)
return print_xattr_totl;
if (zone == SCOUTFS_XATTR_INDX_ZONE)
return print_xattr_indx;
if (zone == SCOUTFS_FS_ZONE) {
switch(type) {
case SCOUTFS_INODE_TYPE: return print_inode;
@@ -645,6 +609,8 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
u64 blkno;
u64 start;
u64 len;
u64 st;
u64 nr;
int wid;
int ret;
int i;
@@ -663,27 +629,37 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
le32_to_cpu(lblk->nr));
if (lblk->nr) {
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[i]);
len++;
if (i == (le32_to_cpu(lblk->nr) - 1) ||
start + len != le64_to_cpu(lblk->blknos[i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
st = le32_to_cpu(lblk->start);
nr = le32_to_cpu(lblk->nr);
if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
(st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
printf(" (invalid start and nr fields)\n");
goto out;
}
if (lblk->nr == 0)
goto out;
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < nr; i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[st + i]);
len++;
if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
out:
next = lblk->next;
free(lblk);
return print_alloc_list_block(fd, str, &next);


@@ -1,547 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "util.h"
#include "key.h"
static char opc[] = {
[SQ_OP_DATA] = 'D',
[SQ_OP_INODE] = 'I',
};
static char nsc[] = {
[SQ_NS_LITERAL] = 'L',
[SQ_NS_PROJ] = 'P',
[SQ_NS_UID] = 'U',
[SQ_NS_GID] = 'G',
};
static void printf_rule(struct scoutfs_ioctl_quota_rule *irule)
{
int i;
/* priority: [0-9]+ */
printf("%3u ", irule->prio);
/* totl name: ([0-9]+,[LPUG-]+,[S-]+){3} */
for (i = 0; i < array_size(irule->name_val); i++) {
printf("%llu,%c,%c ",
irule->name_val[i],
nsc[irule->name_source[i]],
(irule->name_flags[i] & SQ_NF_SELECT) ? 'S' : '-');
}
/* op: [ID], limit: [0-9]+, flags [C-] */
printf("%c %llu %c\n",
opc[irule->op], irule->limit, (irule->rule_flags & SQ_RF_TOTL_COUNT) ? 'C' : '-');
}
static int parse_rule(struct scoutfs_ioctl_quota_rule *irule, char *str)
{
char ns[3];
char nf[3];
char rf;
char op;
int ret;
int i;
int j;
memset(irule, 0, sizeof(struct scoutfs_ioctl_quota_rule));
ret = sscanf(str, " %hhu %llu,%c,%c %llu,%c,%c %llu,%c,%c %c %llu %c",
&irule->prio, &irule->name_val[0], &ns[0], &nf[0], &irule->name_val[1],
&ns[1], &nf[1], &irule->name_val[2], &ns[2], &nf[2], &op, &irule->limit,
&rf);
if (ret != 13) {
printf("invalid rule, missing fields: %s\n", str);
ret = -EINVAL;
goto out;
}
for (i = 0; i < array_size(irule->name_val); i++) {
irule->name_source[i] = SQ_NS__NR;
for (j = 0; j < array_size(nsc); j++) {
if (ns[i] == nsc[j]) {
irule->name_source[i] = j;
break;
}
}
if (irule->name_source[i] == SQ_NS__NR) {
printf("invalid name source '%c' in name #%u in rule:\n\t%s\n",
ns[i], i + 1, str);
ret = -EINVAL;
goto out;
}
irule->name_flags[i] = nf[i] == '-' ? 0 :
nf[i] == 'S' ? SQ_NF_SELECT :
SQ_NF__UNKNOWN;
if (irule->name_flags[i] == SQ_NF__UNKNOWN) {
printf("invalid name flags '%c' in name #%u in rule:\n\t%s\n",
nf[i], i + 1, str);
ret = -EINVAL;
goto out;
}
}
irule->op = SQ_NS__NR;
for (i = 0; i < array_size(opc); i++) {
if (op == opc[i]) {
irule->op = i;
break;
}
}
if (irule->op == SQ_NS__NR) {
printf("invalid op '%c' in rule:\n\t%s\n", op, str);
ret = -EINVAL;
goto out;
}
irule->rule_flags = rf == '-' ? 0 : rf == 'C' ? SQ_RF_TOTL_COUNT : SQ_RF__UNKNOWN;
if (irule->rule_flags == SQ_RF__UNKNOWN) {
printf("invalid rule flags '%c' in rule:\n\t%s\n", rf, str);
ret = -EINVAL;
goto out;
}
ret = 0;
out:
return ret;
}
/* ---------------------------------------------- */
struct mod_args {
char *path;
char *rule_str;
bool is_add;
};
static int do_mod(struct mod_args *args)
{
struct scoutfs_ioctl_quota_rule irule;
unsigned int cmd;
int fd = -1;
int ret;
memset(&irule, 0, sizeof(irule));
ret = parse_rule(&irule, args->rule_str);
if (ret < 0)
goto out;
fd = get_path(args->path, O_RDONLY);
if (fd < 0)
return fd;
cmd = args->is_add ? SCOUTFS_IOC_ADD_QUOTA_RULE : SCOUTFS_IOC_DEL_QUOTA_RULE;
ret = ioctl(fd, cmd, &irule);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "MOD_QUOTA_RULE ioctl failed: %s (%d)\n",
strerror(errno), errno);
goto out;
}
ret = 0;
out:
if (fd >= 0)
close(fd);
return ret;
}
static int parse_mod_opt(int key, char *arg, struct argp_state *state)
{
struct mod_args *args = state->input;
switch (key) {
case 'p':
args->path = strdup_or_error(state, arg);
break;
case 'r':
args->rule_str = strdup_or_error(state, arg);
break;
case ARGP_KEY_FINI:
if (!args->path)
argp_error(state, "must provide file path");
if (!args->rule_str)
argp_error(state, "must provide rule string");
break;
default:
break;
}
return 0;
}
static struct argp_option add_options[] = {
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
{ "rule", 'r', "RULE_STRING", 0, "Rule string"},
{ NULL }
};
static struct argp add_argp = {
add_options,
parse_mod_opt,
"",
"Add quota rule"
};
static int add_cmd(int argc, char **argv)
{
struct mod_args args = { .is_add = true, };
int ret;
ret = argp_parse(&add_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_mod(&args);
}
static void __attribute__((constructor)) add_ctor(void)
{
cmd_register_argp("quota-add", &add_argp, GROUP_CORE, add_cmd);
}
static struct argp_option del_options[] = {
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
{ "rule", 'r', "RULE_STRING", 0, "Rule string"},
{ NULL }
};
static struct argp del_argp = {
del_options,
parse_mod_opt,
"",
"Delete quota rule"
};
static int del_cmd(int argc, char **argv)
{
struct mod_args args = { .is_add = false };
int ret;
ret = argp_parse(&del_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_mod(&args);
}
static void __attribute__((constructor)) del_ctor(void)
{
cmd_register_argp("quota-del", &del_argp, GROUP_CORE, del_cmd);
}
/* ---------------------------------------------- */
struct bulk_args {
char *path;
bool unsorted;
};
typedef int (*bulk_in_fn)(int fd, struct scoutfs_ioctl_quota_rule *irules, size_t nr,
void *in_args);
typedef int (*bulk_out_fn)(int fd, struct scoutfs_ioctl_quota_rule *irule, void *out_args);
static int cmp_irules(const struct scoutfs_ioctl_quota_rule *a,
const struct scoutfs_ioctl_quota_rule *b)
{
return scoutfs_cmp(a->prio, b->prio) ?:
scoutfs_cmp(a->name_val[0], b->name_val[0]) ?:
scoutfs_cmp(a->name_source[0], b->name_source[0]) ?:
scoutfs_cmp(a->name_flags[0], b->name_flags[0]) ?:
scoutfs_cmp(a->name_val[1], b->name_val[1]) ?:
scoutfs_cmp(a->name_source[1], b->name_source[1]) ?:
scoutfs_cmp(a->name_flags[1], b->name_flags[1]) ?:
scoutfs_cmp(a->name_val[2], b->name_val[2]) ?:
scoutfs_cmp(a->name_source[2], b->name_source[2]) ?:
scoutfs_cmp(a->name_flags[2], b->name_flags[2]) ?:
scoutfs_cmp(a->op, b->op) ?:
scoutfs_cmp(a->limit, b->limit) ?:
scoutfs_cmp(a->rule_flags, b->rule_flags);
}
static int compar_irules(const void *a, const void *b)
{
return -cmp_irules(a, b);
}
static int do_bulk(struct bulk_args *args, bulk_in_fn in_fn, void *in_args,
bulk_out_fn out_fn, void *out_args)
{
struct scoutfs_ioctl_quota_rule *irules = NULL;
size_t alloced = 0;
size_t nr = 0;
size_t batch;
size_t i;
int fd = -1;
int ret;
fd = get_path(args->path, O_RDONLY);
if (fd < 0)
return fd;
for (;;) {
if (nr == alloced) {
alloced += 1024;
irules = realloc(irules, alloced * sizeof(irules[0]));
if (!irules) {
ret = -errno;
fprintf(stderr, "memory allocation failed: %s (%d)\n",
strerror(errno), errno);
goto out;
}
}
ret = in_fn(fd, &irules[nr], alloced - nr, in_args);
if (ret == 0)
break;
if (ret < 0)
goto out;
batch = ret;
if (args->unsorted) {
for (i = 0; i < batch; i++) {
ret = out_fn(fd, &irules[nr + i], out_args);
if (ret < 0)
goto out;
}
} else {
nr += batch;
}
}
if (!args->unsorted) {
qsort(irules, nr, sizeof(irules[0]), compar_irules);
for (i = 0; i < nr; i++) {
ret = out_fn(fd, &irules[i], out_args);
if (ret < 0)
goto out;
}
}
ret = 0;
out:
if (fd >= 0)
close(fd);
if (irules)
free(irules);
return ret;
}
/* ---------------------------------------------- */
/* maintain iterator in gqr between calls */
static int get_ioctl_in_fn(int fd, struct scoutfs_ioctl_quota_rule *irules, size_t nr,
void *in_args)
{
struct scoutfs_ioctl_get_quota_rules *gqr = in_args;
int ret;
gqr->rules_ptr = (intptr_t)irules;
gqr->rules_nr = nr;
ret = ioctl(fd, SCOUTFS_IOC_GET_QUOTA_RULES, gqr);
if (ret < 0) {
ret = -errno;
fprintf(stderr, "GET_QUOTA_RULES ioctl failed: %s (%d)\n",
strerror(errno), errno);
}
return ret;
}
static int parse_stdin_in_fn(int fd, struct scoutfs_ioctl_quota_rule *irules, size_t nr,
void *in_args)
{
char *line = NULL;
size_t size;
int ret;
ret = getline(&line, &size, stdin);
if (ret < 0) {
if (errno == ENOENT)
return 0;
ret = -errno;
fprintf(stderr, "error reading rules: %s (%d)\n",
strerror(errno), errno);
return ret;
}
ret = parse_rule(&irules[0], line);
if (ret == 0)
ret = 1;
free(line);
return ret;
}
struct mod_ioctl_args {
unsigned int cmd;
char *which;
};
static int mod_ioctl_out_fn(int fd, struct scoutfs_ioctl_quota_rule *irule, void *out_args)
{
struct mod_ioctl_args *args = out_args;
int ret;
ret = ioctl(fd, args->cmd, irule);
if (ret < 0) {
ret = -errno;
printf("Failed to %s following rule:\n ", args->which);
printf_rule(irule);
fprintf(stderr, "Error: %s (%d)\n", strerror(-ret), -ret);
}
return ret;
}
static int print_out_fn(int fd, struct scoutfs_ioctl_quota_rule *irule, void *out_args)
{
printf_rule(irule);
return 0;
}
/* ---------------------------------------------- */
static int parse_bulk_opt(int key, char *arg, struct argp_state *state)
{
struct bulk_args *args = state->input;
switch (key) {
case 'p':
args->path = strdup_or_error(state, arg);
break;
case 'U':
args->unsorted = true;
break;
case ARGP_KEY_FINI:
if (!args->path)
argp_error(state, "must provide file path");
break;
default:
break;
}
return 0;
}
static struct argp_option bulk_options[] = {
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
{ "unsorted", 'U', NULL, 0, "Process rules in unsorted filesystem storage order"},
{ NULL }
};
static struct argp list_argp = {
bulk_options,
parse_bulk_opt,
"",
"List quota rules"
};
static int list_cmd(int argc, char **argv)
{
struct scoutfs_ioctl_get_quota_rules gqr = {{0,}};
struct bulk_args args = {NULL};
int ret;
ret = argp_parse(&list_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_bulk(&args, get_ioctl_in_fn, &gqr, print_out_fn, NULL);
}
static void __attribute__((constructor)) list_ctor(void)
{
cmd_register_argp("quota-list", &list_argp, GROUP_CORE, list_cmd);
}
/* ---------------------------------------------- */
static struct argp wipe_argp = {
bulk_options,
parse_bulk_opt,
"",
"Delete all quota rules"
};
static int wipe_cmd(int argc, char **argv)
{
struct bulk_args args = {NULL};
struct scoutfs_ioctl_get_quota_rules gqr = {{0,}};
struct mod_ioctl_args out_args = {
.cmd = SCOUTFS_IOC_DEL_QUOTA_RULE,
.which = "delete",
};
int ret;
ret = argp_parse(&wipe_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_bulk(&args, get_ioctl_in_fn, &gqr, mod_ioctl_out_fn, &out_args);
}
static void __attribute__((constructor)) wipe_ctor(void)
{
cmd_register_argp("quota-wipe", &wipe_argp, GROUP_CORE, wipe_cmd);
}
/* ---------------------------------------------- */
static struct argp restore_argp = {
bulk_options,
parse_bulk_opt,
"",
"Restore quota rules from list output on stdin"
};
static int restore_cmd(int argc, char **argv)
{
struct bulk_args args = {NULL};
struct mod_ioctl_args out_args = {
.cmd = SCOUTFS_IOC_ADD_QUOTA_RULE,
.which = "add",
};
int ret;
ret = argp_parse(&restore_argp, argc, argv, 0, NULL, &args);
if (ret)
return ret;
return do_bulk(&args, parse_stdin_in_fn, NULL, mod_ioctl_out_fn, &out_args);
}
static void __attribute__((constructor)) restore_ctor(void)
{
cmd_register_argp("quota-restore", &restore_argp, GROUP_CORE, restore_cmd);
}

utils/src/rbtree.c Normal file

@@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/lib/rbtree.c
*/
#include "lk_rbtree_wrapper.h"
/*
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
static inline void rb_set_black(struct rb_node *rb)
{
rb->__rb_parent_color |= RB_BLACK;
}
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
* Helper function for rotations:
* - old's parent and color get assigned to new
* - old gets assigned new as a parent and 'color' as a color.
*/
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
struct rb_root *root, int color)
{
struct rb_node *parent = rb_parent(old);
new->__rb_parent_color = old->__rb_parent_color;
rb_set_parent_color(old, new, color);
__rb_change_child(old, new, parent, root);
}
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the
* first node, or we recursed at Case 1 below and
* are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done.
* Otherwise, take some corrective action as,
* per 4), we don't want a red root or two
* consecutive red nodes.
*/
if(rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and
* 4) does not allow this, we need to recurse
* at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
WRITE_ONCE(parent->rb_right, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
WRITE_ONCE(parent->rb_left, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
}
}
}
/*
* Inline version for rb_erase() use - we want to be able to inline
* and eliminate the dummy_rotate callback there
*/
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
* Loop invariants:
* - node is black (or NULL on first iteration)
* - node is not the root (parent is not NULL)
* - All leaf paths going through parent and node have a
* black node count that is 1 lower than other leaf paths.
*/
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
* Case 1 - left rotate at parent
*
* P S
* / \ / \
* N s --> p Sr
* / \ / \
* Sl Sr N Sl
*/
tmp1 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp1);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
* Case 2 - sibling color flip
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N s
* / \ / \
* Sl Sr Sl Sr
*
* This leaves us violating 5) which
* can be fixed by flipping p to black
* if it was red, or by recursing at p.
* p is red when coming from Case 1.
*/
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/*
* Case 3 - right rotate at sibling
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N sl
* / \ \
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then both
* p and sl are red after rotation (which
* breaks property 4). This is fixed in
* Case 4 (in __rb_rotate_set_parents(),
* which gives sl the color of p
* and sets p RB_BLACK)
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
WRITE_ONCE(tmp2->rb_right, sibling);
WRITE_ONCE(parent->rb_right, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
* Case 4 - left rotate at parent + color flips
* (p and sl could be either color here.
* After rotation, p becomes black, s acquires
* p's color, and sl keeps its color)
*
* (p) (s)
* / \ / \
* N S --> P Sr
* / \ / \
* (sl) sr N (sl)
*/
tmp2 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp2);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
} else {
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
tmp1 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp1);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
WRITE_ONCE(parent->rb_left, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
}
}
}
/* Non-inline version for rb_erase_augmented() use */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);
/*
* Non-augmented rbtree manipulation functions.
*
* We use dummy augmented callbacks here, and have the compiler optimize them
* out of the rb_insert_color() and rb_erase() function definitions.
*/
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
static const struct rb_augment_callbacks dummy_callbacks = {
.propagate = dummy_propagate,
.copy = dummy_copy,
.rotate = dummy_rotate
};
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
/*
* Augmented rbtree manipulation functions.
*
* This instantiates the same __always_inline functions as in the non-augmented
* case, but this time with user-defined callbacks.
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
/*
* This function returns the first node (in sort order) of the tree.
*/
struct rb_node *rb_first(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_left)
n = n->rb_left;
return n;
}
EXPORT_SYMBOL(rb_first);
struct rb_node *rb_last(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_right)
n = n->rb_right;
return n;
}
EXPORT_SYMBOL(rb_last);
struct rb_node *rb_next(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a right-hand child, go down and then left as far
* as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
node = node->rb_left;
return (struct rb_node *)node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_next);
struct rb_node *rb_prev(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a left-hand child, go down and then right as far
* as we can.
*/
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
node = node->rb_right;
return (struct rb_node *)node;
}
/*
* No left-hand children. Go up till we find an ancestor which
* is a right-hand child of its parent.
*/
while ((parent = rb_parent(node)) && node == parent->rb_left)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_prev);
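/*
 * A minimal sketch of a full in-order traversal built from the helpers
 * above; visit() is a hypothetical caller-supplied callback.
 */
static inline void rb_visit_inorder_sketch(struct rb_root *root,
					   void (*visit)(struct rb_node *))
{
	struct rb_node *node;

	for (node = rb_first(root); node; node = rb_next(node))
		visit(node);
}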
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
__rb_change_child(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node);
void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
/* Set the parent's pointer to the new node last after an RCU barrier
* so that the pointers onwards are seen to be set correctly when doing
* an RCU walk over the tree.
*/
__rb_change_child_rcu(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node_rcu);
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node *)node;
}
}
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right) {
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
} else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node *)parent;
}
EXPORT_SYMBOL(rb_next_postorder);
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
EXPORT_SYMBOL(rb_first_postorder);

utils/src/rbtree.h Normal file
@@ -0,0 +1,328 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This avoids callbacks and the dramatic performance cost they carry.
It's not the cleanest way, but it's how you get performance and
genericity in C (as opposed to C++)...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
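/*
 * A minimal sketch of the classic use: freeing an entire tree in one
 * pass. 'struct pitem' is hypothetical; free() is from <stdlib.h>.
 * Children are visited before their parent, so freeing @pos is safe.
 */
struct pitem {
	struct rb_node node;
	/* caller data */
};

static inline void pitem_free_all(struct rb_root *root)
{
	struct pitem *pos, *n;

	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
		free(pos);
	*root = RB_ROOT;
}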
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node;
rb_insert_color(node, &root->rb_root);
}
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
struct rb_node *leftmost = NULL;
if (root->rb_leftmost == node)
leftmost = root->rb_leftmost = rb_next(node);
rb_erase(node, &root->rb_root);
return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new;
rb_replace_node(victim, new, &root->rb_root);
}
/*
* The below helper functions use 2 operators with 3 different
* calling conventions. The operators are related like:
*
* comp(a->key,b) < 0 := less(a,b)
* comp(a->key,b) > 0 := less(b,a)
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
*
* If these operators define a partial order on the elements we make no
* guarantee on which of the elements matching the key is found. See
* rb_find().
*
* The reason for this is to allow the find() interface without requiring an
* on-stack dummy object, which might not be feasible due to object size.
*/
/**
* rb_add_cached() - insert @node into the leftmost cached tree @tree
* @node: node to insert
* @tree: leftmost cached tree to insert @node into
* @less: operator defining the (partial) node order
*
* Returns @node when it is the new leftmost, or NULL.
*/
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
/**
* rb_add() - insert @node into @tree
* @node: node to insert
* @tree: tree to insert @node into
* @less: operator defining the (partial) node order
*/
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
while (*link) {
parent = *link;
if (less(node, parent))
link = &parent->rb_left;
else
link = &parent->rb_right;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
}
/**
* rb_find_add() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = node->rb_left;
else if (c > 0)
node = node->rb_right;
else
return node;
}
return NULL;
}
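/*
 * A minimal sketch of the caller-supplied insert and search cores the
 * helpers above expect; 'struct sitem' and its 'key' field are
 * hypothetical.
 */
struct sitem {
	struct rb_node node;
	unsigned long key;
};

static inline bool sitem_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct sitem, node)->key <
	       rb_entry(b, struct sitem, node)->key;
}

static inline int sitem_cmp(const void *key, const struct rb_node *node)
{
	unsigned long k = *(const unsigned long *)key;
	const struct sitem *it = rb_entry(node, struct sitem, node);

	return k < it->key ? -1 : k > it->key ? 1 : 0;
}

/*
 * Insert:  rb_add(&it->node, &root, sitem_less);
 * Lookup:  struct rb_node *n = rb_find(&key, &root, sitem_cmp);
 */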
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the leftmost node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
struct rb_node *match = NULL;
while (node) {
int c = cmp(key, node);
if (c <= 0) {
if (!c)
match = node;
node = node->rb_left;
} else if (c > 0) {
node = node->rb_right;
}
}
return match;
}
/**
* rb_next_match() - find the next node matching @key
* @key: key to match
* @node: most recent matching node, as returned by rb_find_first()
* @cmp: operator defining node order
*
* Returns the next node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
int (*cmp)(const void *key, const struct rb_node *))
{
node = rb_next(node);
if (node && cmp(key, node))
node = NULL;
return node;
}
/**
* rb_for_each() - iterates a subtree matching @key
* @node: iterator
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*/
#define rb_for_each(node, key, tree, cmp) \
for ((node) = rb_find_first((key), (tree), (cmp)); \
(node); (node) = rb_next_match((key), (node), (cmp)))
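/*
 * A minimal sketch of walking every node that matches @key when the
 * comparator only defines a partial order, reusing the hypothetical
 * sitem_cmp() from the sketch above.
 */
static inline unsigned int sitem_count_matches(const void *key,
					       const struct rb_root *tree)
{
	struct rb_node *node;
	unsigned int nr = 0;

	rb_for_each(node, key, tree, sitem_cmp)
		nr++;
	return nr;
}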
#endif /* _LINUX_RBTREE_H */

utils/src/rbtree_augmented.h Normal file
@@ -0,0 +1,313 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/include/linux/rbtree_augmented.h
*/
#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H
/*
* Please note - only struct rb_augment_callbacks and the prototypes for
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
* The rest are implementation details you are not expected to depend on.
*
* See Documentation/core-api/rbtree.rst for documentation and samples.
*/
struct rb_augment_callbacks {
void (*propagate)(struct rb_node *node, struct rb_node *stop);
void (*copy)(struct rb_node *old, struct rb_node *new);
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
*
* On insertion, the user must update the augmented information on the path
* leading to the inserted node, then call rb_link_node() as usual and
* rb_insert_augmented() instead of the usual rb_insert_color() call.
* If rb_insert_augmented() rebalances the rbtree, it will callback into
* a user provided function to update the augmented information on the
* affected subtrees.
*/
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
__rb_insert_augmented(node, root, augment->rotate);
}
static inline void
rb_insert_augmented_cached(struct rb_node *node,
struct rb_root_cached *root, bool newleft,
const struct rb_augment_callbacks *augment)
{
if (newleft)
root->rb_leftmost = node;
rb_insert_augmented(node, &root->rb_root, augment);
}
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
*/
#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
if (RBCOMPUTE(node, true)) \
break; \
rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
RBCOMPUTE(old, false); \
} \
RBSTATIC const struct rb_augment_callbacks RBNAME = { \
.propagate = RBNAME ## _propagate, \
.copy = RBNAME ## _copy, \
.rotate = RBNAME ## _rotate \
};
/*
* Template for declaring augmented rbtree callbacks,
* computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBTYPE: type of the RBAUGMENTED field
* RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
*/
#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
RBTYPE, RBAUGMENTED, RBCOMPUTE) \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
{ \
RBSTRUCT *child; \
RBTYPE max = RBCOMPUTE(node); \
if (node->RBFIELD.rb_left) { \
child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (node->RBFIELD.rb_right) { \
child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (exit && node->RBAUGMENTED == max) \
return true; \
node->RBAUGMENTED = max; \
return false; \
} \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
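/*
 * A minimal sketch of instantiating the template above; 'struct range'
 * and 'range_last()' are hypothetical. 'subtree_last' is kept equal to
 * the maximum 'last' over each node's subtree.
 */
struct range {
	struct rb_node rb;
	unsigned long start;
	unsigned long last;
	unsigned long subtree_last;
};

static inline unsigned long range_last(struct range *r)
{
	return r->last;
}

RB_DECLARE_CALLBACKS_MAX(static, range_aug_cbs,
			 struct range, rb, unsigned long,
			 subtree_last, range_last)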
#define RB_RED 0
#define RB_BLACK 1
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
}
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
rb->__rb_parent_color = (unsigned long)p | color;
}
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
WRITE_ONCE(parent->rb_left, new);
else
WRITE_ONCE(parent->rb_right, new);
} else
WRITE_ONCE(root->rb_node, new);
}
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
rcu_assign_pointer(parent->rb_left, new);
else
rcu_assign_pointer(parent->rb_right, new);
} else
rcu_assign_pointer(root->rb_node, new);
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass __rb_erase_color() later on.
*/
pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, child, parent, root);
if (child) {
child->__rb_parent_color = pc;
rebalance = NULL;
} else
rebalance = __rb_is_black(pc) ? parent : NULL;
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
tmp->__rb_parent_color = pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
* (n) (s)
* / \ / \
* (x) (s) -> (x) (c)
* \
* (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
* (n) (s)
* / \ / \
* (x) (y) -> (x) (y)
* / /
* (p) (p)
* / /
* (s) (c)
* \
* (c)
*/
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
child2 = successor->rb_right;
WRITE_ONCE(parent->rb_left, child2);
WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
tmp = node->rb_left;
WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
if (child2) {
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
rebalance = rb_is_black(successor) ? parent : NULL;
}
successor->__rb_parent_color = pc;
tmp = successor;
}
augment->propagate(tmp, NULL);
return rebalance;
}
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
const struct rb_augment_callbacks *augment)
{
if (root->rb_leftmost == node)
root->rb_leftmost = rb_next(node);
rb_erase_augmented(node, &root->rb_root, augment);
}
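/*
 * A minimal sketch of augmented insertion for the hypothetical
 * 'struct range' declared with RB_DECLARE_CALLBACKS_MAX() above:
 * maintain subtree_last down the search path, then link and let
 * rb_insert_augmented() fix up the rest during rebalancing.
 */
static inline void range_insert_sketch(struct range *r, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	r->subtree_last = r->last;
	while (*link) {
		struct range *p = rb_entry(*link, struct range, rb);

		parent = *link;
		if (p->subtree_last < r->last)
			p->subtree_last = r->last;
		if (r->start < p->start)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&r->rb, parent, link);
	rb_insert_augmented(&r->rb, root, &range_aug_cbs);
}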
#endif /* _LINUX_RBTREE_AUGMENTED_H */

utils/src/rbtree_types.h Normal file
@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_RBTREE_TYPES_H
#define _LINUX_RBTREE_TYPES_H
struct rb_node {
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
struct rb_root {
struct rb_node *rb_node;
};
/*
* Leftmost-cached rbtrees.
*
* We do not cache the rightmost node based on footprint
* size vs number of potential users that could benefit
* from O(1) rb_last(). Just not worth it, users that want
* this feature can always implement the logic explicitly.
* Furthermore, users that want to cache both pointers may
* find it a bit asymmetric, but that's ok.
*/
struct rb_root_cached {
struct rb_root rb_root;
struct rb_node *rb_leftmost;
};
#define RB_ROOT (struct rb_root) { NULL, }
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
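/*
 * A minimal sketch: both initializers above yield empty trees that are
 * ready for use without further setup.
 */
static struct rb_root sketch_root = RB_ROOT;
static struct rb_root_cached sketch_cached = RB_ROOT_CACHED;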
#endif

@@ -1,186 +0,0 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "cmp.h"
#define ENTF "%u.%llu.%llu"
#define ENTA(e) (e)->major, (e)->minor, (e)->ino
struct xattr_args {
char *path;
char *first_entry;
char *last_entry;
};
static int compare_entries(struct scoutfs_ioctl_xattr_index_entry *a,
struct scoutfs_ioctl_xattr_index_entry *b)
{
return scoutfs_cmp(a->major, b->major) ?: scoutfs_cmp(a->minor, b->minor) ?:
scoutfs_cmp(a->ino, b->ino);
}
static int parse_entry(struct scoutfs_ioctl_xattr_index_entry *ent, char *str)
{
int major;
int ret;
ret = sscanf(str, "%i.%lli.%lli", &major, &ent->minor, &ent->ino);
if (ret != 3) {
fprintf(stderr, "bad index position entry argument '%s', it must be "
"in the form \"a.b.ino\" where each value can be prefixed by "
"'0' for octal or '0x' for hex\n", str);
return -EINVAL;
}
if (major < 0 || major > UCHAR_MAX) {
fprintf(stderr, "initial major index position '%d' must be between 0 and 255, "
"inclusive.\n", major);
return -EINVAL;
}
ent->major = major;
return 0;
}
#define NR_ENTRIES 1024
static int do_read_xattr_index(struct xattr_args *args)
{
struct scoutfs_ioctl_read_xattr_index rxi;
struct scoutfs_ioctl_xattr_index_entry *ents;
struct scoutfs_ioctl_xattr_index_entry *ent;
int fd = -1;
int ret;
int i;
ents = calloc(NR_ENTRIES, sizeof(struct scoutfs_ioctl_xattr_index_entry));
if (!ents) {
fprintf(stderr, "xattr index entry allocation failed\n");
ret = -ENOMEM;
goto out;
}
fd = get_path(args->path, O_RDONLY);
if (fd < 0)
return fd;
memset(&rxi, 0, sizeof(rxi));
memset(&rxi.last, 0xff, sizeof(rxi.last));
rxi.entries_ptr = (unsigned long)ents;
rxi.entries_nr = NR_ENTRIES;
ret = 0;
if (args->first_entry)
ret = parse_entry(&rxi.first, args->first_entry);
if (args->last_entry)
ret = parse_entry(&rxi.last, args->last_entry);
if (ret < 0)
goto out;
if (compare_entries(&rxi.first, &rxi.last) > 0) {
fprintf(stderr, "first index position "ENTF" must be less than last index position "ENTF"\n",
ENTA(&rxi.first), ENTA(&rxi.last));
ret = -EINVAL;
goto out;
}
for (;;) {
ret = ioctl(fd, SCOUTFS_IOC_READ_XATTR_INDEX, &rxi);
if (ret == 0)
break;
if (ret < 0) {
ret = -errno;
fprintf(stderr, "read_xattr_index ioctl failed: "
"%s (%d)\n", strerror(errno), errno);
goto out;
}
for (i = 0; i < ret; i++) {
ent = &ents[i];
printf("%u.%llu = %llu\n",
ent->major, ent->minor, ent->ino);
}
rxi.first = *ent;
if ((++rxi.first.ino == 0 && ++rxi.first.minor == 0 && ++rxi.first.major == 0) ||
compare_entries(&rxi.first, &rxi.last) > 0)
break;
}
ret = 0;
out:
if (fd >= 0)
close(fd);
free(ents);
return ret;
};
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct xattr_args *args = state->input;
switch (key) {
case 'p':
args->path = strdup_or_error(state, arg);
break;
case ARGP_KEY_ARG:
if (!args->first_entry)
args->first_entry = strdup_or_error(state, arg);
else if (!args->last_entry)
args->last_entry = strdup_or_error(state, arg);
else
argp_error(state, "more than two entry arguments given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"FIRST-ENTRY LAST-ENTRY",
"Search and print inode numbers indexed by their .indx. xattrs"
};
static int read_xattr_index_cmd(int argc, char **argv)
{
struct xattr_args xattr_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &xattr_args);
if (ret)
return ret;
return do_read_xattr_index(&xattr_args);
}
static void __attribute__((constructor)) read_xattr_index_ctor(void)
{
cmd_register_argp("read-xattr-index", &argp, GROUP_INFO, read_xattr_index_cmd);
}