/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/xattr.h>
#include <linux/namei.h>
#include <linux/mm.h>
#include "format.h"
#include "file.h"
#include "dir.h"
#include "inode.h"
#include "ioctl.h"
#include "key.h"
#include "msg.h"
#include "super.h"
#include "trans.h"
#include "xattr.h"
#include "item.h"
#include "lock.h"
#include "hash.h"
#include "omap.h"
#include "forest.h"
#include "acl.h"
#include "counters.h"
#include "quota.h"
#include "scoutfs_trace.h"
/*
* Directory entries are stored in three different items. Each has the
* same key format and all have identical values which contain the full
* entry name.
*
* Entries for name lookup are stored at the hash of the name and the
* readdir position. Including the position lets us create names
* without having to read the items to check for hash collisions.
* Lookup iterates over all the positions with the same hash value and
* compares the names.
*
* Entries for readdir are stored in an increasing unique readdir
* position. This results in returning entries in creation order which
* matches inode allocation order and avoids random inode access
* patterns during readdir.
*
* Entries for link backref traversal are stored at the target inode
* sorted by the parent dir and the entry's position in the parent dir.
* This keeps link backref users away from the higher contention area of
* dirent items in parent dirs.
*
* All the entries have a dirent struct with the full name in their
* value. The dirent struct contains the name hash and readdir position
* so that any item use can reference all the items for a given entry.
*/
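/*
 * For example, assuming a name in dir ino 100 that hashes to H, created
 * at readdir position 7, and linking to inode 202, the three items are
 * keyed as (zone, ino, type, major, minor):
 *
 *   dirent:       (FS, 100, SCOUTFS_DIRENT_TYPE,       H,   7)
 *   readdir:      (FS, 100, SCOUTFS_READDIR_TYPE,      7,   0)
 *   link backref: (FS, 202, SCOUTFS_LINK_BACKREF_TYPE, 100, 7)
 *
 * All three carry the same dirent value; see init_dirent_key() and
 * add_entry_items() below.  (The inode numbers here are made up.)
 */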
static unsigned int mode_to_type(umode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}
static unsigned int dentry_type(enum scoutfs_dentry_type type)
{
static unsigned char types[] = {
[SCOUTFS_DT_FIFO] = DT_FIFO,
[SCOUTFS_DT_CHR] = DT_CHR,
[SCOUTFS_DT_DIR] = DT_DIR,
[SCOUTFS_DT_BLK] = DT_BLK,
[SCOUTFS_DT_REG] = DT_REG,
[SCOUTFS_DT_LNK] = DT_LNK,
[SCOUTFS_DT_SOCK] = DT_SOCK,
[SCOUTFS_DT_WHT] = DT_WHT,
};
if (type < ARRAY_SIZE(types))
return types[type];
return DT_UNKNOWN;
}
static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags);
const struct dentry_operations scoutfs_dentry_ops = {
.d_revalidate = scoutfs_d_revalidate,
};
static void init_dirent_key(struct scoutfs_key *key, u8 type, u64 ino,
u64 major, u64 minor)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FS_ZONE,
.skd_ino = cpu_to_le64(ino),
.sk_type = type,
.skd_major = cpu_to_le64(major),
.skd_minor = cpu_to_le64(minor),
};
}
static unsigned int dirent_bytes(unsigned int name_len)
{
return offsetof(struct scoutfs_dirent, name[name_len]);
}
static struct scoutfs_dirent *alloc_dirent(unsigned int name_len)
{
return kmalloc(dirent_bytes(name_len), GFP_NOFS);
}
/*
* Test a bit number as though an array of bytes is a large len-bit
* big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is
* the MSB of the first byte.
*/
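/*
 * A concrete example: with bytes = { 0x80, 0x01 } and len = 16, nr 0
 * tests the low bit of the final byte 0x01 (set), nr 8 tests the low
 * bit of the first byte 0x80 (clear), and nr 15 tests the high bit of
 * 0x80 (set).
 */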
static int test_be_bytes_bit(int nr, const char *bytes, int len)
{
return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
}
/*
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
* distributed bits from the name. The intent is to have the sort order
* of the fingerprints reflect the memcmp() sort order of the names
* while mapping large names down to small fs keys.
*
* Names that are smaller than 32bits are biased towards the high bits
* of the fingerprint so that most significant bits of the fingerprints
* consistently reflect the initial characters of the names.
*/
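/*
 * For example, a single byte name "A" (0x41) has name_bits = 8 and
 * skip = 1, so name bits 7..0 land in fingerprint bits 31..24 and the
 * fingerprint is 0x41000000.  An eight byte name has name_bits = 64
 * and skip = 2, so every other bit is sampled starting from the most
 * significant.
 */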
static u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
{
int name_bits = name_len * 8;
int skip = max(name_bits / 32, 1);
u32 fp = 0;
int f;
int n;
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
		fp |= (u32)!!test_be_bytes_bit(n, name, name_bits) << f;
return fp;
}
static u64 dirent_name_hash(const char *name, unsigned int name_len)
{
return scoutfs_hash32(name, name_len) |
((u64)dirent_name_fingerprint(name, name_len) << 32);
}
static bool dirent_names_equal(const char *a_name, unsigned int a_len,
const char *b_name, unsigned int b_len)
{
return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
}
/*
* Looks for the dirent item and fills the caller's dirent if it finds
* it. Returns item lookup errors including -ENOENT if it's not found.
*/
static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name,
unsigned name_len, u64 hash,
struct scoutfs_dirent *dent_ret,
struct scoutfs_lock *lock)
{
struct scoutfs_key last_key;
struct scoutfs_key key;
struct scoutfs_dirent *dent = NULL;
int ret;
dent = alloc_dirent(SCOUTFS_NAME_LEN);
	if (!dent)
		return -ENOMEM;
init_dirent_key(&key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, 0);
init_dirent_key(&last_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, U64_MAX);
for (;;) {
ret = scoutfs_item_next(sb, &key, &last_key, dent,
dirent_bytes(SCOUTFS_NAME_LEN), lock);
if (ret < 0)
break;
ret -= sizeof(struct scoutfs_dirent);
if (ret < 1 || ret > SCOUTFS_NAME_LEN) {
scoutfs_corruption(sb, SC_DIRENT_NAME_LEN,
corrupt_dirent_name_len,
"dir_ino %llu hash %llu key "SK_FMT" len %d",
dir_ino, hash, SK_ARG(&key), ret);
ret = -EIO;
goto out;
}
if (dirent_names_equal(name, name_len, dent->name, ret)) {
*dent_ret = *dent;
ret = 0;
break;
}
if (le64_to_cpu(key.skd_minor) == U64_MAX) {
ret = -ENOENT;
break;
}
le64_add_cpu(&key.skd_minor, 1);
}
out:
kfree(dent);
return ret;
}
static int lookup_dentry_dirent(struct super_block *sb, u64 dir_ino, struct dentry *dentry,
struct scoutfs_dirent *dent_ret,
struct scoutfs_lock *lock)
{
return lookup_dirent(sb, dir_ino, dentry->d_name.name, dentry->d_name.len,
dirent_name_hash(dentry->d_name.name, dentry->d_name.len),
dent_ret, lock);
}
static u64 dentry_parent_ino(struct dentry *dentry)
{
struct dentry *parent = NULL;
struct inode *dir;
u64 dir_ino = 0;
if ((parent = dget_parent(dentry)) && (dir = parent->d_inode))
dir_ino = scoutfs_ino(dir);
dput(parent);
return dir_ino;
}
/* negative dentries return 0, our root ino is non-zero (1) */
static u64 dentry_ino(struct dentry *dentry)
{
return dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0;
}
static void set_dentry_fsdata(struct dentry *dentry, struct scoutfs_lock *lock)
{
void *now = (void *)(unsigned long)lock->refresh_gen;
void *was;
/* didn't want to alloc :/ */
BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(u64));
BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(long));
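	/*
	 * Publish the lock's refresh_gen in d_fsdata.  Racing stores can
	 * land in either order; a stale value only makes
	 * test_dentry_fsdata() fail and callers fall back to checking
	 * the dirent items, which is always safe.
	 */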
do {
was = dentry->d_fsdata;
} while (cmpxchg(&dentry->d_fsdata, was, now) != was);
}
static bool test_dentry_fsdata(struct dentry *dentry, u64 refresh)
{
u64 fsd = (unsigned long)READ_ONCE(dentry->d_fsdata);
return fsd == refresh;
}
/*
* Validate an operation caller's input dentry argument. If the fsdata
* is valid then the underlying dirent items couldn't have changed and
* we return 0. If fsdata is no longer protected by a lock or its
* fields don't match then we check the dirent item. If the dirent item
* doesn't match what the caller expected given their dentry fields then
* we return an error.
*/
static int validate_dentry(struct super_block *sb, u64 dir_ino, struct dentry *dentry,
struct scoutfs_lock *lock)
{
u64 ino = dentry_ino(dentry);
struct scoutfs_dirent dent = {0,};
int ret;
if (test_dentry_fsdata(dentry, lock->refresh_gen)) {
ret = 0;
goto out;
}
ret = lookup_dentry_dirent(sb, dir_ino, dentry, &dent, lock);
if (ret < 0 && ret != -ENOENT)
goto out;
/* use negative zeroed dent when lookup gave -ENOENT */
if (!ino && dent.ino) {
/* caller expected negative but there was a dirent */
ret = -EEXIST;
} else if (ino && !dent.ino) {
/* caller expected positive but there was no dirent */
ret = -ENOENT;
} else if (ino != le64_to_cpu(dent.ino)) {
/* name linked to different inode than caller's */
ret = -ESTALE;
} else {
/* dirent ino matches dentry ino */
ret = 0;
}
out:
trace_scoutfs_validate_dentry(sb, dentry, dir_ino, ino, le64_to_cpu(dent.ino),
lock->refresh_gen, ret);
return ret;
}
static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct super_block *sb = dentry->d_sb;
u64 dir_ino = dentry_parent_ino(dentry);
int ret;
/* don't think this happens but we can find out */
if (IS_ROOT(dentry)) {
scoutfs_inc_counter(sb, dentry_revalidate_root);
if (!dentry->d_inode ||
(scoutfs_ino(dentry->d_inode) != SCOUTFS_ROOT_INO)) {
ret = -EIO;
} else {
ret = 1;
}
goto out;
}
/* XXX what are the rules for _RCU? */
if (flags & LOOKUP_RCU) {
scoutfs_inc_counter(sb, dentry_revalidate_rcu);
ret = -ECHILD;
goto out;
}
if (test_dentry_fsdata(dentry, scoutfs_lock_ino_refresh_gen(sb, dir_ino))) {
scoutfs_inc_counter(sb, dentry_revalidate_valid);
ret = 1;
} else {
scoutfs_inc_counter(sb, dentry_revalidate_invalid);
ret = 0;
}
out:
trace_scoutfs_d_revalidate(sb, dentry, flags, dir_ino, ret);
if (ret < 0 && ret != -ECHILD)
scoutfs_inc_counter(sb, dentry_revalidate_error);
return ret;
}
/*
* Because of rename, locks are ordered by inode number. To hold the
* dir lock while calling iget, we might have to already hold a lesser
* inode's lock while telling iget whether or not to lock. Instead of
* adding all those moving pieces we drop the dir lock before calling
* iget. We don't reuse inode numbers so we don't have to worry about
* the target of the link changing. We will only follow the entry as it
* existed before or after whatever modification is happening under the
* dir lock and that can already legally race before or after our
* lookup.
*/
static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct super_block *sb = dir->i_sb;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_dirent dent = {0,};
struct inode *inode;
u64 ino = 0;
u64 hash;
int ret;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
if (dentry->d_name.len > SCOUTFS_NAME_LEN) {
ret = -ENAMETOOLONG;
goto out;
}
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, dir, &dir_lock);
if (ret)
goto out;
ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name,
dentry->d_name.len, hash, &dent, dir_lock);
if (ret == -ENOENT) {
ino = 0;
ret = 0;
} else if (ret == 0) {
ino = le64_to_cpu(dent.ino);
}
if (ret == 0)
set_dentry_fsdata(dentry, dir_lock);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
out:
if (ret < 0)
inode = ERR_PTR(ret);
else if (ino == 0)
inode = NULL;
else
inode = scoutfs_iget(sb, ino, 0, 0);
/*
* We can't splice dir aliases into the dcache. dir entries
* might have changed on other nodes so our dcache could still
* contain them, rather than having been moved in rename. For
* dirs, we use d_materialise_unique to remove any existing
* aliases which must be stale. Our inode numbers aren't reused
* so inodes pointed to by entries can't change types.
*/
if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode))
return d_materialise_unique(dentry, inode);
else
return d_splice_alias(inode, dentry);
}
/*
 * Helper to advance to the next dirent in a packed buffer while keeping
 * each struct scoutfs_dirent pointer aligned.
 */
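/*
 * For example, if offsetof(struct scoutfs_dirent, name[3]) were 35
 * bytes and the struct's alignment 8, the next dirent would start 40
 * bytes in, keeping the fixed fields of each packed dirent naturally
 * aligned.
 */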
static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *dent, u8 len)
{
return (void *)dent +
ALIGN(offsetof(struct scoutfs_dirent, name[len]), __alignof__(struct scoutfs_dirent));
}
/*
* readdir simply iterates over the dirent items for the dir inode and
* uses their offset as the readdir position.
*
* It will need to be careful not to read past the region of the dirent
* hash offset keys that it has access to.
*/
static int scoutfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_dirent *dent = NULL;
/* we'll store name_len in dent->__pad[0] */
#define hacky_name_len __pad[0]
struct scoutfs_key last_key;
struct scoutfs_key key;
struct page *page = NULL;
int name_len;
u64 pos;
int entries = 0;
int ret;
int complete = 0;
struct scoutfs_dirent *end;
if (!dir_emit_dots(file, ctx))
return 0;
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
end = page_address(page) + PAGE_SIZE;
init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
SCOUTFS_DIRENT_LAST_POS, 0);
/*
* lock and fetch dirent items, until the page no longer fits
* a max size dirent (288b). Then unlock and dir_emit the ones
* we stored in the page.
*/
for (;;) {
/* lock */
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
if (ret)
break;
dent = page_address(page);
pos = ctx->pos;
while (next_aligned_dirent(dent, SCOUTFS_NAME_LEN) < end) {
init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
pos, 0);
ret = scoutfs_item_next(sb, &key, &last_key, dent,
dirent_bytes(SCOUTFS_NAME_LEN),
dir_lock);
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
complete = 1;
}
break;
}
name_len = ret - sizeof(struct scoutfs_dirent);
dent->hacky_name_len = name_len;
if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) {
scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN,
corrupt_dirent_readdir_name_len,
"dir_ino %llu pos %llu key "SK_FMT" len %d",
scoutfs_ino(inode),
pos,
SK_ARG(&key), name_len);
ret = -EIO;
break;
}
pos = le64_to_cpu(dent->pos) + 1;
dent = next_aligned_dirent(dent, name_len);
entries++;
}
/* unlock */
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ);
if (ret < 0)
break;
dent = page_address(page);
for (; entries > 0; entries--) {
ctx->pos = le64_to_cpu(dent->pos);
if (!dir_emit(ctx, dent->name, dent->hacky_name_len,
le64_to_cpu(dent->ino),
dentry_type(dent->type))) {
ret = 0;
goto out;
}
dent = next_aligned_dirent(dent, dent->hacky_name_len);
/* always advance ctx->pos past */
ctx->pos++;
}
if (complete)
break;
}
out:
if (page)
__free_page(page);
return ret;
}
/*
* Add all the items for the named link to the inode in the dir. Only
* items are modified. The caller is responsible for locking, entering
* a transaction, dirtying items, and managing the vfs structs.
*
* If this returns an error then nothing will have changed.
*/
static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
u64 pos, const char *name, unsigned name_len,
u64 ino, umode_t mode, struct scoutfs_lock *dir_lock,
struct scoutfs_lock *inode_lock)
{
struct scoutfs_dirent *dent = NULL;
struct scoutfs_key rdir_key;
struct scoutfs_key ent_key;
struct scoutfs_key lb_key;
bool del_rdir = false;
bool del_ent = false;
int ret;
dent = alloc_dirent(name_len);
	if (!dent)
		return -ENOMEM;
/* initialize the dent */
dent->ino = cpu_to_le64(ino);
dent->hash = cpu_to_le64(hash);
dent->pos = cpu_to_le64(pos);
dent->type = mode_to_type(mode);
memcpy(dent->name, name, name_len);
init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);
ret = scoutfs_item_create(sb, &ent_key, dent, dirent_bytes(name_len),
dir_lock);
if (ret)
goto out;
del_ent = true;
ret = scoutfs_item_create(sb, &rdir_key, dent, dirent_bytes(name_len),
dir_lock);
if (ret)
goto out;
del_rdir = true;
ret = scoutfs_item_create(sb, &lb_key, dent, dirent_bytes(name_len),
inode_lock);
out:
if (ret < 0) {
if (del_ent)
scoutfs_item_delete(sb, &ent_key, dir_lock);
if (del_rdir)
scoutfs_item_delete(sb, &rdir_key, dir_lock);
}
kfree(dent);
return ret;
}
/*
* Delete all the items for the named link to the inode in the dir.
* Only items are modified. The caller is responsible for locking,
* entering a transaction, dirtying items, and managing the vfs structs.
*
* If this returns an error then nothing will have changed.
*/
static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
u64 pos, u64 ino, struct scoutfs_lock *dir_lock,
struct scoutfs_lock *inode_lock)
{
struct scoutfs_key rdir_key;
struct scoutfs_key ent_key;
struct scoutfs_key lb_key;
int ret;
init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);
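	/*
	 * Dirty all three items first to pin them in the transaction so
	 * that the deletions below can't fail part way through and leave
	 * the entry's items inconsistent.
	 */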
ret = scoutfs_item_dirty(sb, &ent_key, dir_lock) ?:
scoutfs_item_dirty(sb, &rdir_key, dir_lock) ?:
scoutfs_item_dirty(sb, &lb_key, inode_lock);
if (ret == 0) {
ret = scoutfs_item_delete(sb, &ent_key, dir_lock) ?:
scoutfs_item_delete(sb, &rdir_key, dir_lock) ?:
scoutfs_item_delete(sb, &lb_key, inode_lock);
BUG_ON(ret); /* _dirty should have guaranteed success */
}
return ret;
}
/*
* Inode creation needs to hold dir and inode locks which can be greater
* or less than each other. It seems easiest to keep the dual locking
* here like it is for all the other dual locking of established inodes.
* Except we don't have the inode struct yet when we're getting locks,
* so we roll our own comparison between the two instead of pushing
* complexity down the locking paths that acquire existing inodes in
* order.
*/
static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev,
struct scoutfs_lock **dir_lock,
struct scoutfs_lock **inode_lock,
struct scoutfs_lock **orph_lock,
struct list_head *ind_locks)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
u64 ind_seq;
int ret = 0;
u64 ino;
ret = scoutfs_alloc_ino(sb, S_ISDIR(mode), &ino);
if (ret)
return ERR_PTR(ret);
if (ino < scoutfs_ino(dir)) {
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino,
inode_lock) ?:
scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, dir,
dir_lock);
} else {
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE, dir,
dir_lock) ?:
scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino,
inode_lock);
}
if (ret)
goto out_unlock;
ret = scoutfs_quota_check_inode(sb, dir);
if (ret)
goto out_unlock;
if (orph_lock) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
goto out_unlock;
ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode) ?:
scoutfs_init_acl_locked(inode, dir, *inode_lock, *dir_lock, ind_locks);
if (ret < 0)
goto out;
scoutfs_inode_set_proj(inode, scoutfs_inode_get_proj(dir));
ret = scoutfs_dirty_inode_item(dir, *dir_lock);
out:
if (ret)
scoutfs_release_trans(sb);
out_unlock:
if (ret) {
scoutfs_inode_index_unlock(sb, ind_locks);
scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
*dir_lock = NULL;
scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
*inode_lock = NULL;
if (orph_lock) {
scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
*orph_lock = NULL;
}
if (!IS_ERR_OR_NULL(inode))
iput(inode);
inode = ERR_PTR(ret);
}
return inode;
}
static int scoutfs_mknod(KC_VFS_NS_DEF
struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_inode_info *si;
LIST_HEAD(ind_locks);
u64 hash;
u64 pos;
int ret;
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
inode = lock_hold_create(dir, dentry, mode, rdev,
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
si = SCOUTFS_I(inode);
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
if (ret < 0)
goto out;
pos = SCOUTFS_I(dir)->next_readdir_pos++;
ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos,
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
if (ret)
goto out;
set_dentry_fsdata(dentry, dir_lock);
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
dir->i_mtime = dir->i_ctime = current_time(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime;
si->crtime = inode->i_mtime;
inode_inc_iversion(dir);
inode_inc_iversion(inode);
scoutfs_forest_inc_inode_count(sb);
if (S_ISDIR(mode)) {
inc_nlink(inode);
inc_nlink(dir);
}
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
scoutfs_inode_index_unlock(sb, &ind_locks);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
/* XXX delete the inode item here */
if (ret && !IS_ERR_OR_NULL(inode))
iput(inode);
return ret;
}
/* XXX hmm, do something with excl? */
static int scoutfs_create(KC_VFS_NS_DEF
struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
return scoutfs_mknod(KC_VFS_NS
dir, dentry, mode | S_IFREG, 0);
}
static int scoutfs_mkdir(KC_VFS_NS_DEF
struct inode *dir,
struct dentry *dentry, umode_t mode)
{
return scoutfs_mknod(KC_VFS_NS
dir, dentry, mode | S_IFDIR, 0);
}
static int scoutfs_link(struct dentry *old_dentry,
struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
struct super_block *sb = dir->i_sb;
struct scoutfs_lock *dir_lock;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
LIST_HEAD(ind_locks);
bool del_orphan = false;
u64 dir_size;
u64 ind_seq;
u64 hash;
u64 pos;
int ret;
int err;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE,
dir, &dir_lock, inode, &inode_lock,
NULL, NULL, NULL, NULL);
if (ret)
return ret;
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
if (ret < 0)
goto out_unlock;
if (inode->i_nlink >= SCOUTFS_LINK_MAX) {
ret = -EMLINK;
goto out_unlock;
}
dir_size = i_size_read(dir) + dentry->d_name.len;
if (inode->i_nlink == 0) {
del_orphan = true;
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
goto out_unlock;
ret = scoutfs_dirty_inode_item(dir, dir_lock);
if (ret)
goto out;
if (del_orphan) {
ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock);
if (ret)
goto out;
}
pos = SCOUTFS_I(dir)->next_readdir_pos++;
ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos,
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
	if (ret) {
		/* only restore the orphan item if we deleted it above */
		if (del_orphan) {
			err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode),
							  orph_lock, inode_lock);
			WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
		}
		goto out;
	}
set_dentry_fsdata(dentry, dir_lock);
i_size_write(dir, dir_size);
dir->i_mtime = dir->i_ctime = current_time(inode);
inode->i_ctime = dir->i_mtime;
inc_nlink(inode);
inode_inc_iversion(dir);
inode_inc_iversion(inode);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
	ihold(inode);
d_instantiate(dentry, inode);
out:
scoutfs_release_trans(sb);
out_unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
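/*
 * An inode needs an orphan item once its last directory entry is being
 * removed.  A directory also counts its own "." link, so it's down to
 * its final entry at nlink == 2; everything else is at nlink == 1.
 */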
static bool should_orphan(struct inode *inode)
{
if (inode == NULL)
return false;
if (S_ISDIR(inode->i_mode))
return inode->i_nlink == 2;
return inode->i_nlink == 1;
}
/*
* Unlink removes the entry from its item and removes the item if ours
* was the only remaining entry.
*/
static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct kc_timespec ts = current_time(inode);
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_dirent dent;
LIST_HEAD(ind_locks);
u64 ind_seq;
u64 hash;
	int ret;
	int err;
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE,
dir, &dir_lock, inode, &inode_lock,
NULL, NULL, NULL, NULL);
if (ret)
return ret;
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
if (ret < 0)
goto unlock;
if (S_ISDIR(inode->i_mode) && i_size_read(inode)) {
ret = -ENOTEMPTY;
goto unlock;
}
ret = scoutfs_inode_check_retention(inode);
if (ret < 0)
goto unlock;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash,
&dent, dir_lock);
if (ret < 0)
goto unlock;
if (should_orphan(inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
&orph_lock);
if (ret < 0)
goto unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
if (ret > 0)
goto retry;
if (ret)
goto unlock;
if (should_orphan(inode)) {
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock);
if (ret < 0)
goto out;
}
ret = del_entry_items(sb, scoutfs_ino(dir), le64_to_cpu(dent.hash), le64_to_cpu(dent.pos),
scoutfs_ino(inode), dir_lock, inode_lock);
	if (ret) {
		/* only remove the orphan item if we created it above */
		if (should_orphan(inode)) {
			err = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode),
							  orph_lock, inode_lock);
			WARN_ON_ONCE(err); /* should have been dirty */
		}
		goto out;
	}
set_dentry_fsdata(dentry, dir_lock);
dir->i_ctime = ts;
dir->i_mtime = ts;
i_size_write(dir, i_size_read(dir) - dentry->d_name.len);
inode_inc_iversion(dir);
inode_inc_iversion(inode);
inode->i_ctime = ts;
drop_nlink(inode);
if (S_ISDIR(inode->i_mode)) {
drop_nlink(dir);
drop_nlink(inode);
}
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
out:
scoutfs_release_trans(sb);
unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
static void init_symlink_key(struct scoutfs_key *key, u64 ino, u8 nr)
{
*key = (struct scoutfs_key) {
.sk_zone = SCOUTFS_FS_ZONE,
.sks_ino = cpu_to_le64(ino),
.sk_type = SCOUTFS_SYMLINK_TYPE,
.sks_nr = cpu_to_le64(nr),
};
}
/*
* Operate on all the items that make up a symlink whose target might
* have to be split up into multiple items each with a maximally sized
* value.
*
* Returns 0 or -errno from the item calls, notably -EEXIST, -EIO, or
* -ENOENT when the item population doesn't match what the op expected.
*
* The target can be NULL for deletion because the value isn't used.
* The size still has to be provided to determine the number of items.
*/
enum symlink_ops {
SYM_CREATE = 0,
SYM_LOOKUP,
SYM_DELETE,
};
static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino,
struct scoutfs_lock *lock, const char *target,
size_t size)
{
struct scoutfs_key key;
unsigned bytes;
unsigned nr;
int ret;
int i;
if (WARN_ON_ONCE(size == 0 || size > SCOUTFS_SYMLINK_MAX_SIZE ||
op > SYM_DELETE))
return -EINVAL;
nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE);
for (i = 0; i < nr; i++) {
init_symlink_key(&key, ino, i);
bytes = min_t(u64, size, SCOUTFS_MAX_VAL_SIZE);
if (op == SYM_CREATE)
ret = scoutfs_item_create(sb, &key, (void *)target,
bytes, lock);
else if (op == SYM_LOOKUP)
ret = scoutfs_item_lookup_exact(sb, &key,
(void *)target, bytes,
lock);
else if (op == SYM_DELETE)
ret = scoutfs_item_delete(sb, &key, lock);
if (ret)
break;
target += SCOUTFS_MAX_VAL_SIZE;
size -= bytes;
}
return ret;
}
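/*
 * For example, if SCOUTFS_MAX_VAL_SIZE were 4096, a 5000 byte target
 * would be stored in nr = 2 items: item 0 holding the first 4096 bytes
 * and item 1 holding the remaining 904.
 */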
/*
* Fill a buffer with the null terminated symlink, and return it
* so callers can free it once the vfs is done.
*
* We chose to pay the runtime cost of per-call allocation and copy
* overhead instead of wiring up symlinks to the page cache, storing
* each small link in a full page, and later having to reclaim them.
*/
static void *scoutfs_get_link_target(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
char *path = NULL;
loff_t size;
int ret;
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
return ERR_PTR(ret);
size = i_size_read(inode);
if (size == 0 || size > SCOUTFS_SYMLINK_MAX_SIZE) {
scoutfs_corruption(sb, SC_SYMLINK_INODE_SIZE,
corrupt_symlink_inode_size,
"ino %llu size %llu",
scoutfs_ino(inode), (u64)size);
ret = -EIO;
goto out;
}
/* unlikely, but possible I suppose */
if (size > PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
path = kmalloc(size, GFP_NOFS);
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = symlink_item_ops(sb, SYM_LOOKUP, scoutfs_ino(inode), inode_lock,
path, size);
if (ret == -ENOENT) {
scoutfs_corruption(sb, SC_SYMLINK_MISSING_ITEM,
corrupt_symlink_missing_item,
"ino %llu size %llu", scoutfs_ino(inode),
size);
ret = -EIO;
} else if (ret == 0 && path[size - 1]) {
scoutfs_corruption(sb, SC_SYMLINK_NOT_NULL_TERM,
corrupt_symlink_not_null_term,
"ino %llu last %u",
scoutfs_ino(inode), path[size - 1]);
ret = -EIO;
}
out:
if (ret < 0) {
kfree(path);
path = ERR_PTR(ret);
}
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
return path;
}
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
char *path;
path = scoutfs_get_link_target(dentry);
if (!IS_ERR_OR_NULL(path))
nd_set_link(nd, path);
return path;
}
static void scoutfs_put_link(struct dentry *dentry, struct nameidata *nd,
void *cookie)
{
if (!IS_ERR_OR_NULL(cookie))
kfree(cookie);
}
#else
static const char *scoutfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
{
char *path;
path = scoutfs_get_link_target(dentry);
if (!IS_ERR_OR_NULL(path))
set_delayed_call(done, kfree_link, path);
return path;
}
#endif
/*
* Symlink target paths can be annoyingly large. We store relatively
* rare large paths in multiple items.
*/
static int scoutfs_symlink(KC_VFS_NS_DEF
struct inode *dir, struct dentry *dentry,
const char *symname)
{
struct super_block *sb = dir->i_sb;
const int name_len = strlen(symname) + 1;
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_inode_info *si;
LIST_HEAD(ind_locks);
u64 hash;
u64 pos;
int ret;
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
/* path_max includes null as does our value for nd_set_link */
if (dentry->d_name.len > SCOUTFS_NAME_LEN ||
name_len > PATH_MAX || name_len > SCOUTFS_SYMLINK_MAX_SIZE)
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
&dir_lock, &inode_lock, NULL, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
si = SCOUTFS_I(inode);
ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock);
if (ret < 0)
goto out;
ret = symlink_item_ops(sb, SYM_CREATE, scoutfs_ino(inode), inode_lock,
symname, name_len);
if (ret)
goto out;
pos = SCOUTFS_I(dir)->next_readdir_pos++;
ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos,
dentry->d_name.name, dentry->d_name.len,
scoutfs_ino(inode), inode->i_mode, dir_lock,
inode_lock);
if (ret)
goto out;
set_dentry_fsdata(dentry, dir_lock);
i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
dir->i_mtime = dir->i_ctime = current_time(inode);
inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime;
si->crtime = inode->i_ctime;
i_size_write(inode, name_len);
inode_inc_iversion(inode);
scoutfs_forest_inc_inode_count(sb);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
insert_inode_hash(inode);
/* XXX need to set i_op/fop before here for sec callbacks */
d_instantiate(dentry, inode);
inode = NULL;
ret = 0;
out:
if (ret < 0) {
/* XXX remove inode items */
symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock,
NULL, name_len);
}
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
if (!IS_ERR_OR_NULL(inode))
iput(inode);
return ret;
}
int scoutfs_symlink_drop(struct super_block *sb, u64 ino,
struct scoutfs_lock *lock, u64 i_size)
{
int ret;
ret = symlink_item_ops(sb, SYM_DELETE, ino, lock, NULL, i_size);
if (ret == -ENOENT)
ret = 0;
return ret;
}
/*
* Find the next link backref items for the given ino starting from the
* given dir inode and final entry position. For each backref item we
* add an allocated copy of it to the head of the caller's list.
*
* Callers who are building a path can add one entry for each parent.
* They're left with a list of entries from the root down in list order.
*
* Callers who are gathering multiple entries for one inode get the
* entries in the opposite order that their items are found.
*
* Returns +ve for number of entries added, -ENOENT if no entries were
* found, or -errno on error. It weirdly won't return 0, but early
* callers preferred -ENOENT so we use that for the case of no entries.
*
* Callers are comfortable with the race inherent to incrementally
* gathering backrefs across multiple lock acquisitions.
*/
int scoutfs_dir_add_next_linkrefs(struct super_block *sb, u64 ino, u64 dir_ino, u64 dir_pos,
int count, struct list_head *list)
{
struct scoutfs_link_backref_entry *prev_ent = NULL;
struct scoutfs_link_backref_entry *ent = NULL;
struct scoutfs_lock *lock = NULL;
struct scoutfs_key last_key;
struct scoutfs_key key;
int nr = 0;
int len;
int ret;
init_dirent_key(&key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, dir_pos);
init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX, U64_MAX);
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
if (ret)
goto out;
while (nr < count) {
ent = kmalloc(offsetof(struct scoutfs_link_backref_entry,
dent.name[SCOUTFS_NAME_LEN]), GFP_NOFS);
if (!ent) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&ent->head);
ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent,
dirent_bytes(SCOUTFS_NAME_LEN), lock);
if (ret < 0) {
if (ret == -ENOENT && prev_ent)
prev_ent->last = true;
goto out;
}
len = ret - sizeof(struct scoutfs_dirent);
if (len < 1 || len > SCOUTFS_NAME_LEN) {
scoutfs_corruption(sb, SC_DIRENT_BACKREF_NAME_LEN,
corrupt_dirent_backref_name_len,
"ino %llu dir_ino %llu pos %llu key "SK_FMT" len %d",
ino, dir_ino, dir_pos, SK_ARG(&key), len);
ret = -EIO;
goto out;
}
ent->dir_ino = le64_to_cpu(key.skd_major);
ent->dir_pos = le64_to_cpu(key.skd_minor);
ent->name_len = len;
ent->d_type = dentry_type(ent->dent.type);
ent->last = false;
trace_scoutfs_dir_add_next_linkref_found(sb, ino, ent->dir_ino, ent->dir_pos,
ent->name_len);
list_add(&ent->head, list);
prev_ent = ent;
ent = NULL;
nr++;
scoutfs_key_inc(&key);
}
ret = 0;
out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
trace_scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, count, nr, ret);
kfree(ent);
return nr ?: ret;
}
static u64 first_backref_dir_ino(struct list_head *list)
{
struct scoutfs_link_backref_entry *ent;
ent = list_first_entry(list, struct scoutfs_link_backref_entry, head);
return ent->dir_ino;
}
void scoutfs_dir_free_backref_path(struct super_block *sb,
struct list_head *list)
{
struct scoutfs_link_backref_entry *ent;
struct scoutfs_link_backref_entry *pos;
list_for_each_entry_safe(ent, pos, list, head) {
list_del_init(&ent->head);
kfree(ent);
}
}
/*
* Give the caller the next path from the root to the inode by walking
* backref items from the dir and name position, putting the backref keys
* we find in the caller's list.
*
* Return 0 if we found a path, -ENOENT if we didn't, and -errno on error.
*
* If parents get unlinked while we're searching we can fail to make it
* up to the root. We restart the search in that case. Parent dirs
* couldn't have been unlinked while they still had entries and we won't
* see links to the inode that have been unlinked.
*
* XXX Each path component traversal is consistent but that doesn't mean
* that the total traversed path is consistent. If renames hit dirs
* that have been visited and then dirs to be visited we can return a
* path that was never present in the system:
*
*    path to inode      mv performed           built up path
*    -------------      ------------           -------------
*    a/b/c/d/e/f
*                                              d/e/f
*                       mv a/b/c/d/e a/b/c/
*    a/b/c/e/f
*                       mv a/b/c a/
*    a/c/e/f
*                                              a/c/d/e/f
*
* XXX We'll protect against this by sampling the seq before the
* traversal and restarting if we saw backref items whose seq was
* greater than the start point. It's not precise in that it doesn't
* also capture the rename of a dir that we already traversed but it
* lets us complete the traversal in one pass that very rarely restarts.
*
* XXX and worry about traversing entirely dirty backref items with
* equal seqs that have seen crazy modification? seems like we have to
* sync if we see our dirty seq.
*/
int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino,
u64 dir_pos, struct list_head *list)
{
int retries = 10;
u64 par_ino;
int ret;
retry:
if (retries-- == 0) {
scoutfs_inc_counter(sb, dir_backref_excessive_retries);
ret = -ELOOP;
goto out;
}
/* get the next link name to the given inode */
ret = scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, 1, list);
if (ret < 0)
goto out;
/* then get the names of all the parent dirs */
par_ino = first_backref_dir_ino(list);
while (par_ino != SCOUTFS_ROOT_INO) {
ret = scoutfs_dir_add_next_linkrefs(sb, par_ino, 0, 0, 1, list);
if (ret < 0) {
if (ret == -ENOENT) {
/* restart if there was no parent component */
scoutfs_dir_free_backref_path(sb, list);
goto retry;
}
goto out;
}
par_ino = first_backref_dir_ino(list);
}
ret = 0;
out:
if (ret < 0)
scoutfs_dir_free_backref_path(sb, list);
return ret;
}
/*
* Given two parent dir inos, return the ancestor of p2 that is p1's
* child when p1 is also an ancestor of p2: p1/p/[...]/p2. This can
* return p2.
*
* We do this by walking link backref items. Each entry can be thought
* of as a dirent stored at the target. So the parent dir is stored in
* the target.
*
* The caller holds the global rename lock, and the link backref walk
* locks each inode as it looks up backrefs.
*/
static int item_d_ancestor(struct super_block *sb, u64 p1, u64 p2, u64 *p_ret)
{
struct scoutfs_link_backref_entry *ent;
LIST_HEAD(list);
int ret;
u64 p;
*p_ret = 0;
if (p2 == SCOUTFS_ROOT_INO) {
ret = 0;
goto out;
}
ret = scoutfs_dir_get_backref_path(sb, p2, 0, 0, &list);
if (ret)
goto out;
p = p2;
	/* the list is root-first; walk it in reverse, from the inode up */
	list_for_each_entry_reverse(ent, &list, head) {
if (ent->dir_ino == p1) {
*p_ret = p;
ret = 0;
break;
}
p = ent->dir_ino;
}
out:
scoutfs_dir_free_backref_path(sb, &list);
return ret;
}
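/*
 * A worked example of item_d_ancestor() with a made up path p1/a/b/p2:
 * the backref walk from p2 visits (child, parent) pairs (p2, b),
 * (b, a), and (a, p1) from the inode up.  The parent matches p1 on the
 * third pair, so we return the child we arrived from, a, which is the
 * ancestor of p2 that is p1's child.
 */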
/*
* The vfs checked the relationship between dirs, the source, and target
* before acquiring cluster locks. All that could have changed. If
* we're renaming between parent dirs then we try to verify the basics
* of those checks using our backref items.
*
* Compare this to lock_rename()'s use of d_ancestor() and what its
* caller does with the returned ancestor.
*
* The caller only holds the global rename cluster lock.
* item_d_ancestor is going to walk backref paths and acquire and
* release locks for each target inode in the path.
*/
static int verify_ancestors(struct super_block *sb, u64 p1, u64 p2,
u64 old_ino, u64 new_ino)
{
int ret;
u64 p;
ret = item_d_ancestor(sb, p1, p2, &p);
if (ret == 0 && p == 0)
ret = item_d_ancestor(sb, p2, p1, &p);
if (ret == 0 && p && (p == old_ino || p == new_ino))
ret = -EINVAL;
return ret;
}
/*
* The vfs performs checks on cached inodes and dirents before calling
* here. It doesn't hold any locks so all of those checks can be based
* on cached state that has been invalidated by other operations in the
* cluster before we get here.
*
* We do the expedient thing today and verify the basic structural
* checks after we get cluster locks. We perform topology checks
* analogous to the d_ancestor() walks in lock_rename() after acquiring
* a clustered equivalent of the vfs rename lock. We then lock the dir
* and target inodes and verify that the entries assumed by the function
* arguments still exist.
*
* We don't duplicate all the permissions checking in the vfs
* (may_create(), etc, are all static.). This means racing renames can
* succeed after other nodes have gotten success out of changes to
* permissions that should have forbidden renames.
*
* All of this wouldn't be necessary if we could get prepare/complete
* callbacks around rename that'd let us lock the inodes, dirents, and
* topology while the vfs walks dentries and uses inodes.
*
* We acquire the inode locks in inode number order. Because of our
* inode group locking we can't define lock ordering correctness by
* properties that can be different in a given group. This prevents us
* from using parent/child locking orders as two groups can have both
* parent and child relationships to each other.
*/
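/*
 * For example, two dirs with inode numbers 5 and 9 are always locked 5
 * then 9 under this scheme, whether 5 is 9's parent, 9 is 5's parent,
 * or the two rename arguments point into each other's subtrees.
 */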
static int scoutfs_rename_common(KC_VFS_NS_DEF
struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct super_block *sb = old_dir->i_sb;
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct scoutfs_lock *rename_lock = NULL;
struct scoutfs_lock *old_dir_lock = NULL;
struct scoutfs_lock *new_dir_lock = NULL;
struct scoutfs_lock *old_inode_lock = NULL;
struct scoutfs_lock *new_inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_dirent new_dent;
struct scoutfs_dirent old_dent;
struct kc_timespec now;
bool ins_new = false;
bool del_new = false;
bool ins_old = false;
LIST_HEAD(ind_locks);
u64 ind_seq;
u64 old_hash;
u64 new_hash;
u64 new_pos;
int ret;
int err;
trace_scoutfs_rename(sb, old_dir, old_dentry, new_dir, new_dentry);
old_hash = dirent_name_hash(old_dentry->d_name.name,
old_dentry->d_name.len);
new_hash = dirent_name_hash(new_dentry->d_name.name,
new_dentry->d_name.len);
if (new_dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
/* if dirs are different make sure ancestor relationships are valid */
if (old_dir != new_dir) {
ret = scoutfs_lock_rename(sb, SCOUTFS_LOCK_WRITE, 0,
&rename_lock);
if (ret)
return ret;
ret = verify_ancestors(sb, scoutfs_ino(old_dir),
scoutfs_ino(new_dir),
scoutfs_ino(old_inode),
new_inode ? scoutfs_ino(new_inode) : 0);
if (ret)
goto out_unlock;
}
/* lock all the inodes */
ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE,
SCOUTFS_LKF_REFRESH_INODE,
old_dir, &old_dir_lock,
new_dir, &new_dir_lock,
old_inode, &old_inode_lock,
new_inode, &new_inode_lock);
if (ret)
goto out_unlock;
/* make sure that the entries assumed by the argument still exist */
ret = validate_dentry(sb, scoutfs_ino(old_dir), old_dentry, old_dir_lock) ?:
validate_dentry(sb, scoutfs_ino(new_dir), new_dentry, new_dir_lock);
if (ret)
goto out_unlock;
/* test dir i_size now that it's refreshed */
if (new_inode && S_ISDIR(new_inode->i_mode) && i_size_read(new_inode)) {
ret = -ENOTEMPTY;
goto out_unlock;
}
if ((flags & RENAME_NOREPLACE) && (new_inode != NULL)) {
ret = -EEXIST;
goto out_unlock;
}
if ((old_inode && (ret = scoutfs_inode_check_retention(old_inode))) ||
(new_inode && (ret = scoutfs_inode_check_retention(new_inode))))
goto out_unlock;
if (should_orphan(new_inode)) {
ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
&orph_lock);
if (ret < 0)
goto out_unlock;
}
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, old_inode, false) ?:
(new_dir == old_dir ? 0 :
scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
(new_inode == NULL ? 0 :
scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
if (ret > 0)
goto retry;
if (ret)
goto out_unlock;
/* get a pos for the new entry */
new_pos = SCOUTFS_I(new_dir)->next_readdir_pos++;
/* dirty the inodes so that updating doesn't fail */
ret = scoutfs_dirty_inode_item(old_dir, old_dir_lock) ?:
scoutfs_dirty_inode_item(old_inode, old_inode_lock) ?:
(old_dir != new_dir ?
scoutfs_dirty_inode_item(new_dir, new_dir_lock) : 0) ?:
(new_inode ?
scoutfs_dirty_inode_item(new_inode, new_inode_lock) : 0);
if (ret)
goto out;
/* remove the new entry if it exists */
if (new_inode) {
ret = lookup_dirent(sb, scoutfs_ino(new_dir), new_dentry->d_name.name,
new_dentry->d_name.len, new_hash, &new_dent, new_dir_lock);
if (ret < 0)
goto out;
ret = del_entry_items(sb, scoutfs_ino(new_dir), le64_to_cpu(new_dent.hash),
le64_to_cpu(new_dent.pos), scoutfs_ino(new_inode),
new_dir_lock, new_inode_lock);
if (ret)
goto out;
ins_new = true;
}
/* create the new entry */
ret = add_entry_items(sb, scoutfs_ino(new_dir), new_hash, new_pos,
new_dentry->d_name.name, new_dentry->d_name.len,
scoutfs_ino(old_inode), old_inode->i_mode,
new_dir_lock, old_inode_lock);
if (ret)
goto out;
del_new = true;
ret = lookup_dirent(sb, scoutfs_ino(old_dir), old_dentry->d_name.name,
old_dentry->d_name.len, old_hash, &old_dent, old_dir_lock);
if (ret < 0)
goto out;
/* remove the old entry */
ret = del_entry_items(sb, scoutfs_ino(old_dir), le64_to_cpu(old_dent.hash),
le64_to_cpu(old_dent.pos), scoutfs_ino(old_inode),
old_dir_lock, old_inode_lock);
if (ret)
goto out;
ins_old = true;
if (should_orphan(new_inode)) {
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock,
new_inode_lock);
if (ret)
goto out;
}
/* won't fail from here on out, update all the vfs structs */
/* the caller will use d_move to move the old_dentry into place */
set_dentry_fsdata(old_dentry, new_dir_lock);
i_size_write(old_dir, i_size_read(old_dir) - old_dentry->d_name.len);
if (!new_inode)
i_size_write(new_dir, i_size_read(new_dir) +
new_dentry->d_name.len);
if (new_inode) {
drop_nlink(new_inode);
if (S_ISDIR(new_inode->i_mode)) {
drop_nlink(new_dir);
drop_nlink(new_inode);
}
}
if (S_ISDIR(old_inode->i_mode) && (old_dir != new_dir)) {
drop_nlink(old_dir);
inc_nlink(new_dir);
}
now = current_time(old_inode);
old_dir->i_ctime = now;
old_dir->i_mtime = now;
if (new_dir != old_dir) {
new_dir->i_ctime = now;
new_dir->i_mtime = now;
}
old_inode->i_ctime = now;
if (new_inode)
new_inode->i_ctime = now;
inode_inc_iversion(old_dir);
inode_inc_iversion(old_inode);
if (new_dir != old_dir)
inode_inc_iversion(new_dir);
if (new_inode)
inode_inc_iversion(new_inode);
scoutfs_update_inode_item(old_dir, old_dir_lock, &ind_locks);
scoutfs_update_inode_item(old_inode, old_inode_lock, &ind_locks);
if (new_dir != old_dir)
scoutfs_update_inode_item(new_dir, new_dir_lock, &ind_locks);
if (new_inode)
scoutfs_update_inode_item(new_inode, new_inode_lock,
&ind_locks);
ret = 0;
out:
if (ret) {
/*
* XXX We have to clean up partial item deletions today
* because we can't have two dirents existing in a
* directory that point to different inodes. If we
* could we'd create the new name then everything after
* that is deletion that will only fail cleanly or
* succeed. Maybe we could have an item replace call
* that gives us the dupe to re-insert on cleanup? Not
* sure.
*
		 * It's safe to use the saved dirent values here because
		 * they haven't been updated if we saw an error.
*/
err = 0;
if (ins_old)
err = add_entry_items(sb, scoutfs_ino(old_dir),
le64_to_cpu(old_dent.hash),
le64_to_cpu(old_dent.pos),
old_dentry->d_name.name,
old_dentry->d_name.len,
scoutfs_ino(old_inode),
old_inode->i_mode,
old_dir_lock,
old_inode_lock);
if (del_new && err == 0)
err = del_entry_items(sb, scoutfs_ino(new_dir),
new_hash, new_pos,
scoutfs_ino(old_inode),
new_dir_lock, old_inode_lock);
if (ins_new && err == 0)
err = add_entry_items(sb, scoutfs_ino(new_dir),
le64_to_cpu(new_dent.hash),
le64_to_cpu(new_dent.pos),
new_dentry->d_name.name,
new_dentry->d_name.len,
scoutfs_ino(new_inode),
new_inode->i_mode,
new_dir_lock,
new_inode_lock);
/* XXX freak out: panic, go read only, etc */
BUG_ON(err);
}
scoutfs_release_trans(sb);
out_unlock:
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, old_inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, new_inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
return ret;
}
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
static int scoutfs_rename(struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry)
{
return scoutfs_rename_common(KC_VFS_INIT_NS
old_dir, old_dentry, new_dir, new_dentry, 0);
}
#endif
static int scoutfs_rename2(KC_VFS_NS_DEF
struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
return scoutfs_rename_common(KC_VFS_NS
old_dir, old_dentry, new_dir, new_dentry, flags);
}
#ifdef KC_FMODE_KABI_ITERATE
/* we only need this to set the iterate flag for kabi :/ */
static int scoutfs_dir_open(struct inode *inode, struct file *file)
{
file->f_mode |= FMODE_KABI_ITERATE;
return 0;
}
#endif
static int scoutfs_tmpfile(KC_VFS_NS_DEF
struct inode *dir,
#ifdef KC_D_TMPFILE_DENTRY
struct dentry *dentry,
#else
struct file *file,
#endif
umode_t mode)
{
#ifndef KC_D_TMPFILE_DENTRY
struct dentry *dentry = file->f_path.dentry;
#endif
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
struct scoutfs_lock *dir_lock = NULL;
struct scoutfs_lock *inode_lock = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_inode_info *si;
LIST_HEAD(ind_locks);
int ret;
if (dentry->d_name.len > SCOUTFS_NAME_LEN)
return -ENAMETOOLONG;
inode = lock_hold_create(dir, dentry, mode, 0,
&dir_lock, &inode_lock, &orph_lock, &ind_locks);
if (IS_ERR(inode))
return PTR_ERR(inode);
si = SCOUTFS_I(inode);
ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock);
if (ret < 0)
goto out; /* XXX returning error but items created */
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
si->crtime = inode->i_mtime;
insert_inode_hash(inode);
ihold(inode); /* need to update inode modifications in d_tmpfile */
#ifdef KC_D_TMPFILE_DENTRY
d_tmpfile(dentry, inode);
#else
d_tmpfile(file, inode);
#endif
inode_inc_iversion(inode);
scoutfs_forest_inc_inode_count(sb);
scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
scoutfs_inode_index_unlock(sb, &ind_locks);
#ifndef KC_D_TMPFILE_DENTRY
ret = finish_open_simple(file, 0);
#endif
out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
if (!IS_ERR_OR_NULL(inode))
iput(inode);
return ret;
}
const struct inode_operations scoutfs_symlink_iops = {
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.readlink = generic_readlink,
.follow_link = scoutfs_follow_link,
.put_link = scoutfs_put_link,
#else
.get_link = scoutfs_get_link,
#endif
.getattr = scoutfs_getattr,
.setattr = scoutfs_setattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
#endif
.listxattr = scoutfs_listxattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.removexattr = generic_removexattr,
#endif
.get_acl = scoutfs_get_acl,
#ifndef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.tmpfile = scoutfs_tmpfile,
.rename = scoutfs_rename_common,
.symlink = scoutfs_symlink,
.unlink = scoutfs_unlink,
.link = scoutfs_link,
.mkdir = scoutfs_mkdir,
.create = scoutfs_create,
.lookup = scoutfs_lookup,
#endif
};
const struct file_operations scoutfs_dir_fops = {
.iterate = scoutfs_readdir,
#ifdef KC_FMODE_KABI_ITERATE
.open = scoutfs_dir_open,
#endif
.unlocked_ioctl = scoutfs_ioctl,
.fsync = scoutfs_file_fsync,
.llseek = generic_file_llseek,
};
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
const struct inode_operations_wrapper scoutfs_dir_iops = {
.ops = {
#else
const struct inode_operations scoutfs_dir_iops = {
#endif
.lookup = scoutfs_lookup,
.mknod = scoutfs_mknod,
.create = scoutfs_create,
.mkdir = scoutfs_mkdir,
.link = scoutfs_link,
.unlink = scoutfs_unlink,
.rmdir = scoutfs_unlink,
.getattr = scoutfs_getattr,
.setattr = scoutfs_setattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.rename = scoutfs_rename,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
#endif
.listxattr = scoutfs_listxattr,
.get_acl = scoutfs_get_acl,
.symlink = scoutfs_symlink,
.permission = scoutfs_permission,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
},
#endif
.tmpfile = scoutfs_tmpfile,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
.rename2 = scoutfs_rename2,
#else
.rename = scoutfs_rename2,
#endif
};