/*
 * Copyright (C) 2016 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include "format.h"
#include "file.h"
#include "dir.h"
#include "inode.h"
#include "ioctl.h"
#include "key.h"
#include "msg.h"
#include "super.h"
#include "trans.h"
#include "xattr.h"
#include "item.h"
#include "lock.h"
#include "hash.h"
#include "omap.h"
#include "forest.h"
#include "acl.h"
#include "counters.h"
#include "quota.h"
#include "scoutfs_trace.h"

/*
 * Directory entries are stored in three different items. Each has the
 * same key format and all have identical values which contain the full
 * entry name.
 *
 * Entries for name lookup are stored at the hash of the name and the
 * readdir position. Including the position lets us create names
 * without having to read the items to check for hash collisions.
 * Lookup iterates over all the positions with the same hash values and
 * compares the names.
 *
 * Entries for readdir are stored in an increasing unique readdir
 * position. This results in returning entries in creation order which
 * matches inode allocation order and avoids random inode access
 * patterns during readdir.
 *
 * Entries for link backref traversal are stored at the target inode
 * sorted by the parent dir and the entry's position in the parent dir.
 * This keeps link backref users away from the higher contention area of
 * dirent items in parent dirs.
 *
 * All the entries have a dirent struct with the full name in their
 * value. The dirent struct contains the name hash and readdir position
 * so that any item use can reference all the items for a given entry.
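 *
 * For example, creating the name "foo" for inode I at readdir position P
 * in directory D results in three items with identical dirent values:
 *
 *   lookup:   (D, SCOUTFS_DIRENT_TYPE, hash("foo"), P)
 *   readdir:  (D, SCOUTFS_READDIR_TYPE, P, 0)
 *   backref:  (I, SCOUTFS_LINK_BACKREF_TYPE, D, P)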
*/ static unsigned int mode_to_type(umode_t mode) { #define S_SHIFT 12 static unsigned char mode_types[S_IFMT >> S_SHIFT] = { [S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO, [S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR, [S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR, [S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK, [S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG, [S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK, [S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK, }; return mode_types[(mode & S_IFMT) >> S_SHIFT]; #undef S_SHIFT } static unsigned int dentry_type(enum scoutfs_dentry_type type) { static unsigned char types[] = { [SCOUTFS_DT_FIFO] = DT_FIFO, [SCOUTFS_DT_CHR] = DT_CHR, [SCOUTFS_DT_DIR] = DT_DIR, [SCOUTFS_DT_BLK] = DT_BLK, [SCOUTFS_DT_REG] = DT_REG, [SCOUTFS_DT_LNK] = DT_LNK, [SCOUTFS_DT_SOCK] = DT_SOCK, [SCOUTFS_DT_WHT] = DT_WHT, }; if (type < ARRAY_SIZE(types)) return types[type]; return DT_UNKNOWN; } static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags); const struct dentry_operations scoutfs_dentry_ops = { .d_revalidate = scoutfs_d_revalidate, }; static void init_dirent_key(struct scoutfs_key *key, u8 type, u64 ino, u64 major, u64 minor) { *key = (struct scoutfs_key) { .sk_zone = SCOUTFS_FS_ZONE, .skd_ino = cpu_to_le64(ino), .sk_type = type, .skd_major = cpu_to_le64(major), .skd_minor = cpu_to_le64(minor), }; } static unsigned int dirent_bytes(unsigned int name_len) { return offsetof(struct scoutfs_dirent, name[name_len]); } static struct scoutfs_dirent *alloc_dirent(unsigned int name_len) { return kmalloc(dirent_bytes(name_len), GFP_NOFS); } /* * Test a bit number as though an array of bytes is a large len-bit * big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is * the MSB of the first byte. */ static int test_be_bytes_bit(int nr, const char *bytes, int len) { return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7)); } /* * Generate a 32bit "fingerprint" of the name by extracting 32 evenly * distributed bits from the name. The intent is to have the sort order * of the fingerprints reflect the memcmp() sort order of the names * while mapping large names down to small fs keys. * * Names that are smaller than 32bits are biased towards the high bits * of the fingerprint so that most significant bits of the fingerprints * consistently reflect the initial characters of the names. */ static u32 dirent_name_fingerprint(const char *name, unsigned int name_len) { int name_bits = name_len * 8; int skip = max(name_bits / 32, 1); u32 fp = 0; int f; int n; for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip) fp |= !!test_be_bytes_bit(n, name, name_bits) << f; return fp; } static u64 dirent_name_hash(const char *name, unsigned int name_len) { return scoutfs_hash32(name, name_len) | ((u64)dirent_name_fingerprint(name, name_len) << 32); } static bool dirent_names_equal(const char *a_name, unsigned int a_len, const char *b_name, unsigned int b_len) { return a_len == b_len && memcmp(a_name, b_name, a_len) == 0; } /* * Looks for the dirent item and fills the caller's dirent if it finds * it. Returns item lookup errors including -ENOENT if it's not found. 
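 *
 * The hash argument is dirent_name_hash(): the low 32 bits are a hash of
 * the name and the high 32 bits are the name fingerprint, so two names
 * rarely share a full 64-bit hash value. Collisions are resolved by
 * walking the minor (readdir position) keys under the hash and comparing
 * the stored names against the caller's.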
*/ static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name, unsigned name_len, u64 hash, struct scoutfs_dirent *dent_ret, struct scoutfs_lock *lock) { struct scoutfs_key last_key; struct scoutfs_key key; struct scoutfs_dirent *dent = NULL; int ret; dent = alloc_dirent(SCOUTFS_NAME_LEN); if (!dent) { return -ENOMEM; } init_dirent_key(&key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, 0); init_dirent_key(&last_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, U64_MAX); for (;;) { ret = scoutfs_item_next(sb, &key, &last_key, dent, dirent_bytes(SCOUTFS_NAME_LEN), lock); if (ret < 0) break; ret -= sizeof(struct scoutfs_dirent); if (ret < 1 || ret > SCOUTFS_NAME_LEN) { scoutfs_corruption(sb, SC_DIRENT_NAME_LEN, corrupt_dirent_name_len, "dir_ino %llu hash %llu key "SK_FMT" len %d", dir_ino, hash, SK_ARG(&key), ret); ret = -EIO; goto out; } if (dirent_names_equal(name, name_len, dent->name, ret)) { *dent_ret = *dent; ret = 0; break; } if (le64_to_cpu(key.skd_minor) == U64_MAX) { ret = -ENOENT; break; } le64_add_cpu(&key.skd_minor, 1); } out: kfree(dent); return ret; } static int lookup_dentry_dirent(struct super_block *sb, u64 dir_ino, struct dentry *dentry, struct scoutfs_dirent *dent_ret, struct scoutfs_lock *lock) { return lookup_dirent(sb, dir_ino, dentry->d_name.name, dentry->d_name.len, dirent_name_hash(dentry->d_name.name, dentry->d_name.len), dent_ret, lock); } static u64 dentry_parent_ino(struct dentry *dentry) { struct dentry *parent = NULL; struct inode *dir; u64 dir_ino = 0; if ((parent = dget_parent(dentry)) && (dir = parent->d_inode)) dir_ino = scoutfs_ino(dir); dput(parent); return dir_ino; } /* negative dentries return 0, our root ino is non-zero (1) */ static u64 dentry_ino(struct dentry *dentry) { return dentry->d_inode ? scoutfs_ino(dentry->d_inode) : 0; } static void set_dentry_fsdata(struct dentry *dentry, struct scoutfs_lock *lock) { void *now = (void *)(unsigned long)lock->refresh_gen; void *was; /* didn't want to alloc :/ */ BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(u64)); BUILD_BUG_ON(sizeof(dentry->d_fsdata) != sizeof(long)); do { was = dentry->d_fsdata; } while (cmpxchg(&dentry->d_fsdata, was, now) != was); } static bool test_dentry_fsdata(struct dentry *dentry, u64 refresh) { u64 fsd = (unsigned long)READ_ONCE(dentry->d_fsdata); return fsd == refresh; } /* * Validate an operation caller's input dentry argument. If the fsdata * is valid then the underlying dirent items couldn't have changed and * we return 0. If fsdata is no longer protected by a lock or its * fields don't match then we check the dirent item. If the dirent item * doesn't match what the caller expected given their dentry fields then * we return an error. 
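 *
 * The errors describe the mismatch: -EEXIST when the caller expected a
 * negative dentry but a dirent exists, -ENOENT when the caller expected
 * a positive dentry but no dirent exists, and -ESTALE when the name is
 * linked to a different inode than the caller's dentry.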
*/ static int validate_dentry(struct super_block *sb, u64 dir_ino, struct dentry *dentry, struct scoutfs_lock *lock) { u64 ino = dentry_ino(dentry); struct scoutfs_dirent dent = {0,}; int ret; if (test_dentry_fsdata(dentry, lock->refresh_gen)) { ret = 0; goto out; } ret = lookup_dentry_dirent(sb, dir_ino, dentry, &dent, lock); if (ret < 0 && ret != -ENOENT) goto out; /* use negative zeroed dent when lookup gave -ENOENT */ if (!ino && dent.ino) { /* caller expected negative but there was a dirent */ ret = -EEXIST; } else if (ino && !dent.ino) { /* caller expected positive but there was no dirent */ ret = -ENOENT; } else if (ino != le64_to_cpu(dent.ino)) { /* name linked to different inode than caller's */ ret = -ESTALE; } else { /* dirent ino matches dentry ino */ ret = 0; } out: trace_scoutfs_validate_dentry(sb, dentry, dir_ino, ino, le64_to_cpu(dent.ino), lock->refresh_gen, ret); return ret; } static int scoutfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct super_block *sb = dentry->d_sb; u64 dir_ino = dentry_parent_ino(dentry); int ret; /* don't think this happens but we can find out */ if (IS_ROOT(dentry)) { scoutfs_inc_counter(sb, dentry_revalidate_root); if (!dentry->d_inode || (scoutfs_ino(dentry->d_inode) != SCOUTFS_ROOT_INO)) { ret = -EIO; } else { ret = 1; } goto out; } /* XXX what are the rules for _RCU? */ if (flags & LOOKUP_RCU) { scoutfs_inc_counter(sb, dentry_revalidate_rcu); ret = -ECHILD; goto out; } if (test_dentry_fsdata(dentry, scoutfs_lock_ino_refresh_gen(sb, dir_ino))) { scoutfs_inc_counter(sb, dentry_revalidate_valid); ret = 1; } else { scoutfs_inc_counter(sb, dentry_revalidate_invalid); ret = 0; } out: trace_scoutfs_d_revalidate(sb, dentry, flags, dir_ino, ret); if (ret < 0 && ret != -ECHILD) scoutfs_inc_counter(sb, dentry_revalidate_error); return ret; } /* * Because of rename, locks are ordered by inode number. To hold the * dir lock while calling iget, we might have to already hold a lesser * inode's lock while telling iget whether or not to lock. Instead of * adding all those moving pieces we drop the dir lock before calling * iget. We don't reuse inode numbers so we don't have to worry about * the target of the link changing. We will only follow the entry as it * existed before or after whatever modification is happening under the * dir lock and that can already legally race before or after our * lookup. */ static struct dentry *scoutfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct super_block *sb = dir->i_sb; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_dirent dent = {0,}; struct inode *inode; u64 ino = 0; u64 hash; int ret; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); if (dentry->d_name.len > SCOUTFS_NAME_LEN) { ret = -ENAMETOOLONG; goto out; } ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, dir, &dir_lock); if (ret) goto out; ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash, &dent, dir_lock); if (ret == -ENOENT) { ino = 0; ret = 0; } else if (ret == 0) { ino = le64_to_cpu(dent.ino); } if (ret == 0) set_dentry_fsdata(dentry, dir_lock); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ); out: if (ret < 0) inode = ERR_PTR(ret); else if (ino == 0) inode = NULL; else inode = scoutfs_iget(sb, ino, 0, 0); /* * We can't splice dir aliases into the dcache. dir entries * might have changed on other nodes so our dcache could still * contain them, rather than having been moved in rename. 
For * dirs, we use d_materialize_unique to remove any existing * aliases which must be stale. Our inode numbers aren't reused * so inodes pointed to by entries can't change types. */ if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode)) return d_materialise_unique(dentry, inode); else return d_splice_alias(inode, dentry); } /* * Helper to make iterating through dirent ptrs aligned */ static inline struct scoutfs_dirent *next_aligned_dirent(struct scoutfs_dirent *dent, u8 len) { return (void *)dent + ALIGN(offsetof(struct scoutfs_dirent, name[len]), __alignof__(struct scoutfs_dirent)); } /* * readdir simply iterates over the dirent items for the dir inode and * uses their offset as the readdir position. * * It will need to be careful not to read past the region of the dirent * hash offset keys that it has access to. */ static int scoutfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_dirent *dent = NULL; /* we'll store name_len in dent->__pad[0] */ #define hacky_name_len __pad[0] struct scoutfs_key last_key; struct scoutfs_key key; struct page *page = NULL; int name_len; u64 pos; int entries = 0; int ret; int complete = 0; struct scoutfs_dirent *end; if (!dir_emit_dots(file, ctx)) return 0; page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; end = page_address(page) + PAGE_SIZE; init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), SCOUTFS_DIRENT_LAST_POS, 0); /* * lock and fetch dirent items, until the page no longer fits * a max size dirent (288b). Then unlock and dir_emit the ones * we stored in the page. */ for (;;) { /* lock */ ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock); if (ret) break; dent = page_address(page); pos = ctx->pos; while (next_aligned_dirent(dent, SCOUTFS_NAME_LEN) < end) { init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode), pos, 0); ret = scoutfs_item_next(sb, &key, &last_key, dent, dirent_bytes(SCOUTFS_NAME_LEN), dir_lock); if (ret < 0) { if (ret == -ENOENT) { ret = 0; complete = 1; } break; } name_len = ret - sizeof(struct scoutfs_dirent); dent->hacky_name_len = name_len; if (name_len < 1 || name_len > SCOUTFS_NAME_LEN) { scoutfs_corruption(sb, SC_DIRENT_READDIR_NAME_LEN, corrupt_dirent_readdir_name_len, "dir_ino %llu pos %llu key "SK_FMT" len %d", scoutfs_ino(inode), pos, SK_ARG(&key), name_len); ret = -EIO; break; } pos = le64_to_cpu(dent->pos) + 1; dent = next_aligned_dirent(dent, name_len); entries++; } /* unlock */ scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_READ); if (ret < 0) break; dent = page_address(page); for (; entries > 0; entries--) { ctx->pos = le64_to_cpu(dent->pos); if (!dir_emit(ctx, dent->name, dent->hacky_name_len, le64_to_cpu(dent->ino), dentry_type(dent->type))) { ret = 0; goto out; } dent = next_aligned_dirent(dent, dent->hacky_name_len); /* always advance ctx->pos past */ ctx->pos++; } if (complete) break; } out: if (page) __free_page(page); return ret; } /* * Add all the items for the named link to the inode in the dir. Only * items are modified. The caller is responsible for locking, entering * a transaction, dirtying items, and managing the vfs structs. * * If this returns an error then nothing will have changed. 
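 *
 * The lookup and readdir items are created under the dir lock and the
 * link backref item under the inode lock; if any creation fails the
 * items created so far are deleted before returning.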
*/ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash, u64 pos, const char *name, unsigned name_len, u64 ino, umode_t mode, struct scoutfs_lock *dir_lock, struct scoutfs_lock *inode_lock) { struct scoutfs_dirent *dent = NULL; struct scoutfs_key rdir_key; struct scoutfs_key ent_key; struct scoutfs_key lb_key; bool del_rdir = false; bool del_ent = false; int ret; dent = alloc_dirent(name_len); if (!dent) { return -ENOMEM; } /* initialize the dent */ *dent = (struct scoutfs_dirent) { .ino = cpu_to_le64(ino), .hash = cpu_to_le64(hash), .pos = cpu_to_le64(pos), .type = mode_to_type(mode), }; memcpy(dent->name, name, name_len); init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos); init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0); init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos); ret = scoutfs_item_create(sb, &ent_key, dent, dirent_bytes(name_len), dir_lock); if (ret) goto out; del_ent = true; ret = scoutfs_item_create(sb, &rdir_key, dent, dirent_bytes(name_len), dir_lock); if (ret) goto out; del_rdir = true; ret = scoutfs_item_create(sb, &lb_key, dent, dirent_bytes(name_len), inode_lock); out: if (ret < 0) { if (del_ent) scoutfs_item_delete(sb, &ent_key, dir_lock); if (del_rdir) scoutfs_item_delete(sb, &rdir_key, dir_lock); } kfree(dent); return ret; } /* * Delete all the items for the named link to the inode in the dir. * Only items are modified. The caller is responsible for locking, * entering a transaction, dirtying items, and managing the vfs structs. * * If this returns an error then nothing will have changed. */ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash, u64 pos, u64 ino, struct scoutfs_lock *dir_lock, struct scoutfs_lock *inode_lock) { struct scoutfs_key rdir_key; struct scoutfs_key ent_key; struct scoutfs_key lb_key; int ret; init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos); init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0); init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos); ret = scoutfs_item_dirty(sb, &ent_key, dir_lock) ?: scoutfs_item_dirty(sb, &rdir_key, dir_lock) ?: scoutfs_item_dirty(sb, &lb_key, inode_lock); if (ret == 0) { ret = scoutfs_item_delete(sb, &ent_key, dir_lock) ?: scoutfs_item_delete(sb, &rdir_key, dir_lock) ?: scoutfs_item_delete(sb, &lb_key, inode_lock); BUG_ON(ret); /* _dirty should have guaranteed success */ } return ret; } /* * Inode creation needs to hold dir and inode locks which can be greater * or less than each other. It seems easiest to keep the dual locking * here like it is for all the other dual locking of established inodes. * Except we don't have the inode struct yet when we're getting locks, * so we roll our own comparion between the two instead of pushing * complexity down the locking paths that acquire existing inodes in * order. 
*/ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev, struct scoutfs_lock **dir_lock, struct scoutfs_lock **inode_lock, struct scoutfs_lock **orph_lock, struct list_head *ind_locks) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; u64 ind_seq; int ret = 0; u64 ino; ret = scoutfs_alloc_ino(sb, S_ISDIR(mode), &ino); if (ret) return ERR_PTR(ret); if (ino < scoutfs_ino(dir)) { ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, inode_lock) ?: scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, dir, dir_lock); } else { ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, dir, dir_lock) ?: scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, inode_lock); } if (ret) goto out_unlock; ret = scoutfs_quota_check_inode(sb, dir); if (ret) goto out_unlock; if (orph_lock) { ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock); if (ret < 0) goto out_unlock; } retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?: scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?: scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) goto out_unlock; ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode) ?: scoutfs_init_acl_locked(inode, dir, *inode_lock, *dir_lock, ind_locks); if (ret < 0) goto out; scoutfs_inode_set_proj(inode, scoutfs_inode_get_proj(dir)); ret = scoutfs_dirty_inode_item(dir, *dir_lock); out: if (ret) scoutfs_release_trans(sb); out_unlock: if (ret) { scoutfs_inode_index_unlock(sb, ind_locks); scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE); *dir_lock = NULL; scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE); *inode_lock = NULL; if (orph_lock) { scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY); *orph_lock = NULL; } if (!IS_ERR_OR_NULL(inode)) iput(inode); inode = ERR_PTR(ret); } return inode; } static int scoutfs_mknod(KC_VFS_NS_DEF struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; struct scoutfs_inode_info *si; LIST_HEAD(ind_locks); u64 hash; u64 pos; int ret; if (dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); inode = lock_hold_create(dir, dentry, mode, rdev, &dir_lock, &inode_lock, NULL, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); si = SCOUTFS_I(inode); ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock); if (ret < 0) goto out; pos = SCOUTFS_I(dir)->next_readdir_pos++; ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, dentry->d_name.name, dentry->d_name.len, scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); if (ret) goto out; set_dentry_fsdata(dentry, dir_lock); i_size_write(dir, i_size_read(dir) + dentry->d_name.len); dir->i_mtime = dir->i_ctime = current_time(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime; si->crtime = inode->i_mtime; inode_inc_iversion(dir); inode_inc_iversion(inode); scoutfs_forest_inc_inode_count(sb); if (S_ISDIR(mode)) { inc_nlink(inode); inc_nlink(dir); } scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); scoutfs_inode_index_unlock(sb, &ind_locks); insert_inode_hash(inode); d_instantiate(dentry, inode); out: scoutfs_release_trans(sb); 
scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); /* XXX delete the inode item here */ if (ret && !IS_ERR_OR_NULL(inode)) iput(inode); return ret; } /* XXX hmm, do something with excl? */ static int scoutfs_create(KC_VFS_NS_DEF struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return scoutfs_mknod(KC_VFS_NS dir, dentry, mode | S_IFREG, 0); } static int scoutfs_mkdir(KC_VFS_NS_DEF struct inode *dir, struct dentry *dentry, umode_t mode) { return scoutfs_mknod(KC_VFS_NS dir, dentry, mode | S_IFDIR, 0); } static int scoutfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct scoutfs_lock *dir_lock; struct scoutfs_lock *inode_lock = NULL; struct scoutfs_lock *orph_lock = NULL; LIST_HEAD(ind_locks); bool del_orphan = false; u64 dir_size; u64 ind_seq; u64 hash; u64 pos; int ret; int err; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); if (dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, dir, &dir_lock, inode, &inode_lock, NULL, NULL, NULL, NULL); if (ret) return ret; ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock); if (ret < 0) goto out_unlock; if (inode->i_nlink >= SCOUTFS_LINK_MAX) { ret = -EMLINK; goto out_unlock; } dir_size = i_size_read(dir) + dentry->d_name.len; if (inode->i_nlink == 0) { del_orphan = true; ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode), &orph_lock); if (ret < 0) goto out_unlock; } retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) goto out_unlock; ret = scoutfs_dirty_inode_item(dir, dir_lock); if (ret) goto out; if (del_orphan) { ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret) goto out; } pos = SCOUTFS_I(dir)->next_readdir_pos++; ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, dentry->d_name.name, dentry->d_name.len, scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); if (ret) { err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */ goto out; } set_dentry_fsdata(dentry, dir_lock); i_size_write(dir, dir_size); dir->i_mtime = dir->i_ctime = current_time(inode); inode->i_ctime = dir->i_mtime; inc_nlink(inode); inode_inc_iversion(dir); inode_inc_iversion(inode); scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); atomic_inc(&inode->i_count); d_instantiate(dentry, inode); out: scoutfs_release_trans(sb); out_unlock: scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } static bool should_orphan(struct inode *inode) { if (inode == NULL) return false; if (S_ISDIR(inode->i_mode)) return inode->i_nlink == 2; return inode->i_nlink == 1; } /* * Unlink removes the entry from its item and removes the item if ours * was the only remaining entry. 
*/ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; struct inode *inode = dentry->d_inode; struct kc_timespec ts = current_time(inode); struct scoutfs_lock *inode_lock = NULL; struct scoutfs_lock *orph_lock = NULL; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_dirent dent; LIST_HEAD(ind_locks); u64 ind_seq; u64 hash; int ret; ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, dir, &dir_lock, inode, &inode_lock, NULL, NULL, NULL, NULL); if (ret) return ret; ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock); if (ret < 0) goto unlock; if (S_ISDIR(inode->i_mode) && i_size_read(inode)) { ret = -ENOTEMPTY; goto unlock; } ret = scoutfs_inode_check_retention(inode); if (ret < 0) goto unlock; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash, &dent, dir_lock); if (ret < 0) goto unlock; if (should_orphan(inode)) { ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode), &orph_lock); if (ret < 0) goto unlock; } retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?: scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; if (ret) goto unlock; if (should_orphan(inode)) { ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; } ret = del_entry_items(sb, scoutfs_ino(dir), le64_to_cpu(dent.hash), le64_to_cpu(dent.pos), scoutfs_ino(inode), dir_lock, inode_lock); if (ret) { ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(ret); /* should have been dirty */ goto out; } set_dentry_fsdata(dentry, dir_lock); dir->i_ctime = ts; dir->i_mtime = ts; i_size_write(dir, i_size_read(dir) - dentry->d_name.len); inode_inc_iversion(dir); inode_inc_iversion(inode); inode->i_ctime = ts; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); drop_nlink(inode); } scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); out: scoutfs_release_trans(sb); unlock: scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } static void init_symlink_key(struct scoutfs_key *key, u64 ino, u8 nr) { *key = (struct scoutfs_key) { .sk_zone = SCOUTFS_FS_ZONE, .sks_ino = cpu_to_le64(ino), .sk_type = SCOUTFS_SYMLINK_TYPE, .sks_nr = cpu_to_le64(nr), }; } /* * Operate on all the items that make up a symlink whose target might * have to be split up into multiple items each with a maximally sized * value. * * returns 0 or -errno from the item calls, particularly including * EEXIST, EIO, or ENOENT if the item population doesn't match what was * expected given the op. * * The target name can be null for deletion when val isn't used. Size * still has to be provided to determine the number of items. 
*/ enum symlink_ops { SYM_CREATE = 0, SYM_LOOKUP, SYM_DELETE, }; static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino, struct scoutfs_lock *lock, const char *target, size_t size) { struct scoutfs_key key; unsigned bytes; unsigned nr; int ret; int i; if (WARN_ON_ONCE(size == 0 || size > SCOUTFS_SYMLINK_MAX_SIZE || op > SYM_DELETE)) return -EINVAL; nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE); for (i = 0; i < nr; i++) { init_symlink_key(&key, ino, i); bytes = min_t(u64, size, SCOUTFS_MAX_VAL_SIZE); if (op == SYM_CREATE) ret = scoutfs_item_create(sb, &key, (void *)target, bytes, lock); else if (op == SYM_LOOKUP) ret = scoutfs_item_lookup_exact(sb, &key, (void *)target, bytes, lock); else if (op == SYM_DELETE) ret = scoutfs_item_delete(sb, &key, lock); if (ret) break; target += SCOUTFS_MAX_VAL_SIZE; size -= bytes; } return ret; } /* * Fill a buffer with the null terminated symlink, and return it * so callers can free it once the vfs is done. * * We chose to pay the runtime cost of per-call allocation and copy * overhead instead of wiring up symlinks to the page cache, storing * each small link in a full page, and later having to reclaim them. */ static void *scoutfs_get_link_target(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; char *path = NULL; loff_t size; int ret; ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); if (ret) return ERR_PTR(ret); size = i_size_read(inode); if (size == 0 || size > SCOUTFS_SYMLINK_MAX_SIZE) { scoutfs_corruption(sb, SC_SYMLINK_INODE_SIZE, corrupt_symlink_inode_size, "ino %llu size %llu", scoutfs_ino(inode), (u64)size); ret = -EIO; goto out; } /* unlikely, but possible I suppose */ if (size > PATH_MAX) { ret = -ENAMETOOLONG; goto out; } path = kmalloc(size, GFP_NOFS); if (!path) { ret = -ENOMEM; goto out; } ret = symlink_item_ops(sb, SYM_LOOKUP, scoutfs_ino(inode), inode_lock, path, size); if (ret == -ENOENT) { scoutfs_corruption(sb, SC_SYMLINK_MISSING_ITEM, corrupt_symlink_missing_item, "ino %llu size %llu", scoutfs_ino(inode), size); ret = -EIO; } else if (ret == 0 && path[size - 1]) { scoutfs_corruption(sb, SC_SYMLINK_NOT_NULL_TERM, corrupt_symlink_not_null_term, "ino %llu last %u", scoutfs_ino(inode), path[size - 1]); ret = -EIO; } out: if (ret < 0) { kfree(path); path = ERR_PTR(ret); } scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); return path; } #ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd) { char *path; path = scoutfs_get_link_target(dentry); if (!IS_ERR_OR_NULL(path)) nd_set_link(nd, path); return path; } static void scoutfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { if (!IS_ERR_OR_NULL(cookie)) kfree(cookie); } #else static const char *scoutfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *path; path = scoutfs_get_link_target(dentry); if (!IS_ERR_OR_NULL(path)) set_delayed_call(done, kfree_link, path); return path; } #endif /* * Symlink target paths can be annoyingly large. We store relatively * rare large paths in multiple items. 
*/ static int scoutfs_symlink(KC_VFS_NS_DEF struct inode *dir, struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; const int name_len = strlen(symname) + 1; struct inode *inode = NULL; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; struct scoutfs_inode_info *si; LIST_HEAD(ind_locks); u64 hash; u64 pos; int ret; hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len); /* path_max includes null as does our value for nd_set_link */ if (dentry->d_name.len > SCOUTFS_NAME_LEN || name_len > PATH_MAX || name_len > SCOUTFS_SYMLINK_MAX_SIZE) return -ENAMETOOLONG; inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, &dir_lock, &inode_lock, NULL, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); si = SCOUTFS_I(inode); ret = validate_dentry(sb, scoutfs_ino(dir), dentry, dir_lock); if (ret < 0) goto out; ret = symlink_item_ops(sb, SYM_CREATE, scoutfs_ino(inode), inode_lock, symname, name_len); if (ret) goto out; pos = SCOUTFS_I(dir)->next_readdir_pos++; ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos, dentry->d_name.name, dentry->d_name.len, scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); if (ret) goto out; set_dentry_fsdata(dentry, dir_lock); i_size_write(dir, i_size_read(dir) + dentry->d_name.len); dir->i_mtime = dir->i_ctime = current_time(inode); inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime; si->crtime = inode->i_ctime; i_size_write(inode, name_len); inode_inc_iversion(inode); scoutfs_forest_inc_inode_count(sb); scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); insert_inode_hash(inode); /* XXX need to set i_op/fop before here for sec callbacks */ d_instantiate(dentry, inode); inode = NULL; ret = 0; out: if (ret < 0) { /* XXX remove inode items */ symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock, NULL, name_len); } scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); if (!IS_ERR_OR_NULL(inode)) iput(inode); return ret; } int scoutfs_symlink_drop(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, u64 i_size) { int ret; ret = symlink_item_ops(sb, SYM_DELETE, ino, lock, NULL, i_size); if (ret == -ENOENT) ret = 0; return ret; } /* * Find the next link backref items for the given ino starting from the * given dir inode and final entry position. For each backref item we * add an allocated copy of it to the head of the caller's list. * * Callers who are building a path can add one entry for each parent. * They're left with a list of entries from the root down in list order. * * Callers who are gathering multiple entries for one inode get the * entries in the opposite order that their items are found. * * Returns +ve for number of entries added, -ENOENT if no entries were * found, or -errno on error. It weirdly won't return 0, but early * callers preferred -ENOENT so we use that for the case of no entries. * * Callers are comfortable with the race inherent to incrementally * gathering backrefs across multiple lock acquisitions. 
*/ int scoutfs_dir_add_next_linkrefs(struct super_block *sb, u64 ino, u64 dir_ino, u64 dir_pos, int count, struct list_head *list) { struct scoutfs_link_backref_entry *prev_ent = NULL; struct scoutfs_link_backref_entry *ent = NULL; struct scoutfs_lock *lock = NULL; struct scoutfs_key last_key; struct scoutfs_key key; int nr = 0; int len; int ret; init_dirent_key(&key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, dir_pos); init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX, U64_MAX); ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock); if (ret) goto out; while (nr < count) { ent = kmalloc(offsetof(struct scoutfs_link_backref_entry, dent.name[SCOUTFS_NAME_LEN]), GFP_NOFS); if (!ent) { ret = -ENOMEM; goto out; } INIT_LIST_HEAD(&ent->head); ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent, dirent_bytes(SCOUTFS_NAME_LEN), lock); if (ret < 0) { if (ret == -ENOENT && prev_ent) prev_ent->last = true; goto out; } len = ret - sizeof(struct scoutfs_dirent); if (len < 1 || len > SCOUTFS_NAME_LEN) { scoutfs_corruption(sb, SC_DIRENT_BACKREF_NAME_LEN, corrupt_dirent_backref_name_len, "ino %llu dir_ino %llu pos %llu key "SK_FMT" len %d", ino, dir_ino, dir_pos, SK_ARG(&key), len); ret = -EIO; goto out; } ent->dir_ino = le64_to_cpu(key.skd_major); ent->dir_pos = le64_to_cpu(key.skd_minor); ent->name_len = len; ent->d_type = dentry_type(ent->dent.type); ent->last = false; trace_scoutfs_dir_add_next_linkref_found(sb, ino, ent->dir_ino, ent->dir_pos, ent->name_len); list_add(&ent->head, list); prev_ent = ent; ent = NULL; nr++; scoutfs_key_inc(&key); } ret = 0; out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); trace_scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, count, nr, ret); kfree(ent); return nr ?: ret; } static u64 first_backref_dir_ino(struct list_head *list) { struct scoutfs_link_backref_entry *ent; ent = list_first_entry(list, struct scoutfs_link_backref_entry, head); return ent->dir_ino; } void scoutfs_dir_free_backref_path(struct super_block *sb, struct list_head *list) { struct scoutfs_link_backref_entry *ent; struct scoutfs_link_backref_entry *pos; list_for_each_entry_safe(ent, pos, list, head) { list_del_init(&ent->head); kfree(ent); } } /* * Give the caller the next path from the root to the inode by walking * backref items from the dir and name position, putting the backref keys * we find in the caller's list. * * Return 0 if we found a path, -ENOENT if we didn't, and -errno on error. * * If parents get unlinked while we're searching we can fail to make it * up to the root. We restart the search in that case. Parent dirs * couldn't have been unlinked while they still had entries and we won't * see links to the inode that have been unlinked. * * XXX Each path component traversal is consistent but that doesn't mean * that the total traversed path is consistent. If renames hit dirs * that have been visited and then dirs to be visited we can return a * path that was never present in the system: * * path to inode mv performed built up path * ---- * a/b/c/d/e/f * d/e/f * mv a/b/c/d/e a/b/c/ * a/b/c/e/f * mv a/b/c a/ * a/c/e/f * a/c/d/e/f * * XXX We'll protect against this by sampling the seq before the * traversal and restarting if we saw backref items whose seq was * greater than the start point. It's not precise in that it doesn't * also capture the rename of a dir that we already traversed but it * lets us complete the traversal in one pass that very rarely restarts. 
* * XXX and worry about traversing entirely dirty backref items with * equal seqs that have seen crazy modification? seems like we have to * sync if we see our dirty seq. */ int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino, u64 dir_pos, struct list_head *list) { int retries = 10; u64 par_ino; int ret; retry: if (retries-- == 0) { scoutfs_inc_counter(sb, dir_backref_excessive_retries); ret = -ELOOP; goto out; } /* get the next link name to the given inode */ ret = scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, 1, list); if (ret < 0) goto out; /* then get the names of all the parent dirs */ par_ino = first_backref_dir_ino(list); while (par_ino != SCOUTFS_ROOT_INO) { ret = scoutfs_dir_add_next_linkrefs(sb, par_ino, 0, 0, 1, list); if (ret < 0) { if (ret == -ENOENT) { /* restart if there was no parent component */ scoutfs_dir_free_backref_path(sb, list); goto retry; } goto out; } par_ino = first_backref_dir_ino(list); } ret = 0; out: if (ret < 0) scoutfs_dir_free_backref_path(sb, list); return ret; } /* * Given two parent dir inos, return the ancestor of p2 that is p1's * child when p1 is also an ancestor of p2: p1/p/[...]/p2. This can * return p2. * * We do this by walking link backref items. Each entry can be thought * of as a dirent stored at the target. So the parent dir is stored in * the target. * * The caller holds the global rename lock and link backref walk locks * each inode as it looks up backrefs. */ static int item_d_ancestor(struct super_block *sb, u64 p1, u64 p2, u64 *p_ret) { struct scoutfs_link_backref_entry *ent; LIST_HEAD(list); int ret; u64 p; *p_ret = 0; if (p2 == SCOUTFS_ROOT_INO) { ret = 0; goto out; } ret = scoutfs_dir_get_backref_path(sb, p2, 0, 0, &list); if (ret) goto out; p = p2; list_for_each_entry(ent, &list, head) { if (ent->dir_ino == p1) { *p_ret = p; ret = 0; break; } p = ent->dir_ino; } out: scoutfs_dir_free_backref_path(sb, &list); return ret; } /* * The vfs checked the relationship between dirs, the source, and target * before acquiring clusters locks. All that could have changed. If * we're renaming between parent dirs then we try to verify the basics * of those checks using our backref items. * * Compare this to lock_rename()'s use of d_ancestor() and what it's * caller does with the returned ancestor. * * The caller only holds the global rename cluster lock. * item_d_ancestor is going to walk backref paths and acquire and * release locks for each target inode in the path. */ static int verify_ancestors(struct super_block *sb, u64 p1, u64 p2, u64 old_ino, u64 new_ino) { int ret; u64 p; ret = item_d_ancestor(sb, p1, p2, &p); if (ret == 0 && p == 0) ret = item_d_ancestor(sb, p2, p1, &p); if (ret == 0 && p && (p == old_ino || p == new_ino)) ret = -EINVAL; return ret; } /* * The vfs performs checks on cached inodes and dirents before calling * here. It doesn't hold any locks so all of those checks can be based * on cached state that has been invalidated by other operations in the * cluster before we get here. * * We do the expedient thing today and verify the basic structural * checks after we get cluster locks. We perform topology checks * analagous to the d_ancestor() walks in lock_rename() after acquiring * a clustered equivalent of the vfs rename lock. We then lock the dir * and target inodes and verify that the entries assumed by the function * arguments still exist. * * We don't duplicate all the permissions checking in the vfs * (may_create(), etc, are all static.). 
This means racing renames can * succeed after other nodes have gotten success out of changes to * permissions that should have forbidden renames. * * All of this wouldn't be necessary if we could get prepare/complete * callbacks around rename that'd let us lock the inodes, dirents, and * topology while the vfs walks dentries and uses inodes. * * We acquire the inode locks in inode number order. Because of our * inode group locking we can't define lock ordering correctness by * properties that can be different in a given group. This prevents us * from using parent/child locking orders as two groups can have both * parent and child relationships to each other. */ static int scoutfs_rename_common(KC_VFS_NS_DEF struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct super_block *sb = old_dir->i_sb; struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct scoutfs_lock *rename_lock = NULL; struct scoutfs_lock *old_dir_lock = NULL; struct scoutfs_lock *new_dir_lock = NULL; struct scoutfs_lock *old_inode_lock = NULL; struct scoutfs_lock *new_inode_lock = NULL; struct scoutfs_lock *orph_lock = NULL; struct scoutfs_dirent new_dent; struct scoutfs_dirent old_dent; struct kc_timespec now; bool ins_new = false; bool del_new = false; bool ins_old = false; LIST_HEAD(ind_locks); u64 ind_seq; u64 old_hash; u64 new_hash; u64 new_pos; int ret; int err; trace_scoutfs_rename(sb, old_dir, old_dentry, new_dir, new_dentry); old_hash = dirent_name_hash(old_dentry->d_name.name, old_dentry->d_name.len); new_hash = dirent_name_hash(new_dentry->d_name.name, new_dentry->d_name.len); if (new_dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; /* if dirs are different make sure ancestor relationships are valid */ if (old_dir != new_dir) { ret = scoutfs_lock_rename(sb, SCOUTFS_LOCK_WRITE, 0, &rename_lock); if (ret) return ret; ret = verify_ancestors(sb, scoutfs_ino(old_dir), scoutfs_ino(new_dir), scoutfs_ino(old_inode), new_inode ? scoutfs_ino(new_inode) : 0); if (ret) goto out_unlock; } /* lock all the inodes */ ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, old_dir, &old_dir_lock, new_dir, &new_dir_lock, old_inode, &old_inode_lock, new_inode, &new_inode_lock); if (ret) goto out_unlock; /* make sure that the entries assumed by the argument still exist */ ret = validate_dentry(sb, scoutfs_ino(old_dir), old_dentry, old_dir_lock) ?: validate_dentry(sb, scoutfs_ino(new_dir), new_dentry, new_dir_lock); if (ret) goto out_unlock; /* test dir i_size now that it's refreshed */ if (new_inode && S_ISDIR(new_inode->i_mode) && i_size_read(new_inode)) { ret = -ENOTEMPTY; goto out_unlock; } if ((flags & RENAME_NOREPLACE) && (new_inode != NULL)) { ret = -EEXIST; goto out_unlock; } if ((old_inode && (ret = scoutfs_inode_check_retention(old_inode))) || (new_inode && (ret = scoutfs_inode_check_retention(new_inode)))) goto out_unlock; if (should_orphan(new_inode)) { ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode), &orph_lock); if (ret < 0) goto out_unlock; } retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?: scoutfs_inode_index_prepare(sb, &ind_locks, old_inode, false) ?: (new_dir == old_dir ? 0 : scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?: (new_inode == NULL ? 
0 : scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?: scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true); if (ret > 0) goto retry; if (ret) goto out_unlock; /* get a pos for the new entry */ new_pos = SCOUTFS_I(new_dir)->next_readdir_pos++; /* dirty the inodes so that updating doesn't fail */ ret = scoutfs_dirty_inode_item(old_dir, old_dir_lock) ?: scoutfs_dirty_inode_item(old_inode, old_inode_lock) ?: (old_dir != new_dir ? scoutfs_dirty_inode_item(new_dir, new_dir_lock) : 0) ?: (new_inode ? scoutfs_dirty_inode_item(new_inode, new_inode_lock) : 0); if (ret) goto out; /* remove the new entry if it exists */ if (new_inode) { ret = lookup_dirent(sb, scoutfs_ino(new_dir), new_dentry->d_name.name, new_dentry->d_name.len, new_hash, &new_dent, new_dir_lock); if (ret < 0) goto out; ret = del_entry_items(sb, scoutfs_ino(new_dir), le64_to_cpu(new_dent.hash), le64_to_cpu(new_dent.pos), scoutfs_ino(new_inode), new_dir_lock, new_inode_lock); if (ret) goto out; ins_new = true; } /* create the new entry */ ret = add_entry_items(sb, scoutfs_ino(new_dir), new_hash, new_pos, new_dentry->d_name.name, new_dentry->d_name.len, scoutfs_ino(old_inode), old_inode->i_mode, new_dir_lock, old_inode_lock); if (ret) goto out; del_new = true; ret = lookup_dirent(sb, scoutfs_ino(old_dir), old_dentry->d_name.name, old_dentry->d_name.len, old_hash, &old_dent, old_dir_lock); if (ret < 0) goto out; /* remove the old entry */ ret = del_entry_items(sb, scoutfs_ino(old_dir), le64_to_cpu(old_dent.hash), le64_to_cpu(old_dent.pos), scoutfs_ino(old_inode), old_dir_lock, old_inode_lock); if (ret) goto out; ins_old = true; if (should_orphan(new_inode)) { ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock, new_inode_lock); if (ret) goto out; } /* won't fail from here on out, update all the vfs structs */ /* the caller will use d_move to move the old_dentry into place */ set_dentry_fsdata(old_dentry, new_dir_lock); i_size_write(old_dir, i_size_read(old_dir) - old_dentry->d_name.len); if (!new_inode) i_size_write(new_dir, i_size_read(new_dir) + new_dentry->d_name.len); if (new_inode) { drop_nlink(new_inode); if (S_ISDIR(new_inode->i_mode)) { drop_nlink(new_dir); drop_nlink(new_inode); } } if (S_ISDIR(old_inode->i_mode) && (old_dir != new_dir)) { drop_nlink(old_dir); inc_nlink(new_dir); } now = current_time(old_inode); old_dir->i_ctime = now; old_dir->i_mtime = now; if (new_dir != old_dir) { new_dir->i_ctime = now; new_dir->i_mtime = now; } old_inode->i_ctime = now; if (new_inode) new_inode->i_ctime = now; inode_inc_iversion(old_dir); inode_inc_iversion(old_inode); if (new_dir != old_dir) inode_inc_iversion(new_dir); if (new_inode) inode_inc_iversion(new_inode); scoutfs_update_inode_item(old_dir, old_dir_lock, &ind_locks); scoutfs_update_inode_item(old_inode, old_inode_lock, &ind_locks); if (new_dir != old_dir) scoutfs_update_inode_item(new_dir, new_dir_lock, &ind_locks); if (new_inode) scoutfs_update_inode_item(new_inode, new_inode_lock, &ind_locks); ret = 0; out: if (ret) { /* * XXX We have to clean up partial item deletions today * because we can't have two dirents existing in a * directory that point to different inodes. If we * could we'd create the new name then everything after * that is deletion that will only fail cleanly or * succeed. Maybe we could have an item replace call * that gives us the dupe to re-insert on cleanup? Not * sure. * * It's safe to use dentry_info here 'cause they haven't * been updated if we saw an error. 
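 *
 * Cleanup runs in reverse: re-add the old entry if it was removed,
 * remove the new entry if it was added, and re-add the entry that the
 * new name replaced if one was deleted.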
*/ err = 0; if (ins_old) err = add_entry_items(sb, scoutfs_ino(old_dir), le64_to_cpu(old_dent.hash), le64_to_cpu(old_dent.pos), old_dentry->d_name.name, old_dentry->d_name.len, scoutfs_ino(old_inode), old_inode->i_mode, old_dir_lock, old_inode_lock); if (del_new && err == 0) err = del_entry_items(sb, scoutfs_ino(new_dir), new_hash, new_pos, scoutfs_ino(old_inode), new_dir_lock, old_inode_lock); if (ins_new && err == 0) err = add_entry_items(sb, scoutfs_ino(new_dir), le64_to_cpu(new_dent.hash), le64_to_cpu(new_dent.pos), new_dentry->d_name.name, new_dentry->d_name.len, scoutfs_ino(new_inode), new_inode->i_mode, new_dir_lock, new_inode_lock); /* XXX freak out: panic, go read only, etc */ BUG_ON(err); } scoutfs_release_trans(sb); out_unlock: scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, old_inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, new_inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); return ret; } #ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { return scoutfs_rename_common(KC_VFS_INIT_NS old_dir, old_dentry, new_dir, new_dentry, 0); } #endif static int scoutfs_rename2(KC_VFS_NS_DEF struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (flags & ~RENAME_NOREPLACE) return -EINVAL; return scoutfs_rename_common(KC_VFS_NS old_dir, old_dentry, new_dir, new_dentry, flags); } #ifdef KC_FMODE_KABI_ITERATE /* we only need this to set the iterate flag for kabi :/ */ static int scoutfs_dir_open(struct inode *inode, struct file *file) { file->f_mode |= FMODE_KABI_ITERATE; return 0; } #endif static int scoutfs_tmpfile(KC_VFS_NS_DEF struct inode *dir, #ifdef KC_D_TMPFILE_DENTRY struct dentry *dentry, #else struct file *file, #endif umode_t mode) { #ifndef KC_D_TMPFILE_DENTRY struct dentry *dentry = file->f_path.dentry; #endif struct super_block *sb = dir->i_sb; struct inode *inode = NULL; struct scoutfs_lock *dir_lock = NULL; struct scoutfs_lock *inode_lock = NULL; struct scoutfs_lock *orph_lock = NULL; struct scoutfs_inode_info *si; LIST_HEAD(ind_locks); int ret; if (dentry->d_name.len > SCOUTFS_NAME_LEN) return -ENAMETOOLONG; inode = lock_hold_create(dir, dentry, mode, 0, &dir_lock, &inode_lock, &orph_lock, &ind_locks); if (IS_ERR(inode)) return PTR_ERR(inode); si = SCOUTFS_I(inode); ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; /* XXX returning error but items created */ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); si->crtime = inode->i_mtime; insert_inode_hash(inode); ihold(inode); /* need to update inode modifications in d_tmpfile */ #ifdef KC_D_TMPFILE_DENTRY d_tmpfile(dentry, inode); #else d_tmpfile(file, inode); #endif inode_inc_iversion(inode); scoutfs_forest_inc_inode_count(sb); scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); scoutfs_inode_index_unlock(sb, &ind_locks); #ifndef KC_D_TMPFILE_DENTRY ret = finish_open_simple(file, 0); #endif out: scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, 
			SCOUTFS_LOCK_WRITE_ONLY);
	if (!IS_ERR_OR_NULL(inode))
		iput(inode);
	return ret;
}

const struct inode_operations scoutfs_symlink_iops = {
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.readlink = generic_readlink,
	.follow_link = scoutfs_follow_link,
	.put_link = scoutfs_put_link,
#else
	.get_link = scoutfs_get_link,
#endif
	.getattr = scoutfs_getattr,
	.setattr = scoutfs_setattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
#endif
	.listxattr = scoutfs_listxattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.removexattr = generic_removexattr,
#endif
#ifdef KC_GET_INODE_ACL
	.get_inode_acl = scoutfs_get_acl,
#else
	.get_acl = scoutfs_get_acl,
#endif
#ifndef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.tmpfile = scoutfs_tmpfile,
	.rename = scoutfs_rename_common,
	.symlink = scoutfs_symlink,
	.unlink = scoutfs_unlink,
	.link = scoutfs_link,
	.mkdir = scoutfs_mkdir,
	.create = scoutfs_create,
	.lookup = scoutfs_lookup,
#endif
};

const struct file_operations scoutfs_dir_fops = {
	.iterate = scoutfs_readdir,
#ifdef KC_FMODE_KABI_ITERATE
	.open = scoutfs_dir_open,
#endif
	.unlocked_ioctl = scoutfs_ioctl,
	.fsync = scoutfs_file_fsync,
	.llseek = generic_file_llseek,
};

#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
const struct inode_operations_wrapper scoutfs_dir_iops = {
	.ops = {
#else
const struct inode_operations scoutfs_dir_iops = {
#endif
	.lookup = scoutfs_lookup,
	.mknod = scoutfs_mknod,
	.create = scoutfs_create,
	.mkdir = scoutfs_mkdir,
	.link = scoutfs_link,
	.unlink = scoutfs_unlink,
	.rmdir = scoutfs_unlink,
	.getattr = scoutfs_getattr,
	.setattr = scoutfs_setattr,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.rename = scoutfs_rename,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
#endif
	.listxattr = scoutfs_listxattr,
#ifdef KC_GET_INODE_ACL
	.get_inode_acl = scoutfs_get_acl,
#else
	.get_acl = scoutfs_get_acl,
#endif
#ifdef KC_SET_ACL_DENTRY
	.set_acl = scoutfs_set_acl,
#endif
	.symlink = scoutfs_symlink,
	.permission = scoutfs_permission,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	},
#endif
	.tmpfile = scoutfs_tmpfile,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
	.rename2 = scoutfs_rename2,
#else
	.rename = scoutfs_rename2,
#endif
};