Files
scoutfs/kmod/src/inode.c
Zach Brown 1cbd84eece scoutfs: wire up sop->dirty_inode
We're using the generic block buffer_head write_begin and write_end
functions.  They call sop->dirty_inode() to update the inode i_size.  We
didn't have that method wired up so updates to the inode in the write
path weren't dirtying the inode item.  Lost i_size updates would
trivially lose data but we first noticed this when looking at inode item
sequence numbers while overwriting.

Signed-off-by: Zach Brown <zab@versity.com>
2016-11-08 16:05:36 -08:00

604 lines
15 KiB
C

/*
* Copyright (C) 2015 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/xattr.h>
#include <linux/mm.h>
#include "format.h"
#include "super.h"
#include "key.h"
#include "inode.h"
#include "btree.h"
#include "dir.h"
#include "filerw.h"
#include "scoutfs_trace.h"
#include "xattr.h"
#include "trans.h"
#include "btree.h"
#include "msg.h"
/*
* XXX
* - worry about i_ino truncation, not sure if we do anything
* - use inode item value lengths for forward/back compat
*/
/*
 * Slab cache that backs every struct scoutfs_inode_info.  It is created
 * in scoutfs_inode_init() and torn down in scoutfs_inode_exit().
 */
static struct kmem_cache *scoutfs_inode_cachep;
/*
 * Constructor handed to kmem_cache_create() for the inode info cache.
 * It initializes the embedded vfs inode and the xattr rwsem of the
 * object.
 */
static void scoutfs_inode_ctor(void *obj)
{
	struct scoutfs_inode_info *info = obj;

	inode_init_once(&info->inode);
	init_rwsem(&info->xattr_rwsem);
}
/*
 * Allocate a scoutfs_inode_info from the slab cache and hand back the
 * vfs inode embedded in it, or NULL if the allocation failed.
 */
struct inode *scoutfs_alloc_inode(struct super_block *sb)
{
	struct scoutfs_inode_info *info;

	info = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);

	return info ? &info->inode : NULL;
}
static void scoutfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
trace_printk("freeing inode %p\n", inode);
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
/*
 * Free the inode through call_rcu(); the actual release back to the
 * slab cache happens in scoutfs_i_callback().
 */
void scoutfs_destroy_inode(struct inode *inode)
{
	struct rcu_head *head = &inode->i_rcu;

	call_rcu(head, scoutfs_i_callback);
}
/*
 * Inode operations installed on regular files by set_inode_ops().  Only
 * the xattr methods are provided.
 */
static const struct inode_operations scoutfs_file_iops = {
	.getxattr	= scoutfs_getxattr,
	.setxattr	= scoutfs_setxattr,
	.removexattr	= scoutfs_removexattr,
	.listxattr	= scoutfs_listxattr,
};
/*
* Called once new inode allocation or inode reading has initialized
* enough of the inode for us to set the ops based on the mode.
*/
static void set_inode_ops(struct inode *inode)
{
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &scoutfs_file_aops;
inode->i_op = &scoutfs_file_iops;
inode->i_fop = &scoutfs_file_fops;
break;
case S_IFDIR:
inode->i_op = &scoutfs_dir_iops;
inode->i_fop = &scoutfs_dir_fops;
break;
case S_IFLNK:
inode->i_op = &scoutfs_symlink_iops;
break;
default:
// inode->i_op = &scoutfs_special_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
break;
}
}
/*
 * Copy the little-endian fields of an inode item into the in-memory
 * vfs inode and its scoutfs_inode_info.  This is the inverse of
 * store_inode().
 */
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
	struct scoutfs_inode_info *info = SCOUTFS_I(inode);

	/* type, ownership and link count */
	inode->i_mode = le32_to_cpu(cinode->mode);
	inode->i_rdev = le32_to_cpu(cinode->rdev);
	i_uid_write(inode, le32_to_cpu(cinode->uid));
	i_gid_write(inode, le32_to_cpu(cinode->gid));
	set_nlink(inode, le32_to_cpu(cinode->nlink));
	i_size_write(inode, le64_to_cpu(cinode->size));

	/* timestamps */
	inode->i_atime.tv_sec = le64_to_cpu(cinode->atime.sec);
	inode->i_atime.tv_nsec = le32_to_cpu(cinode->atime.nsec);
	inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
	inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
	inode->i_mtime.tv_sec = le64_to_cpu(cinode->mtime.sec);
	inode->i_mtime.tv_nsec = le32_to_cpu(cinode->mtime.nsec);

	/* scoutfs private fields */
	atomic64_set(&info->link_counter, le64_to_cpu(cinode->link_counter));
	info->salt = le32_to_cpu(cinode->salt);
}
/*
 * Look up the inode item for the given inode in the metadata btree and
 * load it into the vfs inode.
 *
 * Returns 0 on success, the negative error from the lookup, or -EIO if
 * the lookup succeeded but the item value isn't exactly the size of
 * struct scoutfs_inode.
 */
static int scoutfs_read_locked_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_btree_val val;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	int ret;

	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);
	scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));

	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
	if (ret < 0)
		return ret;

	/* a value of any other length can't be an inode we understand */
	if (ret != sizeof(sinode))
		return -EIO;

	load_inode(inode, &sinode);
	return 0;
}
static int scoutfs_iget_test(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
return ci->ino == *ino;
}
static int scoutfs_iget_set(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
u64 *ino = arg;
inode->i_ino = *ino;
ci->ino = *ino;
return 0;
}
/*
 * Get the vfs inode for the given inode number.  An inode that
 * iget5_locked() returns without I_NEW set is returned as is.  A new
 * inode is populated from its inode item and has its ops set before it
 * is unlocked.
 *
 * Returns the inode, or an ERR_PTR: -ENOMEM if no inode could be
 * obtained, or the error from reading the inode item.
 */
struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
{
	struct inode *inode;
	int ret;

	inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set,
			     &ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	/* nothing more to do unless we were handed a fresh inode */
	if (!(inode->i_state & I_NEW))
		return inode;

	ret = scoutfs_read_locked_inode(inode);
	if (ret) {
		iget_failed(inode);
		return ERR_PTR(ret);
	}

	set_inode_ops(inode);
	unlock_new_inode(inode);

	return inode;
}
/*
 * Fill a little-endian inode item from the in-memory vfs inode and its
 * scoutfs_inode_info.  This is the inverse of load_inode().
 *
 * NOTE(review): only the fields assigned here are written; if struct
 * scoutfs_inode has other members or padding the caller's buffer keeps
 * whatever it held before -- confirm against format.h.
 */
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
	struct scoutfs_inode_info *info = SCOUTFS_I(inode);

	/* type, ownership and link count */
	cinode->mode = cpu_to_le32(inode->i_mode);
	cinode->rdev = cpu_to_le32(inode->i_rdev);
	cinode->uid = cpu_to_le32(i_uid_read(inode));
	cinode->gid = cpu_to_le32(i_gid_read(inode));
	cinode->nlink = cpu_to_le32(inode->i_nlink);
	cinode->size = cpu_to_le64(i_size_read(inode));

	/* timestamps */
	cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
	cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
	cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
	cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
	cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	/* scoutfs private fields */
	cinode->link_counter = cpu_to_le64(atomic64_read(&info->link_counter));
	cinode->salt = cpu_to_le32(info->salt);
}
/*
 * Dirty and pin the inode's item so that a later update of the item
 * can't fail.  We usually don't want to have to unwind inode
 * modifications (which may have been made by shared vfs code!) because
 * the item update failed, so this is the point where errors such as
 * enospc for new logged dirty inode items get returned.
 *
 * The dirty inode item will be found by lookups in the interim so we
 * have to update it now with the current inode contents.
 *
 * Callers don't delete these dirty items on errors.  They remain valid
 * and will eventually be merged with the current item.  They can also
 * be found in the dirty block, which avoids dirtying again in the
 * future (say for repeated creations in a directory).
 *
 * The caller has to prevent sync between dirtying and updating the
 * inodes.
 *
 * Returns 0 on success or the error from dirtying the btree item.
 *
 * XXX this will have to do something about variable length inodes
 */
int scoutfs_dirty_inode_item(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_key key;
	int ret;

	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);

	ret = scoutfs_btree_dirty(sb, meta, &key);
	if (ret)
		return ret;

	trace_scoutfs_dirty_inode(inode);
	return 0;
}
/*
 * Copy the in-memory inode into its inode item.  We do this every time
 * the inode is modified so that blocks of items can be written out
 * without first having to hunt down dirty vfs inodes and safely copy
 * them into items.
 *
 * The caller must already have dirtied and pinned the item.  That is
 * what lets callers modify the vfs inode and call this without having
 * to handle errors or unwind; a failing btree update here is a BUG.
 */
void scoutfs_update_inode_item(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_btree_val val;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	int ret;

	store_inode(&sinode, inode);

	scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));
	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);

	ret = scoutfs_btree_update(sb, meta, &key, &val);
	BUG_ON(ret);

	trace_scoutfs_update_inode(inode);
}
/*
 * sop->dirty_inode() has no way to return failure, so our use of it
 * has to be careful to pin the inode during a transaction.  The
 * generic write paths pin the inode in write_begin and then call this
 * from write_end to update the inode.
 *
 * The caller should already hold a trans but it's cheap for us to hold
 * it again ourselves to be sure.
 *
 * If the caller didn't pin the inode and we fail to dirty the inode
 * item ourselves, or we can't hold the trans, the WARN_ON_ONCE at the
 * end hollers about it.
 */
void scoutfs_dirty_inode(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	int ret;

	ret = scoutfs_hold_trans(sb);
	if (ret)
		goto out;

	ret = scoutfs_dirty_inode_item(inode);
	if (!ret)
		scoutfs_update_inode_item(inode);

	scoutfs_release_trans(sb);
out:
	WARN_ON_ONCE(ret);
}
/*
 * A quick sample of the super block's next_ino counter, read under
 * next_ino_lock.
 *
 * Note that alloc_ino() hands out next_ino and then increments it, so
 * the value returned here is the number that would be allocated next
 * rather than the most recently allocated one.
 */
u64 scoutfs_last_ino(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	u64 sample;

	spin_lock(&sbi->next_ino_lock);
	sample = le64_to_cpu(sbi->super.next_ino);
	spin_unlock(&sbi->next_ino_lock);

	return sample;
}
/*
 * Hand out the super block's next_ino in *ino and advance the counter,
 * all under next_ino_lock.
 *
 * Returns 0 on success.  A next_ino of 0 is treated as exhaustion and
 * returns -ENOSPC without touching *ino.
 */
static int alloc_ino(struct super_block *sb, u64 *ino)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super = &sbi->super;
	int ret = -ENOSPC;

	spin_lock(&sbi->next_ino_lock);
	if (super->next_ino != 0) {
		*ino = le64_to_cpu(super->next_ino);
		le64_add_cpu(&super->next_ino, 1);
		ret = 0;
	}
	spin_unlock(&sbi->next_ino_lock);

	return ret;
}
/*
 * Allocate a new inode number and vfs inode, initialize the inode, and
 * insert its inode item into the metadata btree.  The caller is
 * responsible for creating links to the inode and updating it.  @dir
 * can be null.
 *
 * Returns the new inode or an ERR_PTR: the error from inode number
 * allocation, -ENOMEM if new_inode() fails, or the error from the btree
 * insertion (in which case the inode has been put).
 */
struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t rdev)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_inode_info *info;
	struct scoutfs_btree_val val;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	struct inode *inode;
	u64 ino;
	int ret;

	ret = alloc_ino(sb, &ino);
	if (ret)
		return ERR_PTR(ret);

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	/* scoutfs private state */
	info = SCOUTFS_I(inode);
	info->ino = ino;
	atomic64_set(&info->link_counter, 0);
	get_random_bytes(&info->salt, sizeof(info->salt));

	/* vfs state; the ops depend on i_mode and i_rdev being set first */
	inode->i_ino = ino; /* XXX overflow */
	inode->i_rdev = rdev;
	inode_init_owner(inode, dir, mode);
	inode_set_bytes(inode, 0);
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	set_inode_ops(inode);

	/* create the persistent item from the freshly initialized inode */
	store_inode(&sinode, inode);
	scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));
	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_INODE_KEY, 0);

	ret = scoutfs_btree_insert(inode->i_sb, meta, &key, &val);
	if (!ret)
		return inode;

	iput(inode);
	return ERR_PTR(ret);
}
/*
 * Delete the orphan item for the given inode number from the metadata
 * btree.  A missing item (-ENOENT) is not an error; any other result of
 * the delete is returned.
 */
static int remove_orphan_item(struct super_block *sb, u64 ino)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_key key;
	int ret;

	scoutfs_set_key(&key, ino, SCOUTFS_ORPHAN_KEY, 0);
	ret = scoutfs_btree_delete(sb, meta, &key);

	return ret == -ENOENT ? 0 : ret;
}
/*
 * Delete the items that belong to an inode while holding a
 * transaction: its xattrs, then its symlink or file block items
 * depending on @mode, then the inode item at @key, and finally its
 * orphan item.
 *
 * The first failing step stops the sequence and its error is returned;
 * the transaction is released on every path that managed to hold it.
 */
static int __delete_inode(struct super_block *sb, struct scoutfs_key *key,
			  u64 ino, umode_t mode)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	int ret;

	trace_delete_inode(sb, ino, mode);

	ret = scoutfs_hold_trans(sb);
	if (ret)
		return ret;

	ret = scoutfs_xattr_drop(sb, ino);
	if (ret)
		goto release;

	/* other file types have no extra per-type items dropped here */
	if (S_ISLNK(mode))
		ret = scoutfs_symlink_drop(sb, ino);
	else if (S_ISREG(mode))
		ret = scoutfs_truncate_block_items(sb, ino, 0);
	if (ret)
		goto release;

	ret = scoutfs_btree_delete(sb, meta, key);
	if (ret)
		goto release;

	ret = remove_orphan_item(sb, ino);

release:
	scoutfs_release_trans(sb);
	return ret;
}
/*
 * Remove all the items associated with a given inode.  The inode item
 * is read first to learn the mode, which decides the per-type items
 * that __delete_inode() drops.  Failures are only traced; nothing is
 * returned to the caller.
 */
static void delete_inode(struct super_block *sb, u64 ino)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_btree_val val;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	int ret;

	/* sample the inode mode, XXX don't need to copy whole thing here */
	scoutfs_set_key(&key, ino, SCOUTFS_INODE_KEY, 0);
	scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));
	val.check_size_eq = 1;

	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
	if (ret >= 0)
		ret = __delete_inode(sb, &key, ino, le32_to_cpu(sinode.mode));

	if (ret)
		trace_printk("drop items failed ret %d ino %llu\n", ret, ino);
}
/*
 * By the time we get here iput_final has already written out the
 * inode's dirty pages, so we're left with a clean inode to tear down.
 * When no links to the inode remain we also remove all of its
 * persistent structures.  Bad inodes are only cleared.
 */
void scoutfs_evict_inode(struct inode *inode)
{
	trace_printk("ino %llu nlink %d bad %d\n",
		     scoutfs_ino(inode), inode->i_nlink, is_bad_inode(inode));

	if (!is_bad_inode(inode)) {
		truncate_inode_pages_final(&inode->i_data);

		if (inode->i_nlink == 0)
			delete_inode(inode->i_sb, scoutfs_ino(inode));
	}

	clear_inode(inode);
}
/*
 * Defer the drop decision to generic_drop_inode() and trace the result
 * along with the link count and hash state.
 */
int scoutfs_drop_inode(struct inode *inode)
{
	int drop;

	drop = generic_drop_inode(inode);

	trace_printk("ret %d nlink %d unhashed %d\n",
		     drop, inode->i_nlink, inode_unhashed(inode));

	return drop;
}
/*
 * Process a single orphan item found by scoutfs_scan_orphans().
 *
 * The inode item is looked up for @ino.  If it no longer exists there
 * is nothing to do.  If it exists with a zero link count then all of
 * the inode's items, including the orphan item, are deleted.  An inode
 * that still has links is only warned about and left alone.
 *
 * Returns 0 on success or a negative errno from the lookup or from
 * deleting the inode's items.
 *
 * A successful lookup returns a non-negative value (see how
 * scoutfs_read_locked_inode() compares it against the value size), so
 * the lookup's return must never be passed back to the caller as is:
 * the scan treats any non-zero return as an error.
 */
static int process_orphaned_inode(struct super_block *sb, u64 ino)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_btree_val val;
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	int ret;

	scoutfs_set_key(&key, ino, SCOUTFS_INODE_KEY, 0);
	scoutfs_btree_init_val(&val, &sinode, sizeof(sinode));
	val.check_size_eq = 1;

	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;

	if (le32_to_cpu(sinode.nlink) != 0) {
		scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino);
		return 0;
	}

	/* report deletion failures instead of silently dropping them */
	return __delete_inode(sb, &key, ino, le32_to_cpu(sinode.mode));
}
/*
 * Walk every orphan item in the metadata tree and process each one.
 *
 * The runtime is bounded by the number of orphans, which could
 * theoretically be very large.  If that becomes a problem we might
 * want to push this work off to a thread.
 *
 * A failure to process one orphan doesn't stop the scan; the first
 * such result (other than -ENOENT) is remembered and returned once the
 * scan ends.  An error from iterating the tree stops the scan and is
 * returned unless an earlier processing result was recorded.
 */
int scoutfs_scan_orphans(struct super_block *sb)
{
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_key first, last, found;
	int first_err = 0;
	int ret;

	trace_scoutfs_scan_orphans(sb);

	scoutfs_set_key(&first, 0, SCOUTFS_ORPHAN_KEY, 0);
	scoutfs_set_key(&last, ~0ULL, SCOUTFS_ORPHAN_KEY, 0);

	for (;;) {
		ret = scoutfs_btree_next(sb, meta, &first, &last, &found, NULL);
		if (ret == -ENOENT) {
			/* No more orphan items */
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		ret = process_orphaned_inode(sb, le64_to_cpu(found.inode));
		if (ret && ret != -ENOENT && !first_err)
			first_err = ret;

		/* resume the search just past the item we just handled */
		first = found;
		scoutfs_inc_key(&first);
	}

	return first_err ? first_err : ret;
}
/*
 * Insert an orphan item, with no value, for the inode into the
 * metadata btree.  Returns the result of the btree insertion.
 */
int scoutfs_orphan_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
	struct scoutfs_key key;

	trace_scoutfs_orphan_inode(sb, inode);

	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_ORPHAN_KEY, 0);

	return scoutfs_btree_insert(sb, meta, &key, NULL);
}
/*
 * Destroy the inode info slab cache if it was created.  Inodes are
 * freed through call_rcu() so we wait for outstanding callbacks with
 * rcu_barrier() before destroying the cache they free into.
 */
void scoutfs_inode_exit(void)
{
	if (!scoutfs_inode_cachep)
		return;

	rcu_barrier();
	kmem_cache_destroy(scoutfs_inode_cachep);
	scoutfs_inode_cachep = NULL;
}
/*
 * Create the slab cache that scoutfs_inode_info structs are allocated
 * from.  Returns 0 on success or -ENOMEM if the cache couldn't be
 * created.
 */
int scoutfs_inode_init(void)
{
	scoutfs_inode_cachep = kmem_cache_create("scoutfs_inode_info",
					sizeof(struct scoutfs_inode_info), 0,
					SLAB_RECLAIM_ACCOUNT,
					scoutfs_inode_ctor);

	return scoutfs_inode_cachep ? 0 : -ENOMEM;
}