/*
 * Copyright (C) 2016 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "format.h"
#include "key.h"
#include "dir.h"
#include "ioctl.h"
#include "super.h"
#include "inode.h"
#include "item.h"
#include "forest.h"
#include "data.h"
#include "client.h"
#include "lock.h"
#include "trans.h"
#include "xattr.h"
#include "hash.h"
#include "srch.h"
#include "alloc.h"
#include "server.h"
#include "counters.h"
#include "attr_x.h"
#include "totl.h"
#include "wkic.h"
#include "quota.h"
#include "scoutfs_trace.h"
#include "util.h"

/*
 * We make inode index items coherent by locking fixed size regions of
 * the key space.  But the inode index item key space is vast and can
 * have huge sparse regions.  To avoid trying every possible lock in the
 * sparse regions we use the manifest to find the next stable key in the
 * key space after we find no items in a given lock region.  This is
 * relatively cheap because reading is going to check the segments
 * anyway.
 */
static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_walk_inodes __user *uwalk = (void __user *)arg;
	struct scoutfs_ioctl_walk_inodes walk;
	struct scoutfs_ioctl_walk_inodes_entry *ent = NULL;
	struct scoutfs_ioctl_walk_inodes_entry *end;
	struct scoutfs_key next_key;
	struct scoutfs_key last_key;
	struct scoutfs_key key;
	struct scoutfs_lock *lock;
	struct page *page = NULL;
	u64 last_seq;
	u64 entries = 0;
	int ret = 0;
	int complete = 0;
	u32 nr = 0;
	u8 type;

	if (copy_from_user(&walk, uwalk, sizeof(walk)))
		return -EFAULT;

	trace_scoutfs_ioc_walk_inodes(sb, &walk);

	if (walk.index == SCOUTFS_IOC_WALK_INODES_META_SEQ)
		type = SCOUTFS_INODE_INDEX_META_SEQ_TYPE;
	else if (walk.index == SCOUTFS_IOC_WALK_INODES_DATA_SEQ)
		type = SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE;
	else
		return -EINVAL;

	/* clamp results to the inodes in the farthest stable seq */
	if (type == SCOUTFS_INODE_INDEX_META_SEQ_TYPE ||
	    type == SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE) {
		ret = scoutfs_client_get_last_seq(sb, &last_seq);
		if (ret)
			return ret;

		if (last_seq < walk.last.major) {
			walk.last.major = last_seq;
			walk.last.minor = ~0;
			walk.last.ino = ~0ULL;
		}
	}

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	scoutfs_inode_init_index_key(&key, type, walk.first.major,
				     walk.first.minor, walk.first.ino);
	scoutfs_inode_init_index_key(&last_key, type, walk.last.major,
				     walk.last.minor, walk.last.ino);

	/* cap nr to the max the ioctl can return to a compat task */
	walk.nr_entries = min_t(u64, walk.nr_entries, INT_MAX);

	end = page_address(page) + PAGE_SIZE;

	/* outer loop */
	for (nr = 0;;) {
		ent = page_address(page);
		/* make sure _pad and minor are zeroed */
		memset(ent, 0, PAGE_SIZE);

		ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ, type,
					       le64_to_cpu(key.skii_major),
					       le64_to_cpu(key.skii_ino), &lock);
		if (ret)
			break;

		/* inner loop 1 */
		while (ent + 1 < end) {
			ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
			if (ret < 0 && ret != -ENOENT)
				break;

			if (ret == -ENOENT) {
				/* done if lock covers last iteration key */
				if (scoutfs_key_compare(&last_key, &lock->end) <= 0) {
					ret = 0;
					complete = 1;
					break;
				}

				/* continue iterating after locked empty region */
				key = lock->end;
				scoutfs_key_inc(&key);

				scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
				/* avoid double-unlocking here after break */
				lock = NULL;

				ret = scoutfs_forest_next_hint(sb, &key, &next_key);
				if (ret < 0 && ret != -ENOENT)
					break;

				if (ret == -ENOENT ||
				    scoutfs_key_compare(&next_key, &last_key) > 0) {
					ret = 0;
					complete = 1;
					break;
				}

				key = next_key;

				ret = scoutfs_lock_inode_index(sb, SCOUTFS_LOCK_READ,
							       type,
							       le64_to_cpu(key.skii_major),
							       le64_to_cpu(key.skii_ino),
							       &lock);
				if (ret)
					break;

				continue;
			}

			ent->major = le64_to_cpu(key.skii_major);
			ent->ino = le64_to_cpu(key.skii_ino);
			scoutfs_key_inc(&key);

			ent++;
			entries++;
			if (nr + entries >= walk.nr_entries) {
				complete = 1;
				break;
			}
		}

		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
		if (ret < 0)
			break;

		/* inner loop 2 */
		ent = page_address(page);
		for (; entries > 0; entries--) {
			if (copy_to_user((void __user *)walk.entries_ptr, ent,
					 sizeof(struct scoutfs_ioctl_walk_inodes_entry))) {
				ret = -EFAULT;
				goto out;
			}

			walk.entries_ptr += sizeof(struct scoutfs_ioctl_walk_inodes_entry);
			ent++;
			nr++;
		}

		if (complete)
			break;
	}

out:
	if (page)
		__free_page(page);

	if (nr > 0)
		ret = nr;

	return ret;
}

/*
 * See the comment above the definition of struct scoutfs_ioctl_ino_path
 * for ioctl semantics.
 */
static long scoutfs_ioc_ino_path(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_ino_path_result __user *ures;
	struct scoutfs_link_backref_entry *last_ent;
	struct scoutfs_link_backref_entry *ent;
	struct scoutfs_ioctl_ino_path args;
	LIST_HEAD(list);
	u16 copied;
	char term;
	int ret;

	if (!capable(CAP_DAC_READ_SEARCH))
		return -EPERM;

	if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
		return -EFAULT;

	ures = (void __user *)(unsigned long)args.result_ptr;

	ret = scoutfs_dir_get_backref_path(sb, args.ino, args.dir_ino,
					   args.dir_pos, &list);
	if (ret < 0)
		goto out;

	last_ent = list_last_entry(&list, struct scoutfs_link_backref_entry, head);

	copied = 0;
	list_for_each_entry(ent, &list, head) {
		if (offsetof(struct scoutfs_ioctl_ino_path_result,
			     path[copied + ent->name_len + 1]) > args.result_bytes) {
			ret = -ENAMETOOLONG;
			goto out;
		}

		if (copy_to_user(&ures->path[copied], ent->dent.name,
				 ent->name_len)) {
			ret = -EFAULT;
			goto out;
		}

		copied += ent->name_len;

		if (ent == last_ent)
			term = '\0';
		else
			term = '/';

		if (put_user(term, &ures->path[copied])) {
			ret = -EFAULT;
			goto out;
		}

		copied++;
	}

	/* fill the result header now that we know the copied path length */
	if (put_user(last_ent->dir_ino, &ures->dir_ino) ||
	    put_user(last_ent->dir_pos, &ures->dir_pos) ||
	    put_user(copied, &ures->path_bytes)) {
		ret = -EFAULT;
	} else {
		ret = 0;
	}

out:
	scoutfs_dir_free_backref_path(sb, &list);
	return ret;
}
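
/*
 * A minimal userspace sketch of calling the ino_path ioctl, assuming a
 * caller-chosen buffer size and an fd opened anywhere on the filesystem
 * (illustrative only; error handling and looping the dir_ino/dir_pos
 * cursor over multiple links are omitted):
 *
 *	char buf[sizeof(struct scoutfs_ioctl_ino_path_result) + PATH_MAX];
 *	struct scoutfs_ioctl_ino_path args = {
 *		.ino = ino,
 *		.dir_ino = 0,
 *		.dir_pos = 0,
 *		.result_ptr = (unsigned long)buf,
 *		.result_bytes = sizeof(buf),
 *	};
 *
 *	if (ioctl(fd, SCOUTFS_IOC_INO_PATH, &args) == 0)
 *		printf("%s\n",
 *		       ((struct scoutfs_ioctl_ino_path_result *)buf)->path);
 */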

/*
 * The caller has a version of the data available in the given byte
 * range in an external archive.  As long as the data version still
 * matches we free the blocks fully contained in the range and mark them
 * offline.  Attempts to use the blocks in the future will trigger
 * recall from the archive.
 *
 * If the file's online blocks drop to 0 then we also truncate any
 * blocks beyond i_size.  This honors the intent of fully releasing a
 * file without the user needing to know to release past i_size or
 * truncate.
 *
 * XXX permissions?
 * XXX a lot of this could be generic file write prep
 */
static long scoutfs_ioc_release(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_ioctl_release args;
	struct scoutfs_lock *lock = NULL;
	u64 sblock;
	u64 eblock;
	u64 online;
	u64 offline;
	u64 isize;
	__u64 tmp;
	int ret;

	if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
		return -EFAULT;

	trace_scoutfs_ioc_release(sb, scoutfs_ino(inode), &args);

	if (args.length == 0)
		return 0;
	if ((check_add_overflow(args.offset, args.length - 1, &tmp)) ||
	    (args.offset & SCOUTFS_BLOCK_SM_MASK) ||
	    (args.length & SCOUTFS_BLOCK_SM_MASK))
		return -EINVAL;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	inode_lock(inode);

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
	if (ret)
		goto out;

	if (!S_ISREG(inode->i_mode)) {
		ret = -EINVAL;
		goto out;
	}

	if (!(file->f_mode & FMODE_WRITE)) {
		ret = -EINVAL;
		goto out;
	}

	if (scoutfs_inode_data_version(inode) != args.data_version) {
		ret = -ESTALE;
		goto out;
	}

	inode_dio_wait(inode);

	/* drop all clean and dirty cached blocks in the range */
	truncate_inode_pages_range(&inode->i_data, args.offset,
				   args.offset + args.length - 1);

	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
	eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
	ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
					  sblock, eblock, true, lock);
	if (ret == 0) {
		scoutfs_inode_get_onoff(inode, &online, &offline);
		isize = i_size_read(inode);
		if (online == 0 && isize) {
			sblock = (isize + SCOUTFS_BLOCK_SM_SIZE - 1) >>
				 SCOUTFS_BLOCK_SM_SHIFT;
			ret = scoutfs_data_truncate_items(sb, inode,
							  scoutfs_ino(inode),
							  sblock, U64_MAX,
							  false, lock);
		}
	}

out:
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	inode_unlock(inode);
	mnt_drop_write_file(file);

	trace_scoutfs_ioc_release_ret(sb, scoutfs_ino(inode), ret);
	return ret;
}

static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_data_wait_err args;
	struct scoutfs_lock *lock = NULL;
	struct inode *inode = NULL;
	u64 sblock;
	u64 eblock;
	long ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
		return -EFAULT;

	if (args.count == 0)
		return 0;

	if ((args.op & SCOUTFS_IOC_DWO_UNKNOWN) || !IS_ERR_VALUE(args.err))
		return -EINVAL;

	trace_scoutfs_ioc_data_wait_err(sb, &args);

	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
	eblock = (args.offset + args.count - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
	if (sblock > eblock)
		return -EINVAL;

	inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino);
	if (!inode) {
		ret = -ESTALE;
		goto out;
	}

	inode_lock(inode);

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
	if (ret)
		goto unlock;

	if (!S_ISREG(inode->i_mode)) {
		ret = -EINVAL;
	} else {
		ret = scoutfs_data_wait_err(inode, sblock, eblock, args.op,
					    args.err);
	}

	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
unlock:
	inode_unlock(inode);
	iput(inode);
out:
	return ret;
}
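
/*
 * Note that the data_wait_err args.err must look like a kernel error
 * value (IS_ERR_VALUE()), so callers pass a negative errno.  A rough
 * illustrative sketch, with the op flag left as a placeholder since the
 * specific SCOUTFS_IOC_DWO_* values aren't shown here:
 *
 *	struct scoutfs_ioctl_data_wait_err args = {
 *		.ino = ino,
 *		.offset = 0,
 *		.count = count,
 *		.op = op,
 *		.err = -EIO,
 *	};
 *	ioctl(fd, SCOUTFS_IOC_DATA_WAIT_ERR, &args);
 */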

/*
 * Write the archived contents of the file back if the data_version
 * still matches.
 *
 * This is a data plane operation only.  We don't want the write to
 * change any fields in the inode.  It only changes the file contents.
 *
 * Keep in mind that the staging writes can easily span transactions and
 * can crash partway through.  If we called the normal write path and
 * restored the inode afterwards the modified inode could be committed
 * partway through by a transaction and then left that way by a crash
 * before the write finishes and we restore the fields.  It also
 * wouldn't be great if the temporarily updated inode was visible to
 * paths that don't serialize with write.
 *
 * We implement the buffered write path down to the start of
 * generic_file_buffered_write() without all the stuff that would
 * change the inode: file_remove_suid(), file_update_time().  The
 * easiest way to do that is to call generic_file_buffered_write().
 * We're careful to only allow staging writes inside i_size.
 *
 * We set a bool on the inode which tells our code to update the
 * offline extents and to not update the data_version counter.
 *
 * This doesn't support any fancy write modes or side-effects: aio,
 * direct, append, sync, breaking suid, sending rlimit signals.
 */
static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
	struct scoutfs_ioctl_stage args;
	struct scoutfs_lock *lock = NULL;
	struct kiocb kiocb;
	struct iovec iov;
	size_t written;
	loff_t end_size;
	loff_t isize;
	loff_t pos;
	int ret;

	if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
		return -EFAULT;

	trace_scoutfs_ioc_stage(sb, scoutfs_ino(inode), &args);

	end_size = args.offset + args.length;

	/* verify arg constraints that aren't dependent on file */
	if (args.length < 0 || (end_size < args.offset) ||
	    args.offset & SCOUTFS_BLOCK_SM_MASK) {
		return -EINVAL;
	}

	if (args.length == 0)
		return 0;

	/* the iocb is really only used for the file pointer :P */
	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = args.offset;
#ifdef KC_LINUX_AIO_KI_LEFT
	kiocb.ki_left = args.length;
	kiocb.ki_nbytes = args.length;
#endif
	iov.iov_base = (void __user *)(unsigned long)args.buf_ptr;
	iov.iov_len = args.length;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	inode_lock(inode);

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
	if (ret)
		goto out;

	scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, lock);

	isize = i_size_read(inode);

	if (!S_ISREG(inode->i_mode) ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (file->f_flags & (O_APPEND | O_DIRECT | O_DSYNC)) ||
	    IS_SYNC(file->f_mapping->host) ||
	    (end_size > isize) ||
	    ((end_size & SCOUTFS_BLOCK_SM_MASK) && (end_size != isize))) {
		ret = -EINVAL;
		goto out;
	}

	if (scoutfs_inode_data_version(inode) != args.data_version) {
		ret = -ESTALE;
		goto out;
	}

	si->staging = true;
#ifdef KC_CURRENT_BACKING_DEV_INFO
	current->backing_dev_info = inode_to_bdi(inode);
#endif

	pos = args.offset;
	written = 0;
	do {
		ret = generic_file_buffered_write(&kiocb, &iov, 1, pos, &pos,
						  args.length, written);
		BUG_ON(ret == -EIOCBQUEUED);
		if (ret > 0)
			written += ret;
	} while (ret > 0 && written < args.length);

	si->staging = false;
#ifdef KC_CURRENT_BACKING_DEV_INFO
	current->backing_dev_info = NULL;
#endif
out:
	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	inode_unlock(inode);
	mnt_drop_write_file(file);

	trace_scoutfs_ioc_stage_ret(sb, scoutfs_ino(inode), ret);
	return ret;
}
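
/*
 * Fill the fixed stat_more result from the extended attr_x fields:
 * meta and data seqs, data_version, online and offline block counts,
 * and crtime.
 */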
static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_ioctl_inode_attr_x *iax = NULL;
	struct scoutfs_ioctl_stat_more *stm = NULL;
	int ret;

	iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
	stm = kmalloc(sizeof(struct scoutfs_ioctl_stat_more), GFP_KERNEL);
	if (!iax || !stm) {
		ret = -ENOMEM;
		goto out;
	}

	iax->x_mask = SCOUTFS_IOC_IAX_META_SEQ |
		      SCOUTFS_IOC_IAX_DATA_SEQ |
		      SCOUTFS_IOC_IAX_DATA_VERSION |
		      SCOUTFS_IOC_IAX_ONLINE_BLOCKS |
		      SCOUTFS_IOC_IAX_OFFLINE_BLOCKS |
		      SCOUTFS_IOC_IAX_CRTIME;
	iax->x_flags = 0;

	ret = scoutfs_get_attr_x(inode, iax);
	if (ret < 0)
		goto out;

	stm->meta_seq = iax->meta_seq;
	stm->data_seq = iax->data_seq;
	stm->data_version = iax->data_version;
	stm->online_blocks = iax->online_blocks;
	stm->offline_blocks = iax->offline_blocks;
	stm->crtime_sec = iax->crtime_sec;
	stm->crtime_nsec = iax->crtime_nsec;

	if (copy_to_user((void __user *)arg, stm,
			 sizeof(struct scoutfs_ioctl_stat_more)))
		ret = -EFAULT;
	else
		ret = 0;
out:
	kfree(iax);
	kfree(stm);
	return ret;
}

static bool inc_wrapped(u64 *ino, u64 *iblock)
{
	return (++(*iblock) == 0) && (++(*ino) == 0);
}

static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_data_waiting idw;
	struct scoutfs_ioctl_data_waiting_entry __user *udwe;
	struct scoutfs_ioctl_data_waiting_entry dwe[16];
	unsigned int nr;
	int total;
	int ret;

	if (copy_from_user(&idw, (void __user *)arg, sizeof(idw)))
		return -EFAULT;

	if (idw.flags & SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN)
		return -EINVAL;

	udwe = (void __user *)(long)idw.ents_ptr;
	total = 0;
	ret = 0;
	while (idw.ents_nr && !inc_wrapped(&idw.after_ino, &idw.after_iblock)) {
		nr = min_t(size_t, idw.ents_nr, ARRAY_SIZE(dwe));

		ret = scoutfs_data_waiting(sb, idw.after_ino, idw.after_iblock,
					   dwe, nr);
		BUG_ON(ret > nr); /* stack overflow \o/ */
		if (ret <= 0)
			break;

		if (copy_to_user(udwe, dwe, ret * sizeof(dwe[0]))) {
			ret = -EFAULT;
			break;
		}

		idw.after_ino = dwe[ret - 1].ino;
		idw.after_iblock = dwe[ret - 1].iblock;
		udwe += ret;
		idw.ents_nr -= ret;
		total += ret;
		ret = 0;
	}

	return ret ?: total;
}

/*
 * This is used when restoring files; it lets the caller set all the
 * inode attributes which are otherwise unreachable.  Changing the file
 * size can only be done for regular files with a data_version of 0.
 *
 * We unconditionally fill the iax attributes from the sm set and let
 * set_attr_x check them.
 */
static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_ioctl_setattr_more __user *usm = (void __user *)arg;
	struct scoutfs_ioctl_inode_attr_x *iax = NULL;
	struct scoutfs_ioctl_setattr_more sm;
	LIST_HEAD(ind_locks);
	int ret;

	if (!(file->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	if (copy_from_user(&sm, usm, sizeof(sm))) {
		ret = -EFAULT;
		goto out;
	}

	if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_UNKNOWN) {
		ret = -EINVAL;
		goto out;
	}

	iax = kzalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
	if (!iax) {
		ret = -ENOMEM;
		goto out;
	}

	iax->x_mask = SCOUTFS_IOC_IAX_CTIME |
		      SCOUTFS_IOC_IAX_CRTIME |
		      SCOUTFS_IOC_IAX_SIZE;
	iax->data_version = sm.data_version;
	iax->ctime_sec = sm.ctime_sec;
	iax->ctime_nsec = sm.ctime_nsec;
	iax->crtime_sec = sm.crtime_sec;
	iax->crtime_nsec = sm.crtime_nsec;
	iax->size = sm.i_size;

	if (sm.flags & SCOUTFS_IOC_SETATTR_MORE_OFFLINE)
		iax->x_flags |= SCOUTFS_IOC_IAX_F_SIZE_OFFLINE;
	if (sm.data_version != 0)
		iax->x_mask |= SCOUTFS_IOC_IAX_DATA_VERSION;

	ret = mnt_want_write_file(file);
	if (ret < 0)
		goto out;

	ret = scoutfs_set_attr_x(inode, iax);
	mnt_drop_write_file(file);
out:
	kfree(iax);
	return ret;
}
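
/*
 * A rough sketch of how a restore agent might use this, assuming the
 * file was just created and so still has a data_version of 0
 * (illustrative only; the exact flags and timestamps depend on what is
 * being restored):
 *
 *	struct scoutfs_ioctl_setattr_more sm = {
 *		.data_version = dv,
 *		.i_size = size,
 *		.flags = SCOUTFS_IOC_SETATTR_MORE_OFFLINE,
 *	};
 *	ioctl(fd, SCOUTFS_IOC_SETATTR_MORE, &sm);
 *
 * and later, when data is recalled from the archive, it is written back
 * with SCOUTFS_IOC_STAGE against the same data_version.
 */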

/*
 * This lists .hide. attributes on the inode.  It doesn't include normal
 * xattrs that are visible to listxattr because we don't perform the
 * more rigorous security access checks that normal vfs listxattr does.
 */
static long scoutfs_ioc_listxattr_hidden(struct file *file, unsigned long arg)
{
	struct inode *inode = file->f_inode;
	struct scoutfs_ioctl_listxattr_hidden __user *ulxr = (void __user *)arg;
	struct scoutfs_ioctl_listxattr_hidden lxh;
	struct page *page = NULL;
	unsigned int bytes;
	int total = 0;
	int ret;

	ret = inode_permission(KC_VFS_INIT_NS inode, MAY_READ);
	if (ret < 0)
		goto out;

	if (copy_from_user(&lxh, ulxr, sizeof(lxh))) {
		ret = -EFAULT;
		goto out;
	}

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	while (lxh.buf_bytes) {
		bytes = min_t(int, lxh.buf_bytes, PAGE_SIZE);

		ret = scoutfs_list_xattrs(inode, page_address(page), bytes,
					  &lxh.hash_pos, &lxh.id_pos,
					  false, true);
		if (ret <= 0)
			break;

		if (copy_to_user((void __user *)lxh.buf_ptr,
				 page_address(page), ret)) {
			ret = -EFAULT;
			break;
		}

		lxh.buf_ptr += ret;
		lxh.buf_bytes -= ret;
		total += ret;
		ret = 0;
	}
out:
	if (page)
		__free_page(page);

	if (ret == 0 &&
	    (__put_user(lxh.hash_pos, &ulxr->hash_pos) ||
	     __put_user(lxh.id_pos, &ulxr->id_pos)))
		ret = -EFAULT;

	return ret ?: total;
}

/*
 * Return the inode numbers of inodes which might contain the given
 * named xattr.  This will only find scoutfs xattrs with the search tag,
 * and we return -EINVAL for a caller's xattr name that doesn't contain
 * it.
 */
static long scoutfs_ioc_search_xattrs(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_search_xattrs __user *usx = (void __user *)arg;
	struct scoutfs_ioctl_search_xattrs sx;
	struct scoutfs_xattr_prefix_tags tgs;
	struct scoutfs_srch_rb_root sroot;
	struct scoutfs_srch_rb_node *snode;
	u64 __user *uinos;
	struct rb_node *node;
	char *name = NULL;
	bool done = false;
	u64 prev_ino = 0;
	u64 total = 0;
	int ret;

	if (!(file->f_mode & FMODE_READ)) {
		ret = -EBADF;
		goto out;
	}

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out;
	}

	if (copy_from_user(&sx, usx, sizeof(sx))) {
		ret = -EFAULT;
		goto out;
	}
	uinos = (u64 __user *)sx.inodes_ptr;

	if (sx.name_bytes > SCOUTFS_XATTR_MAX_NAME_LEN) {
		ret = -EINVAL;
		goto out;
	}

	if (sx.nr_inodes == 0 || sx.last_ino < sx.next_ino) {
		ret = 0;
		goto out;
	}

	name = kmalloc(sx.name_bytes, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	if (copy_from_user(name, (void __user *)sx.name_ptr, sx.name_bytes)) {
		ret = -EFAULT;
		goto out;
	}

	if (scoutfs_xattr_parse_tags(name, sx.name_bytes, &tgs) < 0 ||
	    !tgs.srch) {
		ret = -EINVAL;
		goto out;
	}

	ret = scoutfs_srch_search_xattrs(sb, &sroot,
					 scoutfs_hash64(name, sx.name_bytes),
					 sx.next_ino, sx.last_ino, &done);
	if (ret < 0)
		goto out;

	prev_ino = 0;
	scoutfs_srch_foreach_rb_node(snode, node, &sroot) {
		if (prev_ino == snode->ino)
			continue;

		if (put_user(snode->ino, uinos + total)) {
			ret = -EFAULT;
			break;
		}

		prev_ino = snode->ino;
		if (++total == sx.nr_inodes)
			break;
	}

	sx.output_flags = 0;
	if (done && total == sroot.nr)
		sx.output_flags |= SCOUTFS_SEARCH_XATTRS_OFLAG_END;

	if (put_user(sx.output_flags, &usx->output_flags))
		ret = -EFAULT;
	else
		ret = 0;

	scoutfs_srch_destroy_rb_root(&sroot);
out:
	kfree(name);
	return ret ?: total;
}
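
/*
 * Return filesystem-wide details that statfs doesn't cover: the fsid,
 * this mount's rid, total metadata and data block counts, reserved
 * metadata blocks, and the last committed seq.
 */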
static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_super_block *super;
	struct scoutfs_ioctl_statfs_more sfm;
	int ret;

	super = kzalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
	if (!super)
		return -ENOMEM;

	ret = scoutfs_read_super(sb, super);
	if (ret)
		goto out;

	sfm.fsid = le64_to_cpu(super->hdr.fsid);
	sfm.rid = sbi->rid;
	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
	sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);

	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
	if (ret)
		goto out;

	if (copy_to_user((void __user *)arg, &sfm, sizeof(sfm)))
		ret = -EFAULT;
	else
		ret = 0;
out:
	kfree(super);
	return ret;
}

struct copy_alloc_detail_args {
	struct scoutfs_ioctl_alloc_detail_entry __user *uade;
	u64 nr;
	u64 copied;
};

static int copy_alloc_detail_to_user(struct super_block *sb, void *arg,
				     int owner, u64 id, bool meta, bool avail,
				     u64 blocks)
{
	struct copy_alloc_detail_args *args = arg;
	struct scoutfs_ioctl_alloc_detail_entry ade;

	if (args->copied == args->nr)
		return -EOVERFLOW;

	/* .type and .pad need clearing */
	memset(&ade, 0, sizeof(struct scoutfs_ioctl_alloc_detail_entry));
	ade.blocks = blocks;
	ade.id = id;
	ade.meta = !!meta;
	ade.avail = !!avail;

	if (copy_to_user(&args->uade[args->copied], &ade, sizeof(ade)))
		return -EFAULT;

	args->copied++;
	return 0;
}

static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_alloc_detail __user *uad = (void __user *)arg;
	struct scoutfs_ioctl_alloc_detail ad;
	struct copy_alloc_detail_args args;

	if (copy_from_user(&ad, uad, sizeof(ad)))
		return -EFAULT;

	args.uade = (struct scoutfs_ioctl_alloc_detail_entry __user *)
		    (uintptr_t)ad.entries_ptr;
	args.nr = ad.entries_nr;
	args.copied = 0;

	return scoutfs_alloc_foreach(sb, copy_alloc_detail_to_user, &args) ?:
	       args.copied;
}

static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
{
	struct inode *to = file_inode(file);
	struct super_block *sb = to->i_sb;
	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
	struct scoutfs_ioctl_move_blocks mb;
	struct file *from_file;
	struct inode *from;
	u64 tmp;
	int ret;

	if (copy_from_user(&mb, umb, sizeof(mb)))
		return -EFAULT;

	if (mb.len == 0)
		return 0;

	if ((check_add_overflow(mb.from_off, mb.len - 1, &tmp)) ||
	    (check_add_overflow(mb.to_off, mb.len - 1, &tmp)))
		return -EOVERFLOW;

	from_file = fget(mb.from_fd);
	if (!from_file)
		return -EBADF;
	from = file_inode(from_file);

	if (from == to) {
		ret = -EINVAL;
		goto out;
	}

	if (from->i_sb != sb) {
		ret = -EXDEV;
		goto out;
	}

	if (mb.flags & SCOUTFS_IOC_MB_UNKNOWN) {
		ret = -EINVAL;
		goto out;
	}

	ret = mnt_want_write_file(file);
	if (ret < 0)
		goto out;

	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len, to, mb.to_off,
				       !!(mb.flags & SCOUTFS_IOC_MB_STAGE),
				       mb.data_version);
	mnt_drop_write_file(file);
out:
	fput(from_file);
	return ret;
}

static long scoutfs_ioc_resize_devices(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_resize_devices __user *urd = (void __user *)arg;
	struct scoutfs_ioctl_resize_devices rd;
	struct scoutfs_net_resize_devices nrd;
	int ret;

	if (!(file->f_mode & FMODE_READ)) {
		ret = -EBADF;
		goto out;
	}

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out;
	}

	if (copy_from_user(&rd, urd, sizeof(rd))) {
		ret = -EFAULT;
		goto out;
	}

	nrd.new_total_meta_blocks = cpu_to_le64(rd.new_total_meta_blocks);
	nrd.new_total_data_blocks = cpu_to_le64(rd.new_total_data_blocks);

	ret = scoutfs_client_resize_devices(sb, &nrd);
out:
	return ret;
}

struct read_xattr_total_iter_cb_args {
	struct scoutfs_ioctl_xattr_total *xt;
	unsigned int copied;
	unsigned int total;
};

/*
 * This is called under an RCU read lock so it can't copy to userspace.
 */
static int read_xattr_total_iter_cb(struct scoutfs_key *key, void *val,
				    unsigned int val_len, void *cb_arg)
{
	struct read_xattr_total_iter_cb_args *cba = cb_arg;
	struct scoutfs_xattr_totl_val *tval = val;
	struct scoutfs_ioctl_xattr_total *xt = &cba->xt[cba->copied];

	xt->name[0] = le64_to_cpu(key->skxt_a);
	xt->name[1] = le64_to_cpu(key->skxt_b);
	xt->name[2] = le64_to_cpu(key->skxt_c);
	xt->total = le64_to_cpu(tval->total);
	xt->count = le64_to_cpu(tval->count);

	if (++cba->copied < cba->total)
		return -EAGAIN;
	else
		return 0;
}

/*
 * Starting from the caller's pos_name, copy the names, totals, and
 * counts for the .totl. tagged xattrs in the system sorted by their
 * name until the user's buffer is full.  This only sees xattrs that
 * have been committed.  It doesn't use locking to force commits and
 * block writers so it can be a little bit out of date with respect to
 * dirty xattrs in memory across the system.
 */
static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_read_xattr_totals __user *urxt = (void __user *)arg;
	struct scoutfs_ioctl_read_xattr_totals rxt;
	struct scoutfs_ioctl_xattr_total __user *uxt;
	struct read_xattr_total_iter_cb_args cba = {NULL, };
	struct scoutfs_key range_start;
	struct scoutfs_key range_end;
	struct scoutfs_key key;
	unsigned int copied = 0;
	unsigned int total;
	unsigned int ready;
	int ret;

	if (!(file->f_mode & FMODE_READ)) {
		ret = -EBADF;
		goto out;
	}

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out;
	}

	cba.xt = (void *)__get_free_page(GFP_KERNEL);
	if (!cba.xt) {
		ret = -ENOMEM;
		goto out;
	}
	cba.total = PAGE_SIZE / sizeof(struct scoutfs_ioctl_xattr_total);

	if (copy_from_user(&rxt, urxt, sizeof(rxt))) {
		ret = -EFAULT;
		goto out;
	}
	uxt = (void __user *)rxt.totals_ptr;

	if ((rxt.totals_ptr & (sizeof(__u64) - 1)) ||
	    (rxt.totals_bytes < sizeof(struct scoutfs_ioctl_xattr_total))) {
		ret = -EINVAL;
		goto out;
	}

	total = div_u64(min_t(u64, rxt.totals_bytes, INT_MAX),
			sizeof(struct scoutfs_ioctl_xattr_total));

	scoutfs_totl_set_range(&range_start, &range_end);
	scoutfs_xattr_init_totl_key(&key, rxt.pos_name);

	while (copied < total) {
		cba.copied = 0;

		ret = scoutfs_wkic_iterate(sb, &key, &range_end,
					   &range_start, &range_end,
					   read_xattr_total_iter_cb, &cba);
		if (ret < 0)
			goto out;

		if (cba.copied == 0)
			break;

		ready = min(total - copied, cba.copied);

		if (copy_to_user(&uxt[copied], cba.xt,
				 ready * sizeof(cba.xt[0]))) {
			ret = -EFAULT;
			goto out;
		}

		scoutfs_xattr_init_totl_key(&key, cba.xt[ready - 1].name);
		scoutfs_key_inc(&key);

		copied += ready;
	}

	ret = 0;
out:
	if (cba.xt)
		free_page((long)cba.xt);

	return ret ?: copied;
}
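
/*
 * Return the numbers of allocated inodes at or after start_ino, limited
 * to the group of inodes covered by a single inode lock.  Any item in
 * the fs zone indicates that its owning inode is allocated.  Callers
 * continue by advancing start_ino and calling again.
 */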
static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_get_allocated_inos __user *ugai = (void __user *)arg;
	struct scoutfs_ioctl_get_allocated_inos gai;
	struct scoutfs_lock *lock = NULL;
	struct scoutfs_key key;
	struct scoutfs_key end;
	struct page *page = NULL;
	u64 __user *uinos;
	u64 bytes;
	u64 *ino;
	u64 *ino_end;
	int entries = 0;
	int nr;
	int ret;
	int complete = 0;

	if (!(file->f_mode & FMODE_READ)) {
		ret = -EBADF;
		goto out;
	}

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out;
	}

	if (copy_from_user(&gai, ugai, sizeof(gai))) {
		ret = -EFAULT;
		goto out;
	}

	if ((gai.inos_ptr & (sizeof(__u64) - 1)) ||
	    (gai.inos_bytes < sizeof(__u64))) {
		ret = -EINVAL;
		goto out;
	}

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}
	ino_end = page_address(page) + PAGE_SIZE;

	scoutfs_inode_init_key(&key, gai.start_ino);
	scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK);

	uinos = (void __user *)gai.inos_ptr;
	bytes = gai.inos_bytes;
	nr = 0;

	for (;;) {
		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino,
				       &lock);
		if (ret < 0)
			goto out;

		ino = page_address(page);
		while (ino < ino_end) {
			ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock);
			if (ret < 0) {
				if (ret == -ENOENT) {
					ret = 0;
					complete = 1;
				}
				break;
			}

			if (key.sk_zone != SCOUTFS_FS_ZONE) {
				ret = 0;
				complete = 1;
				break;
			}

			/* all fs items are owned by allocated inodes, and _first is always ino */
			*ino = le64_to_cpu(key._sk_first);
			scoutfs_inode_init_key(&key, *ino + 1);

			ino++;
			entries++;
			nr++;

			bytes -= sizeof(*uinos);
			if (bytes < sizeof(*uinos)) {
				complete = 1;
				break;
			}

			if (nr == INT_MAX) {
				complete = 1;
				break;
			}
		}

		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
		if (ret < 0)
			break;

		ino = page_address(page);
		if (copy_to_user(uinos, ino, entries * sizeof(*uinos))) {
			ret = -EFAULT;
			goto out;
		}

		uinos += entries;
		entries = 0;

		if (complete)
			break;
	}

out:
	if (page)
		__free_page(page);

	return ret ?: nr;
}

/*
 * Copy entries that point to an inode to the user's buffer.  We copy to
 * userspace from copies of the entries that are acquired under a lock
 * so that we don't fault while holding cluster locks.  It also gives us
 * a chance to limit the amount of work under each lock hold.
 */
static long scoutfs_ioc_get_referring_entries(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_get_referring_entries gre;
	struct scoutfs_link_backref_entry *bref = NULL;
	struct scoutfs_link_backref_entry *bref_tmp;
	struct scoutfs_ioctl_dirent __user *uent;
	struct scoutfs_ioctl_dirent ent;
	LIST_HEAD(list);
	u64 copied;
	int name_len;
	int bytes;
	long nr;
	int ret;

	if (!capable(CAP_DAC_READ_SEARCH))
		return -EPERM;

	if (copy_from_user(&gre, (void __user *)arg, sizeof(gre)))
		return -EFAULT;

	uent = (void __user *)(unsigned long)gre.entries_ptr;
	copied = 0;
	nr = 0;

	/* use entry as cursor between calls */
	ent.dir_ino = gre.dir_ino;
	ent.dir_pos = gre.dir_pos;

	for (;;) {
		ret = scoutfs_dir_add_next_linkrefs(sb, gre.ino, ent.dir_ino,
						    ent.dir_pos, 1024, &list);
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = 0;
			goto out;
		}

		/* _add_next adds each entry to the head, _reverse for key order */
		list_for_each_entry_safe_reverse(bref, bref_tmp, &list, head) {
			list_del_init(&bref->head);

			name_len = bref->name_len;
			bytes = ALIGN(offsetof(struct scoutfs_ioctl_dirent,
					       name[name_len + 1]), 16);
			if (copied + bytes > gre.entries_bytes) {
				ret = -EINVAL;
				goto out;
			}

			ent.dir_ino = bref->dir_ino;
			ent.dir_pos = bref->dir_pos;
			ent.ino = gre.ino;
			ent.entry_bytes = bytes;
			ent.flags = bref->last ?
				    SCOUTFS_IOCTL_DIRENT_FLAG_LAST : 0;
			ent.d_type = bref->d_type;
			ent.name_len = name_len;

			if (copy_to_user(uent, &ent,
					 offsetof(struct scoutfs_ioctl_dirent, name[0])) ||
			    copy_to_user(&uent->name[0], bref->dent.name, name_len) ||
			    put_user('\0', &uent->name[name_len])) {
				ret = -EFAULT;
				goto out;
			}

			kfree(bref);
			bref = NULL;

			uent = (void __user *)uent + bytes;
			copied += bytes;
			nr++;

			if (nr == LONG_MAX ||
			    (ent.flags & SCOUTFS_IOCTL_DIRENT_FLAG_LAST)) {
				ret = 0;
				goto out;
			}
		}

		/* advance cursor pos from last copied entry */
		if (++ent.dir_pos == 0) {
			if (++ent.dir_ino == 0) {
				ret = 0;
				goto out;
			}
		}
	}

	ret = 0;
out:
	kfree(bref);
	list_for_each_entry_safe(bref, bref_tmp, &list, head) {
		list_del_init(&bref->head);
		kfree(bref);
	}
	return nr ?: ret;
}

static long scoutfs_ioc_get_attr_x(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_ioctl_inode_attr_x __user *uiax = (void __user *)arg;
	struct scoutfs_ioctl_inode_attr_x *iax = NULL;
	int ret;

	iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
	if (!iax) {
		ret = -ENOMEM;
		goto out;
	}

	ret = get_user(iax->x_mask, &uiax->x_mask) ?:
	      get_user(iax->x_flags, &uiax->x_flags);
	if (ret < 0)
		goto out;

	ret = scoutfs_get_attr_x(inode, iax);
	if (ret < 0)
		goto out;

	/* only copy results after dropping cluster locks (could fault) */
	if (ret > 0 && copy_to_user(uiax, iax, ret) != 0)
		ret = -EFAULT;
	else
		ret = 0;
out:
	kfree(iax);
	return ret;
}

static long scoutfs_ioc_set_attr_x(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_ioctl_inode_attr_x __user *uiax = (void __user *)arg;
	struct scoutfs_ioctl_inode_attr_x *iax = NULL;
	int ret;

	iax = kmalloc(sizeof(struct scoutfs_ioctl_inode_attr_x), GFP_KERNEL);
	if (!iax) {
		ret = -ENOMEM;
		goto out;
	}

	if (copy_from_user(iax, uiax, sizeof(struct scoutfs_ioctl_inode_attr_x))) {
		ret = -EFAULT;
		goto out;
	}

	ret = mnt_want_write_file(file);
	if (ret < 0)
		goto out;

	ret = scoutfs_set_attr_x(inode, iax);
	mnt_drop_write_file(file);
out:
	kfree(iax);
	return ret;
}

static long scoutfs_ioc_get_quota_rules(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_get_quota_rules __user *ugqr = (void __user *)arg;
	struct scoutfs_ioctl_get_quota_rules gqr;
	struct scoutfs_ioctl_quota_rule __user *uirules;
	struct scoutfs_ioctl_quota_rule *irules;
	struct page *page = NULL;
	int copied = 0;
	int nr;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&gqr, ugqr, sizeof(gqr)))
		return -EFAULT;

	if (gqr.rules_nr == 0)
		return 0;

	uirules = (void __user *)gqr.rules_ptr;

	/* limit rules copied per call */
	gqr.rules_nr = min_t(u64, gqr.rules_nr, INT_MAX);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}
	irules = page_address(page);

	while (copied < gqr.rules_nr) {
		nr = min_t(u64, gqr.rules_nr - copied,
			   PAGE_SIZE / sizeof(struct scoutfs_ioctl_quota_rule));

		ret = scoutfs_quota_get_rules(sb, gqr.iterator,
					      page_address(page), nr);
		if (ret <= 0)
			goto out;

		if (copy_to_user(&uirules[copied], irules, ret * sizeof(irules[0]))) {
			ret = -EFAULT;
			goto out;
		}

		copied += ret;
	}

	ret = 0;
out:
	if (page)
		__free_page(page);

	if (ret == 0 &&
	    copy_to_user(ugqr->iterator, gqr.iterator, sizeof(gqr.iterator)))
		ret = -EFAULT;

	return ret ?: copied;
}
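
/*
 * Add or delete a single quota rule.  Both SCOUTFS_IOC_ADD_QUOTA_RULE
 * and SCOUTFS_IOC_DEL_QUOTA_RULE land here, differing only in is_add.
 */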
static long scoutfs_ioc_mod_quota_rule(struct file *file, unsigned long arg,
				       bool is_add)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_quota_rule __user *uirule = (void __user *)arg;
	struct scoutfs_ioctl_quota_rule irule;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&irule, uirule, sizeof(irule)))
		return -EFAULT;

	return scoutfs_quota_mod_rule(sb, is_add, &irule);
}

struct read_index_buf {
	int nr;
	int size;
	struct scoutfs_ioctl_xattr_index_entry ents[0];
};

#define READ_INDEX_BUF_MAX_ENTS						\
	((PAGE_SIZE - sizeof(struct read_index_buf)) /			\
	 sizeof(struct scoutfs_ioctl_xattr_index_entry))

/*
 * This doesn't filter out duplicates; the caller filters them out to
 * catch duplicates between iteration calls.
 */
static int read_index_cb(struct scoutfs_key *key, void *val,
			 unsigned int val_len, void *cb_arg)
{
	struct read_index_buf *rib = cb_arg;
	struct scoutfs_ioctl_xattr_index_entry *ent = &rib->ents[rib->nr];
	u64 xid;

	if (val_len != 0)
		return -EIO;

	/* discard the xid, they're not exposed to ioctl callers */
	scoutfs_xattr_get_indx_key(key, &ent->major, &ent->minor, &ent->ino,
				   &xid);

	if (++rib->nr == rib->size)
		return rib->nr;

	return -EAGAIN;
}

static long scoutfs_ioc_read_xattr_index(struct file *file, unsigned long arg)
{
	struct super_block *sb = file_inode(file)->i_sb;
	struct scoutfs_ioctl_read_xattr_index __user *urxi = (void __user *)arg;
	struct scoutfs_ioctl_xattr_index_entry __user *uents;
	struct scoutfs_ioctl_xattr_index_entry *ent;
	struct scoutfs_ioctl_xattr_index_entry prev;
	struct scoutfs_ioctl_read_xattr_index rxi;
	struct read_index_buf *rib;
	struct page *page = NULL;
	struct scoutfs_key first;
	struct scoutfs_key last;
	struct scoutfs_key start;
	struct scoutfs_key end;
	int copied = 0;
	int ret;
	int i;

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out;
	}

	if (copy_from_user(&rxi, urxi, sizeof(rxi))) {
		ret = -EFAULT;
		goto out;
	}
	uents = (void __user *)rxi.entries_ptr;
	rxi.entries_nr = min_t(u64, rxi.entries_nr, INT_MAX);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}
	rib = page_address(page);

	scoutfs_xattr_init_indx_key(&first, rxi.first.major, rxi.first.minor,
				    rxi.first.ino, 0);
	scoutfs_xattr_init_indx_key(&last, rxi.last.major, rxi.last.minor,
				    rxi.last.ino, U64_MAX);
	scoutfs_xattr_indx_get_range(&start, &end);

	if (scoutfs_key_compare(&first, &last) > 0) {
		ret = -EINVAL;
		goto out;
	}

	/* 0 ino doesn't exist, can't ever match entry to return */
	memset(&prev, 0, sizeof(prev));

	while (copied < rxi.entries_nr) {
		rib->nr = 0;
		rib->size = min_t(u64, rxi.entries_nr - copied,
				  READ_INDEX_BUF_MAX_ENTS);

		ret = scoutfs_wkic_iterate(sb, &first, &last, &start, &end,
					   read_index_cb, rib);
		if (ret < 0)
			goto out;

		if (rib->nr == 0)
			break;

		/*
		 * Copy entries to userspace, skipping duplicate entries
		 * that can result from multiple xattrs indexing an
		 * inode at the same position and which can span
		 * multiple cache iterations.  (Comparing in order of
		 * most likely to change to fail fast.)
		 */
		for (i = 0, ent = rib->ents; i < rib->nr; i++, ent++) {
			if (ent->ino == prev.ino &&
			    ent->minor == prev.minor &&
			    ent->major == prev.major)
				continue;

			if (copy_to_user(&uents[copied], ent, sizeof(*ent))) {
				ret = -EFAULT;
				goto out;
			}

			prev = *ent;
			copied++;
		}

		scoutfs_xattr_init_indx_key(&first, prev.major, prev.minor,
					    prev.ino, U64_MAX);
		scoutfs_key_inc(&first);
	}

	ret = copied;
out:
	if (page)
		__free_page(page);
	return ret;
}

long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SCOUTFS_IOC_WALK_INODES:
		return scoutfs_ioc_walk_inodes(file, arg);
	case SCOUTFS_IOC_INO_PATH:
		return scoutfs_ioc_ino_path(file, arg);
	case SCOUTFS_IOC_RELEASE:
		return scoutfs_ioc_release(file, arg);
	case SCOUTFS_IOC_STAGE:
		return scoutfs_ioc_stage(file, arg);
	case SCOUTFS_IOC_STAT_MORE:
		return scoutfs_ioc_stat_more(file, arg);
	case SCOUTFS_IOC_DATA_WAITING:
		return scoutfs_ioc_data_waiting(file, arg);
	case SCOUTFS_IOC_SETATTR_MORE:
		return scoutfs_ioc_setattr_more(file, arg);
	case SCOUTFS_IOC_LISTXATTR_HIDDEN:
		return scoutfs_ioc_listxattr_hidden(file, arg);
	case SCOUTFS_IOC_SEARCH_XATTRS:
		return scoutfs_ioc_search_xattrs(file, arg);
	case SCOUTFS_IOC_STATFS_MORE:
		return scoutfs_ioc_statfs_more(file, arg);
	case SCOUTFS_IOC_DATA_WAIT_ERR:
		return scoutfs_ioc_data_wait_err(file, arg);
	case SCOUTFS_IOC_ALLOC_DETAIL:
		return scoutfs_ioc_alloc_detail(file, arg);
	case SCOUTFS_IOC_MOVE_BLOCKS:
		return scoutfs_ioc_move_blocks(file, arg);
	case SCOUTFS_IOC_RESIZE_DEVICES:
		return scoutfs_ioc_resize_devices(file, arg);
	case SCOUTFS_IOC_READ_XATTR_TOTALS:
		return scoutfs_ioc_read_xattr_totals(file, arg);
	case SCOUTFS_IOC_GET_ALLOCATED_INOS:
		return scoutfs_ioc_get_allocated_inos(file, arg);
	case SCOUTFS_IOC_GET_REFERRING_ENTRIES:
		return scoutfs_ioc_get_referring_entries(file, arg);
	case SCOUTFS_IOC_GET_ATTR_X:
		return scoutfs_ioc_get_attr_x(file, arg);
	case SCOUTFS_IOC_SET_ATTR_X:
		return scoutfs_ioc_set_attr_x(file, arg);
	case SCOUTFS_IOC_GET_QUOTA_RULES:
		return scoutfs_ioc_get_quota_rules(file, arg);
	case SCOUTFS_IOC_ADD_QUOTA_RULE:
		return scoutfs_ioc_mod_quota_rule(file, arg, true);
	case SCOUTFS_IOC_DEL_QUOTA_RULE:
		return scoutfs_ioc_mod_quota_rule(file, arg, false);
	case SCOUTFS_IOC_READ_XATTR_INDEX:
		return scoutfs_ioc_read_xattr_index(file, arg);
	}

	return -ENOTTY;
}