From 38e6f11ee4e0f6fee4a3d5e8f113bfea930835ce Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 13 Sep 2023 09:30:23 -0700 Subject: [PATCH] Add quota support Signed-off-by: Zach Brown --- kmod/src/Makefile | 1 + kmod/src/data.c | 4 + kmod/src/dir.c | 5 + kmod/src/file.c | 9 + kmod/src/format.h | 37 ++ kmod/src/ioctl.c | 80 +++ kmod/src/ioctl.h | 69 +++ kmod/src/lock.c | 15 + kmod/src/lock.h | 2 + kmod/src/quota.c | 1261 ++++++++++++++++++++++++++++++++++++++ kmod/src/quota.h | 48 ++ kmod/src/scoutfs_trace.h | 2 + kmod/src/super.c | 3 + kmod/src/super.h | 2 + kmod/src/trace/quota.h | 143 +++++ 15 files changed, 1681 insertions(+) create mode 100644 kmod/src/quota.c create mode 100644 kmod/src/quota.h create mode 100644 kmod/src/trace/quota.h diff --git a/kmod/src/Makefile b/kmod/src/Makefile index ca1b657e..fa632aa1 100644 --- a/kmod/src/Makefile +++ b/kmod/src/Makefile @@ -35,6 +35,7 @@ scoutfs-y += \ options.o \ per_task.o \ quorum.o \ + quota.o \ recov.o \ scoutfs_trace.o \ server.o \ diff --git a/kmod/src/data.c b/kmod/src/data.c index 7d14c55a..dd10a06f 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1110,6 +1110,10 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) while(iblock <= last) { + ret = scoutfs_quota_check_data(sb, inode); + if (ret) + goto out_extent; + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret) goto out_extent; diff --git a/kmod/src/dir.c b/kmod/src/dir.c index c8417bdf..ca23d3a6 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -34,6 +34,7 @@ #include "forest.h" #include "acl.h" #include "counters.h" +#include "quota.h" #include "scoutfs_trace.h" /* @@ -651,6 +652,10 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry, if (ret) goto out_unlock; + ret = scoutfs_quota_check_inode(sb, dir); + if (ret) + goto out_unlock; + if (orph_lock) { ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock); if (ret < 0) diff --git 
a/kmod/src/file.c b/kmod/src/file.c index 3b6a2d12..d44cd822 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -28,6 +28,7 @@ #include "inode.h" #include "per_task.h" #include "omap.h" +#include "quota.h" #ifdef KC_LINUX_HAVE_FOP_AIO_READ /* @@ -126,6 +127,10 @@ retry: goto out; } + ret = scoutfs_quota_check_data(sb, inode); + if (ret) + goto out; + /* XXX: remove SUID bit */ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); @@ -228,6 +233,10 @@ retry: if (ret) goto out; + ret = scoutfs_quota_check_data(sb, inode); + if (ret) + goto out; + if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) { /* data_version is per inode, whole file must be online */ ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode), SEF_OFFLINE, diff --git a/kmod/src/format.h b/kmod/src/format.h index 7e283de8..47a3a7c9 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -13,6 +13,7 @@ #define SCOUTFS_FORMAT_VERSION_FEAT_RETENTION 2 #define SCOUTFS_FORMAT_VERSION_FEAT_PROJECT_ID 2 +#define SCOUTFS_FORMAT_VERSION_FEAT_QUOTA 2 /* statfs(2) f_type */ #define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */ @@ -178,6 +179,10 @@ struct scoutfs_key { #define sko_rid _sk_first #define sko_ino _sk_second +/* quota rules */ +#define skqr_hash _sk_second +#define skqr_coll_nr _sk_third + /* xattr totl */ #define skxt_a _sk_first #define skxt_b _sk_second @@ -588,6 +593,7 @@ struct scoutfs_log_merge_freeing { */ #define SCOUTFS_INODE_INDEX_ZONE 4 #define SCOUTFS_ORPHAN_ZONE 8 +#define SCOUTFS_QUOTA_ZONE 10 #define SCOUTFS_XATTR_TOTL_ZONE 12 #define SCOUTFS_FS_ZONE 16 #define SCOUTFS_LOCK_ZONE 20 @@ -611,6 +617,9 @@ struct scoutfs_log_merge_freeing { /* orphan zone, redundant type used for clarity */ #define SCOUTFS_ORPHAN_TYPE 4 +/* quota zone */ +#define SCOUTFS_QUOTA_RULE_TYPE 4 + /* fs zone */ #define SCOUTFS_INODE_TYPE 4 #define SCOUTFS_XATTR_TYPE 8 @@ -664,6 +673,34 @@ struct scoutfs_xattr_totl_val { __le64 count; }; +#define SQ_RF_TOTL_COUNT (1 
<< 0) +#define SQ_RF__UNKNOWN (~((1 << 1) - 1)) + +#define SQ_NS_LITERAL 0 +#define SQ_NS_PROJ 1 +#define SQ_NS_UID 2 +#define SQ_NS_GID 3 +#define SQ_NS__NR 4 +#define SQ_NS__NR_SELECT (SQ_NS__NR - 1) /* !literal */ + +#define SQ_NF_SELECT (1 << 0) +#define SQ_NF__UNKNOWN (~((1 << 1) - 1)) + +#define SQ_OP_INODE 0 +#define SQ_OP_DATA 1 +#define SQ_OP__NR 2 + +struct scoutfs_quota_rule_val { + __le64 name_val[3]; + __le64 limit; + __u8 prio; + __u8 op; + __u8 rule_flags; + __u8 name_source[3]; + __u8 name_flags[3]; + __u8 _pad[7]; +}; + /* XXX does this exist upstream somewhere? */ #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER)) diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index c99580d8..ceadf91f 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -45,6 +45,7 @@ #include "attr_x.h" #include "totl.h" #include "wkic.h" +#include "quota.h" #include "scoutfs_trace.h" /* @@ -1387,6 +1388,79 @@ out: return ret; } +static long scoutfs_ioc_get_quota_rules(struct file *file, unsigned long arg) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct scoutfs_ioctl_get_quota_rules __user *ugqr = (void __user *)arg; + struct scoutfs_ioctl_get_quota_rules gqr; + struct scoutfs_ioctl_quota_rule __user *uirules; + struct scoutfs_ioctl_quota_rule *irules; + struct page *page = NULL; + int copied = 0; + int nr; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&gqr, ugqr, sizeof(gqr))) + return -EFAULT; + + if (gqr.rules_nr == 0) + return 0; + + uirules = (void __user *)gqr.rules_ptr; + /* limit rules copied per call */ + gqr.rules_nr = min_t(u64, gqr.rules_nr, INT_MAX); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + ret = -ENOMEM; + goto out; + } + irules = page_address(page); + + while (copied < gqr.rules_nr) { + nr = min_t(u64, gqr.rules_nr - copied, + PAGE_SIZE / sizeof(struct scoutfs_ioctl_quota_rule)); + ret = scoutfs_quota_get_rules(sb, gqr.iterator, page_address(page), nr); + if (ret <= 0) 
+ goto out; + + if (copy_to_user(&uirules[copied], irules, ret * sizeof(irules[0]))) { + ret = -EFAULT; + goto out; + } + + copied += ret; + } + + ret = 0; +out: + if (page) + __free_page(page); + + if (ret == 0 && copy_to_user(ugqr->iterator, gqr.iterator, sizeof(gqr.iterator))) + ret = -EFAULT; + + return ret ?: copied; +} + +static long scoutfs_ioc_mod_quota_rule(struct file *file, unsigned long arg, bool is_add) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct scoutfs_ioctl_quota_rule __user *uirule = (void __user *)arg; + struct scoutfs_ioctl_quota_rule irule; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&irule, uirule, sizeof(irule))) + return -EFAULT; + + return scoutfs_quota_mod_rule(sb, is_add, &irule); +} + long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1428,6 +1502,12 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_get_attr_x(file, arg); case SCOUTFS_IOC_SET_ATTR_X: return scoutfs_ioc_set_attr_x(file, arg); + case SCOUTFS_IOC_GET_QUOTA_RULES: + return scoutfs_ioc_get_quota_rules(file, arg); + case SCOUTFS_IOC_ADD_QUOTA_RULE: + return scoutfs_ioc_mod_quota_rule(file, arg, true); + case SCOUTFS_IOC_DEL_QUOTA_RULE: + return scoutfs_ioc_mod_quota_rule(file, arg, false); } return -ENOTTY; diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 9a6b9b30..69cf05c3 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -750,4 +750,73 @@ struct scoutfs_ioctl_inode_attr_x { #define SCOUTFS_IOC_SET_ATTR_X \ _IOW(SCOUTFS_IOCTL_MAGIC, 19, struct scoutfs_ioctl_inode_attr_x) +/* + * (These fields are documented in the order that they're displayed by + * the scoutfs cli utility which matches the sort order of the rules.) + * + * @prio: The priority of the rule. Rules are sorted by their fields + * with prio at the highest magnitude. When multiple rules match the + * rule with the highest sort order is enforced. 
The priority field + * lets rules override the default field sort order. + * + * @name_val[3]: The three 64bit values that make up the name of the + * totl xattr whose total will be checked against the rule's limit to + * see if the quota rule has been exceeded. The behavior of the values + * can be changed by their corresponding name_source and name_flags. + * + * @name_source[3]: The SQ_NS_ enums that control where the value comes + * from. _LITERAL uses the value from name_val. Inode attribute + * sources (_PROJ, _UID, _GID) are taken from the inode of the operation + * that is being checked against the rule. + * + * @name_flags[3]: The SQ_NF_ enums that alter the name values. _SELECT + * makes the rule only match if the inode attribute of the operation + * matches the attribute value stored in name_val. This lets rules + * match a specific value of an attribute rather than mapping all + * attribute values to totl names. + * + * @op: The SQ_OP_ enums which specify the operation that can't exceed + * the rule's limit. _INODE checks inode creation and the inode + * attributes are taken from the inode that would be created. _DATA + * checks file data block allocation and the inode fields come from the + * inode that is allocating the blocks. + * + * @limit: The 64bit value that is checked against the totl value + * described by the rule. If the totl value is greater than or equal to + * this value of the matching rule then the operation will return + * -EDQUOT. + * + * @rule_flags: SQ_RF_TOTL_COUNT indicates that the rule's limit should + * be checked against the number of xattrs contributing to a totl value + * instead of the sum of the xattrs. 
+ */ +struct scoutfs_ioctl_quota_rule { + __u64 name_val[3]; + __u64 limit; + __u8 prio; + __u8 op; + __u8 rule_flags; + __u8 name_source[3]; + __u8 name_flags[3]; + __u8 _pad[7]; +}; + +struct scoutfs_ioctl_get_quota_rules { + __u64 iterator[2]; + __u64 rules_ptr; + __u64 rules_nr; +}; + +/* + * Rules are uniquely identified by their non-padded fields. Addition will fail + * with -EEXIST if the specified rule already exists and deletion must find a rule + * with all matching fields to delete. + */ +#define SCOUTFS_IOC_GET_QUOTA_RULES \ + _IOR(SCOUTFS_IOCTL_MAGIC, 20, struct scoutfs_ioctl_get_quota_rules) +#define SCOUTFS_IOC_ADD_QUOTA_RULE \ + _IOW(SCOUTFS_IOCTL_MAGIC, 21, struct scoutfs_ioctl_quota_rule) +#define SCOUTFS_IOC_DEL_QUOTA_RULE \ + _IOW(SCOUTFS_IOCTL_MAGIC, 22, struct scoutfs_ioctl_quota_rule) + #endif diff --git a/kmod/src/lock.c b/kmod/src/lock.c index b3c6b957..2398f3d9 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -37,6 +37,7 @@ #include "omap.h" #include "util.h" #include "totl.h" +#include "quota.h" /* * scoutfs uses a lock service to manage item cache consistency between @@ -186,6 +187,9 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock, return ret; } + if (lock->start.sk_zone == SCOUTFS_QUOTA_ZONE && !lock_mode_can_read(mode)) + scoutfs_quota_invalidate(sb); + /* have to invalidate if we're not in the only usable case */ if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) { retry: @@ -1250,6 +1254,17 @@ int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, return lock_key_range(sb, mode, flags, &start, &end, lock); } +int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, + struct scoutfs_lock **lock) +{ + struct scoutfs_key start; + struct scoutfs_key end; + + scoutfs_quota_get_lock_range(&start, &end); + + return lock_key_range(sb, mode, flags, &start, &end, lock); +} + void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock 
*lock, enum scoutfs_lock_mode mode) { DECLARE_LOCK_INFO(sb, linfo); diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 4ba50bcd..1b49e534 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -86,6 +86,8 @@ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int u64 ino, struct scoutfs_lock **lock); int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, struct scoutfs_lock **lock); +int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, + struct scoutfs_lock **lock); void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode); diff --git a/kmod/src/quota.c b/kmod/src/quota.c new file mode 100644 index 00000000..c3e3839b --- /dev/null +++ b/kmod/src/quota.c @@ -0,0 +1,1261 @@ +/* + * Copyright (C) 2023 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "format.h" +#include "super.h" +#include "lock.h" +#include "hash.h" +#include "inode.h" +#include "item.h" +#include "ioctl.h" +#include "cmp.h" +#include "wkic.h" +#include "xattr.h" +#include "totl.h" +#include "util.h" +#include "quota.h" +#include "scoutfs_trace.h" + +/* + * scoutfs quotas let userspace manage accounting and rules which + * specify when operations should fail because a quota is exceeded. + * + * Userspace is responsible for managing the .totl. 
xattrs that + * accumulate counts and totals that can be checked to enforce quotas. + * Userspace then builds quota rules that map operations to totl names + * and limits. This puts userspace entirely in control of the quota + * policy. + * + * The quota checks are specifically allowed to use slightly stale data + * to avoid global locking bottlenecks. + * + * Rules are stored as items in the main fs btree and are subject to strict + * consistency cluster locking. After any change to rules all the rules + * will be read in again and processed for checking. + * + * The .totl. xattrs are not read under cluster locking to avoid lock + * contention. They're read using the weak item cache which expires + * only on a timeout. This leads to a regular background load of weak + * reads of the item totls as they're updated at the frequency of the + * cache expiration. + */ + +#define CACHE_AGE_MS (5 * MSEC_PER_SEC) + +/* + * Rules are stored in trees whose nodes are keyed by their input + * matching criteria. The trees are not modified once they're visible + * to readers. RCU is used to free the trees once all the readers have + * finished. 
+ */ +struct squota_ruleset { + struct rcu_head rcu; + struct rb_root roots[SQ_NS__NR_SELECT]; + struct squota_rule *defaults[SQ_OP__NR]; +}; + +struct squota_info { + struct super_block *sb; + struct squota_ruleset __rcu *ruleset; /* ENOENT, EINVAL, EBUSY, or valid ptr */ + struct rhashtable check_ht; + atomic64_t nr_checks; + + struct rw_semaphore rwsem; + spinlock_t lock; + wait_queue_head_t waitq; + KC_DEFINE_SHRINKER(shrinker); + struct dentry *drop_dentry; +}; + +#define DECLARE_QUOTA_INFO(sb, name) \ + struct squota_info *name = SCOUTFS_SB(sb)->squota_info + +static inline int quota_unsupported(struct super_block *sb) +{ + return scoutfs_fmt_vers_unsupported(sb, SCOUTFS_FORMAT_VERSION_FEAT_QUOTA); +} + +struct squota_check { + struct rcu_head rcu; + struct rhash_head head; + struct squota_input inp; + ktime_t expiration; + int result; +}; + +static const struct rhashtable_params check_ht_params = { + .key_len = member_sizeof(struct squota_check, inp), + .key_offset = offsetof(struct squota_check, inp), + .head_offset = offsetof(struct squota_check, head), +}; + +static bool get_cached_check(struct squota_info *qtinf, struct squota_input *inp, int *result) +{ + struct squota_check *chk; + bool got; + + if (WARN_ON_ONCE(!rcu_read_lock_held())) + return false; + + chk = rhashtable_lookup(&qtinf->check_ht, inp, check_ht_params); + if (chk && ktime_after(chk->expiration, ktime_get_raw())) { + *result = chk->result; + got = true; + } else { + *result = 0; + got = false; + } + + return got; +} + +/* + * Insert a new cached check. If a cached check already exists its + * either timed out or was inserted very recently so either can be used. + * We abandon the insertion attempt on other errors, including + * allocation failures and insertion failure from a pending hash table + * resize. 
+ */ +static void insert_cached_check(struct squota_info *qtinf, struct squota_input *inp, int result) +{ + struct squota_check *found; + struct squota_check *chk; + int ret; + + /* zero full size for hash table memcmp */ + chk = kzalloc(sizeof(struct squota_check), GFP_NOFS); + if (!chk) + return; + + chk->inp = *inp; + chk->expiration = ktime_add_ms(ktime_get_raw(), CACHE_AGE_MS); + chk->result = result; + + while (chk) { + ret = rhashtable_lookup_insert_fast(&qtinf->check_ht, &chk->head, + check_ht_params); + if (ret == 0) { + atomic64_inc(&qtinf->nr_checks); + chk = NULL; + + } else if (ret == -EEXIST) { + /* try to free older insertion or existing */ + rcu_read_lock(); + found = rhashtable_lookup(&qtinf->check_ht, inp, check_ht_params); + if (found) { + if (ktime_before(found->expiration, chk->expiration)) { + if (rhashtable_remove_fast(&qtinf->check_ht, + &found->head, + check_ht_params) == 0) { + kfree_rcu(found, rcu); + atomic64_dec(&qtinf->nr_checks); + } + } else { + kfree(chk); + chk = NULL; + } + } + rcu_read_unlock(); + + } else { + kfree(chk); + chk = NULL; + } + } +} + +/* + * Return a random cached check from the hash table. We sweep the + * buckets from a random starting point and return the first we find, + * continuing from the next table if it's resizing. This is sort of + * like the _walk_ api but we can set the starting point and it doesn't + * return -EAGAIN while resizing. 
+ */ +static struct squota_check *lookup_random_check(struct rhashtable *rht) +{ + struct bucket_table *tbl; + struct squota_check *chk; + struct rhash_head *pos; + unsigned long s; + unsigned long i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + tbl = rht_dereference_rcu(rht->tbl, rht); + do { + for (s = 0, i = prandom_u32_max(tbl->size); + s < tbl->size; + s++, i = (i + 1) % tbl->size) { + rht_for_each_entry_rcu(chk, pos, tbl, i, head) { + return chk; + } + } + } while (!IS_ERR_OR_NULL((tbl = rht_dereference_rcu(tbl->future_tbl, rht)))); + + return NULL; +} + +static unsigned long count_cached_checks(struct shrinker *shrink, struct shrink_control *sc) +{ + struct squota_info *qtinf = KC_SHRINKER_CONTAINER_OF(shrink, struct squota_info); + + return shrinker_min_long(atomic64_read(&qtinf->nr_checks)); +} + +/* + * We don't bother with any precise replacement mechanism. We choose + * cached check results to drop at random. If the cache is large then + * random choices are unlikely to have been used again. If the cache is + * small then any choices end up blowing away most of the cache. 
+ */ +static unsigned long scan_cached_checks(struct shrinker *shrink, struct shrink_control *sc) +{ + struct squota_info *qtinf = KC_SHRINKER_CONTAINER_OF(shrink, struct squota_info); + unsigned long nr = sc->nr_to_scan; + unsigned int retries = 10; + unsigned long freed = 0; + struct squota_check *chk; + int err; + + rcu_read_lock(); + + while (nr > 0 && retries > 0 && (chk = lookup_random_check(&qtinf->check_ht))) { + err = rhashtable_remove_fast(&qtinf->check_ht, &chk->head, check_ht_params); + if (err) { + retries--; + continue; + } + + kfree_rcu(chk, rcu); + atomic64_dec(&qtinf->nr_checks); + freed++; + nr--; + } + + rcu_read_unlock(); + + if (retries == 0 && freed == 0) + freed = SHRINK_STOP; + + return freed; +} + +static void shrink_all_cached_checks(struct squota_info *qtinf) +{ + struct shrink_control sc = { .nr_to_scan = LONG_MAX, }; + + scan_cached_checks(KC_SHRINKER_FN(&qtinf->shrinker), &sc); +} + +static u8 ns_is_attr(u8 ns) +{ + switch (ns) { + case SQ_NS_PROJ: + case SQ_NS_UID: + case SQ_NS_GID: + return true; + default: + return false; + } +} + +/* rule validation has made sure these derefs are safe */ +static u8 ns_to_attr(u8 ns) +{ + static u8 ind[] = { + [SQ_NS_PROJ] = 0, + [SQ_NS_UID] = 1, + [SQ_NS_GID] = 2, + }; + + return ind[ns]; +} + +static void rule_to_rule_val(struct scoutfs_quota_rule_val *rv, struct squota_rule *rule) +{ + rv->limit = cpu_to_le64(rule->limit); + rv->prio = rule->prio; + rv->op = rule->op; + rv->rule_flags = rule->rule_flags; + rv->name_val[0] = cpu_to_le64(rule->names[0].val); + rv->name_source[0] = rule->names[0].source; + rv->name_flags[0] = rule->names[0].flags; + rv->name_val[1] = cpu_to_le64(rule->names[1].val); + rv->name_source[1] = rule->names[1].source; + rv->name_flags[1] = rule->names[1].flags; + rv->name_val[2] = cpu_to_le64(rule->names[2].val); + rv->name_source[2] = rule->names[2].source; + rv->name_flags[2] = rule->names[2].flags; + memset(&rv->_pad, 0, sizeof(rv->_pad)); +} + +static void 
rule_to_irule(struct scoutfs_ioctl_quota_rule *irule, struct squota_rule *rule) +{ + irule->limit = rule->limit; + irule->prio = rule->prio; + irule->op = rule->op; + irule->rule_flags = rule->rule_flags; + irule->name_val[0] = rule->names[0].val; + irule->name_source[0] = rule->names[0].source; + irule->name_flags[0] = rule->names[0].flags; + irule->name_val[1] = rule->names[1].val; + irule->name_source[1] = rule->names[1].source; + irule->name_flags[1] = rule->names[1].flags; + irule->name_val[2] = rule->names[2].val; + irule->name_source[2] = rule->names[2].source; + irule->name_flags[2] = rule->names[2].flags; + memset(&irule->_pad, 0, sizeof(irule->_pad)); +} + +/* + * We verify rules coming from untrusted ioctls/storage, false means the rule must be rejected. + */ +static bool valid_rule(struct squota_rule *rule) +{ + struct squota_rule_name *other; + struct squota_rule_name *name; + int i; + int j; + + /* op indexes SQ_OP__NR sized arrays, so SQ_OP__NR itself is invalid */ + if (rule->op >= SQ_OP__NR) + return false; + + if (rule->rule_flags & SQ_RF__UNKNOWN) + return false; + + for (i = 0; i < ARRAY_SIZE(rule->names); i++) { + name = &rule->names[i]; + + /* unknown name flags */ + if (name->flags & SQ_NF__UNKNOWN) + return false; + + if ((name->flags & SQ_NF_SELECT)) { + /* can only select sources that are inode attributes */ + if (!ns_is_attr(name->source)) + return false; + + for (j = 0; j < ARRAY_SIZE(rule->names); j++) { + if (i == j) + continue; + other = &rule->names[j]; + + /* can't select different values of same attr */ + if ((other->flags & SQ_NF_SELECT) && + name->source == other->source && + name->val != other->val) { + return false; + } + } + } + } + + return true; +} + +static int rule_val_to_rule(struct squota_rule *rule, struct scoutfs_quota_rule_val *rv, + int bytes) +{ + if (bytes != sizeof(struct scoutfs_quota_rule_val)) + return -EIO; + + rule->limit = le64_to_cpu(rv->limit); + rule->prio = rv->prio; + rule->op = rv->op; + rule->rule_flags = rv->rule_flags; + rule->names[0].val = le64_to_cpu(rv->name_val[0]); + 
rule->names[0].source = rv->name_source[0]; + rule->names[0].flags = rv->name_flags[0]; + rule->names[1].val = le64_to_cpu(rv->name_val[1]); + rule->names[1].source = rv->name_source[1]; + rule->names[1].flags = rv->name_flags[1]; + rule->names[2].val = le64_to_cpu(rv->name_val[2]); + rule->names[2].source = rv->name_source[2]; + rule->names[2].flags = rv->name_flags[2]; + + if (!valid_rule(rule)) + return -EIO; + + return 0; +} + +static int irule_to_rule(struct squota_rule *rule, struct scoutfs_ioctl_quota_rule *irule) +{ + rule->limit = irule->limit; + rule->prio = irule->prio; + rule->op = irule->op; + rule->rule_flags = irule->rule_flags; + rule->names[0].val = irule->name_val[0]; + rule->names[0].source = irule->name_source[0]; + rule->names[0].flags = irule->name_flags[0]; + rule->names[1].val = irule->name_val[1]; + rule->names[1].source = irule->name_source[1]; + rule->names[1].flags = irule->name_flags[1]; + rule->names[2].val = irule->name_val[2]; + rule->names[2].source = irule->name_source[2]; + rule->names[2].flags = irule->name_flags[2]; + + if (!valid_rule(rule)) + return -EINVAL; + + return 0; +} + +static void init_rule_key(struct scoutfs_key *key, u64 hash, u64 coll_nr) +{ + *key = (struct scoutfs_key) { + .sk_zone = SCOUTFS_QUOTA_ZONE, + .sk_type = SCOUTFS_QUOTA_RULE_TYPE, + .skqr_hash = cpu_to_le64(hash), + .skqr_coll_nr = cpu_to_le64(coll_nr), + }; +} + +static void rule_to_key(struct scoutfs_key *key, struct squota_rule *rule) +{ + struct scoutfs_quota_rule_val rv; + + rule_to_rule_val(&rv, rule); + init_rule_key(key, scoutfs_hash64(&rv, sizeof(rv)), 0); +} + +/* + * Callers specifically want to increase keys by increasing the + * collision nr, not just incing the key. 
+ */ +static void inc_coll_nr(struct scoutfs_key *key) +{ + le64_add_cpu(&key->skqr_coll_nr, 1); + if (key->skqr_coll_nr == 0) + le64_add_cpu(&key->skqr_hash, 1); +} + +/* + * Rules have a defined sort order that determines matching priority + * when multiple rules match an input. + */ +static int cmp_rules(struct squota_rule *a, struct squota_rule *b) +{ + return scoutfs_cmp(a->prio, b->prio) ?: + scoutfs_cmp(a->names[0].val, b->names[0].val) ?: + scoutfs_cmp(a->names[0].source, b->names[0].source) ?: + scoutfs_cmp(a->names[0].flags, b->names[0].flags) ?: + scoutfs_cmp(a->names[1].val, b->names[1].val) ?: + scoutfs_cmp(a->names[1].source, b->names[1].source) ?: + scoutfs_cmp(a->names[1].flags, b->names[1].flags) ?: + scoutfs_cmp(a->names[2].val, b->names[2].val) ?: + scoutfs_cmp(a->names[2].source, b->names[2].source) ?: + scoutfs_cmp(a->names[2].flags, b->names[2].flags) ?: + scoutfs_cmp(a->op, b->op) ?: + scoutfs_cmp(a->limit, b->limit) ?: + scoutfs_cmp(a->rule_flags, b->rule_flags); +} + +static struct squota_rule *name_to_rule(struct squota_rule_name *name) +{ + return container_of(name, struct squota_rule, names[name->i]); +} + +static bool unlinked_rule(struct squota_rule *rule) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rule->names); i++) { + if (!RB_EMPTY_NODE(&rule->names[i].node)) + return false; + } + + return true; +} + +static void free_ruleset(struct squota_ruleset *rs) +{ + struct squota_rule_name *name; + struct squota_rule_name *name_; + struct squota_rule *rule; + int i; + + if (!IS_ERR_OR_NULL(rs)) { + for (i = 0; i < ARRAY_SIZE(rs->roots); i++) { + rbtree_postorder_for_each_entry_safe(name, name_, &rs->roots[i], node) { + RB_CLEAR_NODE(&name->node); + + rule = name_to_rule(name); + if (unlinked_rule(rule)) + kfree(rule); + } + } + + for (i = 0; i < ARRAY_SIZE(rs->defaults); i++) + kfree(rs->defaults[i]); + + kfree(rs); + } +} + +static void free_ruleset_rcu(struct rcu_head *rcu) +{ + struct squota_ruleset *rs = container_of(rcu, struct 
squota_ruleset, rcu); + + free_ruleset(rs); +} + +static bool empty_ruleset(struct squota_ruleset *rs) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rs->roots); i++) { + if (!RB_EMPTY_ROOT(&rs->roots[i])) + return false; + } + for (i = 0; i < ARRAY_SIZE(rs->defaults); i++) { + if (rs->defaults[i]) + return false; + } + + return true; +} + +/* + * Walk a rule tree for a given matching attr. Each tree only contains + * names which select on the tree's attr so we only have to compare each + * name's value, not its flags or source. + * + * The tree allows multiple names with a given val. The first match is + * found and callers can iterate through all matches with _next. + */ +static struct squota_rule_name *walk_rule_tree(struct rb_root *root, u64 val, + struct squota_rule_name *ins) +{ + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct squota_rule_name *found = NULL; + struct squota_rule_name *name; + int cmp; + + while (*node) { + parent = *node; + name = container_of(*node, struct squota_rule_name, node); + + cmp = scoutfs_cmp(name->val, val); + if (cmp < 0) { + node = &(*node)->rb_left; + } else if (cmp > 0) { + node = &(*node)->rb_right; + } else { + found = name; + node = &(*node)->rb_left; + } + } + + if (ins) { + rb_link_node(&ins->node, parent, node); + rb_insert_color(&ins->node, root); + } + + return found; +} + +/* + * Return the next name in the ruleset attr tree that matches the val. + * All the nodes match this attribute, so we only have to compare the + * val. 
+ */ +static struct squota_rule_name *next_val_name(struct squota_rule_name *name) +{ + struct squota_rule_name *next; + struct rb_node *node; + + if (!name || RB_EMPTY_NODE(&name->node)) + return NULL; + + node = rb_next(&name->node); + if (node) { + next = container_of(node, struct squota_rule_name, node); + if (next->val == name->val) + return next; + } + + return NULL; +} + +static bool ruleset_is_busy(struct squota_info *qtinf) +{ + bool busy; + + rcu_read_lock(); + busy = rcu_dereference(qtinf->ruleset) == ERR_PTR(-EBUSY); + rcu_read_unlock(); + + return busy; +} + +/* + * The caller found that we didn't have a valid ruleset and wants us to + * read in a new ruleset. + * + * We get exclusive access to the rules by marking the ruleset pointer + * busy, possibly waiting for someone else to finish if they beat us to + * it. If we get exclusive access then we walk all the rule items and + * build up a rule set and publish it for use. + */ +static int read_ruleset(struct super_block *sb, struct squota_info *qtinf) +{ + struct scoutfs_lock *lock = NULL; + struct squota_ruleset *rs = NULL; + struct scoutfs_quota_rule_val rv; + struct squota_rule *rule = NULL; + struct squota_rule_name *name; + struct scoutfs_key key; + struct scoutfs_key end; + bool reading = false; + int ret; + int i; + + ret = scoutfs_lock_quota(sb, SCOUTFS_LOCK_READ, 0, &lock); + if (ret < 0) + goto out; + + spin_lock(&qtinf->lock); + rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock)); + if (rs == ERR_PTR(-EINVAL)) { + rs = ERR_PTR(-EBUSY); + rcu_assign_pointer(qtinf->ruleset, rs); + reading = true; + } + spin_unlock(&qtinf->lock); + + if (!reading) { + wait_event(qtinf->waitq, !ruleset_is_busy(qtinf)); + ret = 0; + goto out; + } + + rs = kzalloc(sizeof(struct squota_ruleset), GFP_NOFS); + if (!rs) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(rs->roots); i++) + rs->roots[i] = RB_ROOT; + + init_rule_key(&key, 0, 0); + init_rule_key(&end, U64_MAX, 
U64_MAX); + + for (;;) { + if (!rule) { + rule = kmalloc(sizeof(struct squota_rule), GFP_NOFS); + if (!rule) { + ret = -ENOMEM; + goto out; + } + } + + ret = scoutfs_item_next(sb, &key, &end, &rv, sizeof(rv), lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + ret = rule_val_to_rule(rule, &rv, ret); + if (ret < 0) + goto out; + + /* insert rule into attr tree if any of its names select */ + for (i = 0; i < ARRAY_SIZE(rule->names); i++) { + name = &rule->names[i]; + name->i = i; + + if (name->flags & SQ_NF_SELECT) { + walk_rule_tree(&rs->roots[ns_to_attr(name->source)], + name->val, name); + } else { + RB_CLEAR_NODE(&name->node); + } + } + + if (!unlinked_rule(rule)) + rule = NULL; + + /* remember highest priority unlinked (default) rule, freeing any it replaces */ + if (rule && + (!rs->defaults[rule->op] || cmp_rules(rule, rs->defaults[rule->op]) > 0)) { + kfree(rs->defaults[rule->op]); + rs->defaults[rule->op] = rule; + rule = NULL; + } + + inc_coll_nr(&key); + } + +out: + if (reading) { + if (ret == 0 && empty_ruleset(rs)) { + free_ruleset(rs); + rs = ERR_PTR(-ENOENT); + } + + if (ret < 0) { + free_ruleset(rs); + rs = ERR_PTR(-EINVAL); + } + + spin_lock(&qtinf->lock); + rcu_assign_pointer(qtinf->ruleset, rs); + spin_unlock(&qtinf->lock); + wake_up(&qtinf->waitq); + } + + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + + kfree(rule); + + return ret; +} + +/* + * A rule matches input when the ops match and all of the rule's key + * name selectors match the input -- non-selecting key names always + * match. 
+ */
+static bool rule_matches(struct squota_input *inp, struct squota_rule *rule)
+{
+	struct squota_rule_name *name;
+	int i;
+
+	if (inp->op != rule->op)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(rule->names); i++) {
+		name = &rule->names[i];
+
+		/* only selecting names constrain the input */
+		if ((name->flags & SQ_NF_SELECT) &&
+		    (inp->attrs[ns_to_attr(name->source)] != name->val))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * What check_rules() hands back for a matching rule: the three values
+ * that name the totl item to look up, the limit to compare it with,
+ * and the rule flags that pick which totl field is compared.
+ */
+struct squota_totl_check {
+	u64 totl[3];
+	u64 limit;
+	u8 rule_flags;
+};
+
+/*
+ * Check the rules against the caller's inputs. We start with the
+ * highest priority default rule for the operation then search all the
+ * rules that select for any of the input's attrs and use the highest
+ * priority match.
+ *
+ * If we find a matching rule then we give the caller the totl xattr
+ * name and limit to check.
+ *
+ * Returns true and fills tc if a rule matched, false otherwise. The
+ * caller must hold the rcu read lock.
+ */
+static bool check_rules(struct squota_ruleset *rs, struct squota_input *inp,
+			struct squota_totl_check *tc)
+{
+	struct squota_rule_name *name;
+	struct squota_rule *match;
+	struct squota_rule *rule;
+	int i;
+
+	if (WARN_ON_ONCE(!rcu_read_lock_held()))
+		return false;
+
+	match = rs->defaults[inp->op];
+
+	for (i = 0; i < SQ_NS__NR_SELECT; i++) {
+		name = walk_rule_tree(&rs->roots[i], inp->attrs[i], NULL);
+		while (name) {
+			rule = name_to_rule(name);
+			if (rule_matches(inp, rule) && (!match || cmp_rules(rule, match) > 0))
+				match = rule;
+			name = next_val_name(name);
+		}
+	}
+
+	if (match) {
+		for (i = 0; i < ARRAY_SIZE(match->names); i++) {
+			name = &match->names[i];
+
+			if (ns_is_attr(name->source))
+				tc->totl[i] = inp->attrs[ns_to_attr(name->source)];
+			else
+				tc->totl[i] = name->val; /* LITERAL is only non-attr source */
+		}
+
+		tc->limit = match->limit;
+		tc->rule_flags = match->rule_flags;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Iteration callback for the totl item named by a matching rule.
+ * Returns -EIO if the value isn't a totl val, -EDQUOT once the count or
+ * total selected by the rule flags has reached the limit, and 0
+ * otherwise.
+ */
+static int check_totl_cb(struct scoutfs_key *key, void *val, unsigned int val_len, void *cb_arg)
+{
+	struct scoutfs_xattr_totl_val *tval = val;
+	struct squota_totl_check *tc = cb_arg;
+	u64 use;
+
+	if (val_len != sizeof(struct scoutfs_xattr_totl_val))
+		return -EIO;
+
+	if (tc->rule_flags & SQ_RF_TOTL_COUNT)
+		use = le64_to_cpu(tval->count);
+	else
+		use = le64_to_cpu(tval->total);
+
+	return use >= tc->limit ? -EDQUOT : 0;
+}
+
+/*
+ * Check that operations can be performed on the given inode. The rules
+ * are protected by cluster locking and re-read any time the lock is
+ * revoked. The xattr totl items are read from the weak item cache and
+ * can be a little out of date. Check results are also cached so we can
+ * rely on those while the current persistent items would produce a
+ * different result.
+ *
+ * Returns 0 if the operation is allowed, -EDQUOT if a matching rule's
+ * limit has been reached, or another -errno on failure.
+ */
+static int check_inputs(struct super_block *sb, struct squota_input *inp)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs = NULL;
+	struct scoutfs_key range_start;
+	struct scoutfs_key range_end;
+	struct scoutfs_key key;
+	struct squota_totl_check tc;
+	bool found;
+	int ret;
+
+	rcu_read_lock();
+
+	/* quick fast path check when there are no quota rules */
+	rs = rcu_dereference(qtinf->ruleset);
+	if (rs == ERR_PTR(-ENOENT)) {
+		rcu_read_unlock();
+		ret = 0;
+		goto out;
+	}
+
+	/* see if we have a cached check result */
+	if (get_cached_check(qtinf, inp, &ret)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	/* get the current ruleset, blocking to lock+read if we need to read items */
+	while ((rs = rcu_dereference(qtinf->ruleset)),
+	       (rs == ERR_PTR(-EINVAL) || rs == ERR_PTR(-EBUSY))) {
+		rcu_read_unlock();
+
+		ret = read_ruleset(sb, qtinf);
+		if (ret < 0)
+			goto out;
+
+		rcu_read_lock();
+	}
+
+	/* see if we have a matching rule for our inputs */
+	if (!IS_ERR(rs))
+		found = check_rules(rs, inp, &tc);
+	else
+		found = false;
+
+	rcu_read_unlock();
+
+	/* check if the totl limit was exceeded if we found a rule */
+	if (found) {
+		scoutfs_totl_set_range(&range_start, &range_end);
+		scoutfs_xattr_init_totl_key(&key, tc.totl);
+
+		ret = scoutfs_wkic_iterate(sb, &key, &key, &range_start, &range_end,
+					   check_totl_cb, &tc);
+
+		trace_scoutfs_quota_totl_check(sb, inp, &key, tc.limit, ret);
+	} else {
+		ret = 0;
+	}
+
+	/* only definitive answers are cached, other errors are retried */
+	if (ret == 0 || ret == -EDQUOT)
+		insert_cached_check(qtinf, inp, ret);
+out:
+	/* rs is only traced as a value here, it is never dereferenced */
+	trace_scoutfs_quota_check(sb, (long)rs, inp, ret);
+	return ret;
+}
+
+/* Fill the check input with the attrs and operation that rules select on. */
+static void init_inp(struct squota_input *inp, u64 proj, u32 uid, u32 gid, u8 op)
+{
+	/* zero full size for hash table memcmp */
+	memset(inp, 0, sizeof(struct squota_input));
+
+	inp->attrs[ns_to_attr(SQ_NS_PROJ)] = proj;
+	inp->attrs[ns_to_attr(SQ_NS_UID)] = uid;
+	inp->attrs[ns_to_attr(SQ_NS_GID)] = gid;
+	inp->op = op;
+}
+
+/*
+ * The [ug]id initialization here mirrors init_inode_owner() but that
+ * takes a live inode struct and our cluster lock and transaction
+ * layering makes that awkward.
+ */
+int scoutfs_quota_check_inode(struct super_block *sb, struct inode *dir)
+{
+	struct squota_input inp;
+
+	if (quota_unsupported(sb))
+		return 0;
+
+	BUILD_BUG_ON(max(sizeof(uid_t), sizeof(gid_t)) > sizeof(u32));
+
+	init_inp(&inp, scoutfs_inode_get_proj(dir), from_kuid(&init_user_ns, current_fsuid()),
+		 (dir->i_mode & S_ISGID) ? i_gid_read(dir) :
+					    from_kgid(&init_user_ns, current_fsgid()),
+		 SQ_OP_INODE);
+
+	return check_inputs(sb, &inp);
+}
+
+/*
+ * Check a data operation against the rules using the project, uid and
+ * gid of the existing inode.
+ */
+int scoutfs_quota_check_data(struct super_block *sb, struct inode *inode)
+{
+	struct squota_input inp;
+
+	if (quota_unsupported(sb))
+		return 0;
+
+	init_inp(&inp, scoutfs_inode_get_proj(inode), i_uid_read(inode), i_gid_read(inode),
+		 SQ_OP_DATA);
+
+	return check_inputs(sb, &inp);
+}
+
+/*
+ * Read rules from the iterator position into the caller's irules
+ * buffer. We set the iterator to point past the last irules we return
+ * so that it can be used to continue iteration.
+ */
+int scoutfs_quota_get_rules(struct super_block *sb, u64 *iterator,
+			    struct scoutfs_ioctl_quota_rule *irules, int nr)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct scoutfs_lock *lock = NULL;
+	struct scoutfs_quota_rule_val val;
+	struct scoutfs_key last;
+	struct scoutfs_key pos;
+	struct squota_rule rule;
+	int nr_copied;
+	int err;
+
+	err = quota_unsupported(sb);
+	if (err)
+		return err;
+
+	/* nothing to copy, don't bother taking any locks */
+	if (nr == 0)
+		return 0;
+
+	err = scoutfs_lock_quota(sb, SCOUTFS_LOCK_READ, 0, &lock);
+	if (err < 0)
+		return err;
+
+	down_read(&qtinf->rwsem);
+
+	init_rule_key(&pos, iterator[0], iterator[1]);
+	init_rule_key(&last, U64_MAX, U64_MAX);
+
+	for (nr_copied = 0; nr_copied < nr; nr_copied++) {
+		err = scoutfs_item_next(sb, &pos, &last, &val, sizeof(val), lock);
+		if (err == -ENOENT) {
+			/* running out of rule items just ends iteration */
+			err = 0;
+			break;
+		}
+		if (err < 0)
+			break;
+
+		err = rule_val_to_rule(&rule, &val, err);
+		if (err < 0)
+			break;
+
+		rule_to_irule(&irules[nr_copied], &rule);
+
+		/* resume after the rule we just gave to the caller */
+		inc_coll_nr(&pos);
+		iterator[0] = le64_to_cpu(pos.skqr_hash);
+		iterator[1] = le64_to_cpu(pos.skqr_coll_nr);
+	}
+
+	up_read(&qtinf->rwsem);
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+
+	return err ? err : nr_copied;
+}
+
+/*
+ * Search through rule items with the search hash value looking for a
+ * match. The return key is set to either the rule we found or the next
+ * unused collision nr. Returns 0 if found, -ENOENT if not, and -errno
+ * for errors.
+ */
+static int find_rule(struct super_block *sb, struct squota_rule *rule, struct scoutfs_key *key_ret,
+		     struct scoutfs_lock *lock)
+{
+	struct scoutfs_quota_rule_val val;
+	struct squota_rule cand;
+	struct scoutfs_key last;
+	struct scoutfs_key pos;
+	int ret;
+
+	/* walk every collision nr that shares the rule's hash */
+	rule_to_key(&pos, rule);
+	last = pos;
+	last.skqr_coll_nr = cpu_to_le64(U64_MAX);
+
+	while ((ret = scoutfs_item_next(sb, &pos, &last, &val, sizeof(val), lock)) >= 0) {
+		ret = rule_val_to_rule(&cand, &val, ret);
+		/* stop on a bad item or on the rule we were looking for */
+		if (ret != 0 || cmp_rules(&cand, rule) == 0)
+			break;
+
+		inc_coll_nr(&pos);
+	}
+
+	*key_ret = pos;
+	return ret;
+}
+
+/*
+ * Modify a rule. This only operates on the persistent items. It holds
+ * a write cluster lock so it invalidates all other rules used by other
+ * nodes and also marks the local rules invalid. The next enforcement
+ * everywhere will re-read and process the full rule set. All this
+ * makes rule set modification expensive but it should be
+ * correspondingly rare.
+ */
+int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
+			   struct scoutfs_ioctl_quota_rule *irule)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct scoutfs_quota_rule_val rv;
+	struct scoutfs_lock *lock = NULL;
+	struct squota_rule rule;
+	struct scoutfs_key key;
+	int ret;
+
+	if ((ret = quota_unsupported(sb)))
+		return ret;
+
+	/*
+	 * The exit trace reads rule even when the conversion below
+	 * fails, so make sure it never sees uninitialized stack.
+	 */
+	memset(&rule, 0, sizeof(rule));
+
+	ret = irule_to_rule(&rule, irule);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_lock_quota(sb, SCOUTFS_LOCK_WRITE, 0, &lock);
+	if (ret < 0)
+		goto out;
+
+	down_write(&qtinf->rwsem);
+
+	if (is_add) {
+		/* adding wants the search to fail and leave key at a free collision nr */
+		ret = find_rule(sb, &rule, &key, lock);
+		if (ret == -ENOENT)
+			ret = 0;
+		else if (ret == 0)
+			ret = -EEXIST;
+		if (ret < 0)
+			goto unlock;
+
+		rule_to_rule_val(&rv, &rule);
+		ret = scoutfs_item_create(sb, &key, &rv, sizeof(rv), lock);
+		if (ret < 0)
+			goto unlock;
+
+	} else {
+		/* deleting needs the search to succeed, -ENOENT is returned to the caller */
+		ret = find_rule(sb, &rule, &key, lock) ?:
+		      scoutfs_item_delete(sb, &key, lock);
+		if (ret < 0)
+			goto unlock;
+	}
+
+	scoutfs_quota_invalidate(sb);
+	ret = 0;
+
+unlock:
+	up_write(&qtinf->rwsem);
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+
+out:
+	if (is_add)
+		trace_scoutfs_quota_add_rule(sb, &rule, ret);
+	else
+		trace_scoutfs_quota_del_rule(sb, &rule, ret);
+
+	return ret;
+}
+
+/* Set start and end to the first and last possible keys in the quota zone. */
+void scoutfs_quota_get_lock_range(struct scoutfs_key *start, struct scoutfs_key *end)
+{
+	scoutfs_key_set_zeros(start);
+	start->sk_zone = SCOUTFS_QUOTA_ZONE;
+
+	scoutfs_key_set_ones(end);
+	end->sk_zone = SCOUTFS_QUOTA_ZONE;
+}
+
+/*
+ * This is called during cluster lock invalidation to indicate that the
+ * ruleset is no longer protected by cluster locking and might have been
+ * modified. We mark the ruleset invalid and free it once all readers
+ * drain. The next check will acquire the cluster lock and read the
+ * rules. Because this is called during invalidation this is serialized
+ * with write holders of cluster locks so we can never see -EBUSY here.
+ */
+void scoutfs_quota_invalidate(struct super_block *sb)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs;
+
+	if (quota_unsupported(sb))
+		return;
+
+	rcu_read_lock();
+
+	spin_lock(&qtinf->lock);
+	rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock));
+	if (rs != ERR_PTR(-EINVAL))
+		rcu_assign_pointer(qtinf->ruleset, ERR_PTR(-EINVAL));
+	spin_unlock(&qtinf->lock);
+
+	/* cluster locking should have prevented this */
+	BUG_ON(rs == ERR_PTR(-EBUSY));
+
+	/* a real ruleset was just unpublished, free it after a grace period */
+	if (!IS_ERR(rs))
+		call_rcu(&rs->rcu, free_ruleset_rcu);
+
+	rcu_read_unlock();
+
+	shrink_all_cached_checks(qtinf);
+}
+
+/* Reads of the debugfs drop file always see an empty file. */
+static ssize_t quota_drop_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	return 0;
+}
+
+/*
+ * Any write to the debugfs drop file calls shrink_all_cached_checks().
+ * The written bytes are never looked at and are reported as consumed.
+ */
+static ssize_t quota_drop_write(struct file *file, const char __user *buf, size_t size,
+				loff_t *ppos)
+{
+	struct squota_info *qtinf = file_inode(file)->i_private;
+
+	shrink_all_cached_checks(qtinf);
+
+	return size;
+}
+
+static const struct file_operations quota_drop_fops = {
+	.read = quota_drop_read,
+	.write = quota_drop_write,
+};
+
+/*
+ * Allocate and initialize the per-super quota info. This does nothing
+ * and returns 0 when quota_unsupported() says quotas aren't available.
+ * Returns 0 on success or -errno, with nothing left allocated, on
+ * failure.
+ */
+int scoutfs_quota_setup(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct squota_info *qtinf = NULL;
+	int ret;
+
+	if (quota_unsupported(sb))
+		return 0;
+
+	qtinf = kzalloc(sizeof(struct squota_info), GFP_KERNEL);
+	if (!qtinf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = rhashtable_init(&qtinf->check_ht, &check_ht_params);
+	if (ret < 0) {
+		kfree(qtinf);
+		goto out;
+	}
+
+	/* qtinf is the file's private data, quota_drop_write() reads it back */
+	qtinf->drop_dentry = debugfs_create_file("drop_quota_check_cache", S_IFREG|S_IRUSR,
+						 sbi->debug_root, qtinf, &quota_drop_fops);
+	if (!qtinf->drop_dentry) {
+		rhashtable_destroy(&qtinf->check_ht);
+		kfree(qtinf);
+		return -ENOMEM;
+	}
+
+	qtinf->sb = sb;
+	/* -EINVAL makes the first check read the rules under the cluster lock */
+	RCU_INIT_POINTER(qtinf->ruleset, ERR_PTR(-EINVAL));
+	atomic64_set(&qtinf->nr_checks, 0);
+	init_rwsem(&qtinf->rwsem);
+	spin_lock_init(&qtinf->lock);
+	init_waitqueue_head(&qtinf->waitq);
+
+	KC_INIT_SHRINKER_FUNCS(&qtinf->shrinker, count_cached_checks, scan_cached_checks);
+	KC_REGISTER_SHRINKER(&qtinf->shrinker);
+
+	sbi->squota_info = qtinf;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/* rhashtable_free_and_destroy() callback that frees one cached check. */
+static void free_cached_check(void *ptr, void *arg)
+{
+	struct squota_check *chk = ptr;
+
+	kfree(chk);
+}
+
+/*
+ * Tear down everything scoutfs_quota_setup() built. This is safe to
+ * call when setup never allocated the quota info.
+ */
+void scoutfs_quota_destroy(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs;
+
+	if (qtinf) {
+		debugfs_remove(qtinf->drop_dentry);
+		KC_UNREGISTER_SHRINKER(&qtinf->shrinker);
+
+		spin_lock(&qtinf->lock);
+		rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock));
+		spin_unlock(&qtinf->lock);
+		if (!IS_ERR(rs))
+			free_ruleset(rs);
+
+		rhashtable_free_and_destroy(&qtinf->check_ht, free_cached_check, NULL);
+
+		kfree(qtinf);
+		sbi->squota_info = NULL;
+	}
+}
diff --git a/kmod/src/quota.h b/kmod/src/quota.h
new file mode 100644
index 00000000..324d8a36
--- /dev/null
+++ b/kmod/src/quota.h
@@ -0,0 +1,48 @@
+#ifndef _SCOUTFS_QUOTA_H_
+#define _SCOUTFS_QUOTA_H_
+
+#include "ioctl.h"
+
+/*
+ * Each rule's name can be in the ruleset's rbtree associated with the
+ * source attr that it selects. This lets checks only test rules that
+ * the inputs could match. The 'i' field indicates which name is in the
+ * tree so we can find the containing rule.
+ *
+ * This is mostly private to quota.c but we expose it for tracing.
+ */
+struct squota_rule {
+	u64 limit;
+	u8 prio;
+	u8 op;
+	u8 rule_flags;
+	struct squota_rule_name {
+		struct rb_node node;
+		u64 val;
+		u8 source;
+		u8 flags;
+		u8 i;	/* index of this name within names[] */
+	} names[3];
+};
+
+/* private to quota.c, only here for tracing */
+struct squota_input {
+	u64 attrs[SQ_NS__NR_SELECT];
+	u8 op;
+};
+
+int scoutfs_quota_check_inode(struct super_block *sb, struct inode *dir);
+int scoutfs_quota_check_data(struct super_block *sb, struct inode *inode);
+
+int scoutfs_quota_get_rules(struct super_block *sb, u64 *iterator,
+			    struct scoutfs_ioctl_quota_rule *irules, int nr);
+int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
+			   struct scoutfs_ioctl_quota_rule *irule);
+
+void scoutfs_quota_get_lock_range(struct scoutfs_key *start, struct scoutfs_key *end);
+void scoutfs_quota_invalidate(struct super_block *sb);
+
+int scoutfs_quota_setup(struct super_block *sb);
+void scoutfs_quota_destroy(struct super_block *sb);
+
+#endif
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index da75b03b..3fd4821f 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -37,7 +37,9 @@
 #include "net.h"
 #include "data.h"
 #include "ext.h"
+#include "quota.h"
 
+#include "trace/quota.h"
 #include "trace/wkic.h"
 
 struct lock_info;
diff --git a/kmod/src/super.c b/kmod/src/super.c
index f38af5ce..0086d7fb 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -50,6 +50,7 @@
 #include "fence.h"
 #include "xattr.h"
 #include "wkic.h"
+#include "quota.h"
 #include "scoutfs_trace.h"
 
 static struct dentry *scoutfs_debugfs_root;
@@ -195,6 +196,7 @@ static void scoutfs_put_super(struct super_block *sb)
 	scoutfs_shutdown_trans(sb);
 	scoutfs_volopt_destroy(sb);
 	scoutfs_client_destroy(sb);
+	scoutfs_quota_destroy(sb);
 	scoutfs_inode_destroy(sb);
 	scoutfs_wkic_destroy(sb);
 	scoutfs_item_destroy(sb);
@@ -548,6 +550,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 			scoutfs_item_setup(sb) ?:
 			scoutfs_wkic_setup(sb) ?:
 			scoutfs_inode_setup(sb) ?:
+			scoutfs_quota_setup(sb) ?:
 			scoutfs_data_setup(sb) ?:
 			scoutfs_setup_trans(sb) ?:
 			scoutfs_omap_setup(sb) ?:
diff --git a/kmod/src/super.h b/kmod/src/super.h
index bdbc1b81..03c6a6ea 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -31,6 +31,7 @@ struct omap_info;
 struct volopt_info;
 struct fence_info;
 struct wkic_info;
+struct squota_info;
 
 struct scoutfs_sb_info {
 	struct super_block *sb;
@@ -57,6 +58,7 @@ struct scoutfs_sb_info {
 	struct volopt_info *volopt_info;
 	struct item_cache_info *item_cache_info;
 	struct wkic_info *wkic_info;
+	struct squota_info *squota_info;
 	struct fence_info *fence_info;
 
 	/* tracks tasks waiting for data extents */
diff --git a/kmod/src/trace/quota.h b/kmod/src/trace/quota.h
new file mode 100644
index 00000000..983b318c
--- /dev/null
+++ b/kmod/src/trace/quota.h
@@ -0,0 +1,143 @@
+
+/*
+ * Tracing squota_input: SQI_ARGS takes a live input, SQI_ENTRY_ARGS the saved entry
+ */
+#define SQI_FMT "[%u %llu %llu %llu]"
+
+#define SQI_ARGS(i)						\
+	(i)->op, (i)->attrs[0], (i)->attrs[1], (i)->attrs[2]
+
+#define SQI_FIELDS(pref)					\
+	__array(__u64, pref##_attrs, SQ_NS__NR_SELECT)		\
+	__field(__u8, pref##_op)
+
+#define SQI_ASSIGN(pref, i)					\
+	__entry->pref##_attrs[0] = (i)->attrs[0];		\
+	__entry->pref##_attrs[1] = (i)->attrs[1];		\
+	__entry->pref##_attrs[2] = (i)->attrs[2];		\
+	__entry->pref##_op = (i)->op;
+
+#define SQI_ENTRY_ARGS(pref)					\
+	__entry->pref##_op, __entry->pref##_attrs[0],		\
+	__entry->pref##_attrs[1], __entry->pref##_attrs[2]
+
+/*
+ * Tracing squota_rule: SQR_ARGS takes a live rule, SQR_ENTRY_ARGS the saved entry
+ */
+#define SQR_FMT "[%u %llu,%u,%x %llu,%u,%x %llu,%u,%x %u %llu]"
+
+#define SQR_ARGS(r)								\
+	(r)->prio,								\
+	(r)->names[0].val, (r)->names[0].source, (r)->names[0].flags,		\
+	(r)->names[1].val, (r)->names[1].source, (r)->names[1].flags,		\
+	(r)->names[2].val, (r)->names[2].source, (r)->names[2].flags,		\
+	(r)->op, (r)->limit
+
+#define SQR_FIELDS(pref)					\
+	__array(__u64, pref##_name_val, 3)			\
+	__field(__u64, pref##_limit)				\
+	__array(__u8, pref##_name_source, 3)			\
+	__array(__u8, pref##_name_flags, 3)			\
+	__field(__u8, pref##_prio)				\
+	__field(__u8, pref##_op)
+
+#define SQR_ASSIGN(pref, r)					\
+	__entry->pref##_name_val[0] = (r)->names[0].val;	\
+	__entry->pref##_name_val[1] = (r)->names[1].val;	\
+	__entry->pref##_name_val[2] = (r)->names[2].val;	\
+	__entry->pref##_limit = (r)->limit;			\
+	__entry->pref##_name_source[0] = (r)->names[0].source;	\
+	__entry->pref##_name_source[1] = (r)->names[1].source;	\
+	__entry->pref##_name_source[2] = (r)->names[2].source;	\
+	__entry->pref##_name_flags[0] = (r)->names[0].flags;	\
+	__entry->pref##_name_flags[1] = (r)->names[1].flags;	\
+	__entry->pref##_name_flags[2] = (r)->names[2].flags;	\
+	__entry->pref##_prio = (r)->prio;			\
+	__entry->pref##_op = (r)->op;
+
+#define SQR_ENTRY_ARGS(pref)							\
+	__entry->pref##_prio, __entry->pref##_name_val[0],			\
+	__entry->pref##_name_source[0], __entry->pref##_name_flags[0],		\
+	__entry->pref##_name_val[1], __entry->pref##_name_source[1],		\
+	__entry->pref##_name_flags[1], __entry->pref##_name_val[2],		\
+	__entry->pref##_name_source[2], __entry->pref##_name_flags[2],		\
+	__entry->pref##_op, __entry->pref##_limit
+
+TRACE_EVENT(scoutfs_quota_check,
+	TP_PROTO(struct super_block *sb, long rs_ptr, struct squota_input *inp, int ret),
+
+	TP_ARGS(sb, rs_ptr, inp, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(long, rs_ptr)
+		SQI_FIELDS(i)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->rs_ptr = rs_ptr;
+		SQI_ASSIGN(i, inp);
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" rs_ptr %ld ret %d inp "SQI_FMT,
+		SCSB_TRACE_ARGS, __entry->rs_ptr, __entry->ret, SQI_ENTRY_ARGS(i))
+);
+
+DECLARE_EVENT_CLASS(scoutfs_quota_rule_op_class,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+
+	TP_ARGS(sb, rule, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		SQR_FIELDS(r)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		SQR_ASSIGN(r, rule);
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" "SQR_FMT" ret %d",
+		SCSB_TRACE_ARGS, SQR_ENTRY_ARGS(r), __entry->ret)
+);
+DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_add_rule,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+	TP_ARGS(sb, rule, ret)
+);
+DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_del_rule,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+	TP_ARGS(sb, rule, ret)
+);
+
+TRACE_EVENT(scoutfs_quota_totl_check,
+	TP_PROTO(struct super_block *sb, struct squota_input *inp, struct scoutfs_key *key,
+		 u64 limit, int ret),
+
+	TP_ARGS(sb, inp, key, limit, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		SQI_FIELDS(i)
+		sk_trace_define(k)
+		__field(__u64, limit)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		SQI_ASSIGN(i, inp);
+		sk_trace_assign(k, key);
+		__entry->limit = limit;
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" inp "SQI_FMT" key "SK_FMT" limit %llu ret %d",
+		SCSB_TRACE_ARGS, SQI_ENTRY_ARGS(i), sk_trace_args(k), __entry->limit,
+		__entry->ret)
+);