From 38e6f11ee4e0f6fee4a3d5e8f113bfea930835ce Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Wed, 13 Sep 2023 09:30:23 -0700
Subject: [PATCH] Add quota support

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/Makefile        |    1 +
 kmod/src/data.c          |    4 +
 kmod/src/dir.c           |    5 +
 kmod/src/file.c          |    9 +
 kmod/src/format.h        |   37 ++
 kmod/src/ioctl.c         |   80 +++
 kmod/src/ioctl.h         |   69 +++
 kmod/src/lock.c          |   15 +
 kmod/src/lock.h          |    2 +
 kmod/src/quota.c         | 1261 ++++++++++++++++++++++++++++++++++++++
 kmod/src/quota.h         |   48 ++
 kmod/src/scoutfs_trace.h |    2 +
 kmod/src/super.c         |    3 +
 kmod/src/super.h         |    2 +
 kmod/src/trace/quota.h   |  143 +++++
 15 files changed, 1681 insertions(+)
 create mode 100644 kmod/src/quota.c
 create mode 100644 kmod/src/quota.h
 create mode 100644 kmod/src/trace/quota.h

diff --git a/kmod/src/Makefile b/kmod/src/Makefile
index ca1b657e..fa632aa1 100644
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -35,6 +35,7 @@ scoutfs-y +=			\
 	options.o		\
 	per_task.o		\
 	quorum.o		\
+	quota.o			\
 	recov.o			\
 	scoutfs_trace.o		\
 	server.o		\
diff --git a/kmod/src/data.c b/kmod/src/data.c
index 7d14c55a..dd10a06f 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -1110,6 +1110,10 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	while(iblock <= last) {
 
+		ret = scoutfs_quota_check_data(sb, inode);
+		if (ret)
+			goto out_extent;
+
 		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 		if (ret)
 			goto out_extent;
diff --git a/kmod/src/dir.c b/kmod/src/dir.c
index c8417bdf..ca23d3a6 100644
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -34,6 +34,7 @@
 #include "forest.h"
 #include "acl.h"
 #include "counters.h"
+#include "quota.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -651,6 +652,10 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 	if (ret)
 		goto out_unlock;
 
+	ret = scoutfs_quota_check_inode(sb, dir);
+	if (ret)
+		goto out_unlock;
+
 	if (orph_lock) {
 		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
 		if (ret < 0)
diff --git a/kmod/src/file.c b/kmod/src/file.c
index 3b6a2d12..d44cd822 100644
--- a/kmod/src/file.c
+++ b/kmod/src/file.c
@@ -28,6 +28,7 @@
 #include "inode.h"
 #include "per_task.h"
 #include "omap.h"
+#include "quota.h"
 
 #ifdef KC_LINUX_HAVE_FOP_AIO_READ
 /*
@@ -126,6 +127,10 @@ retry:
 			goto out;
 	}
 
+	ret = scoutfs_quota_check_data(sb, inode);
+	if (ret)
+		goto out;
+
 	/* XXX: remove SUID bit */
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
@@ -228,6 +233,10 @@ retry:
 	if (ret)
 		goto out;
 
+	ret = scoutfs_quota_check_data(sb, inode);
+	if (ret)
+		goto out;
+
 	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) {
 		/* data_version is per inode, whole file must be online */
 		ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode), SEF_OFFLINE,
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 7e283de8..47a3a7c9 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -13,6 +13,7 @@
 
 #define SCOUTFS_FORMAT_VERSION_FEAT_RETENTION	2
 #define SCOUTFS_FORMAT_VERSION_FEAT_PROJECT_ID	2
+#define SCOUTFS_FORMAT_VERSION_FEAT_QUOTA	2
 
 /* statfs(2) f_type */
 #define SCOUTFS_SUPER_MAGIC	0x554f4353		/* "SCOU" */
@@ -178,6 +179,10 @@ struct scoutfs_key {
 #define sko_rid		_sk_first
 #define sko_ino		_sk_second
 
+/* quota rules */
+#define skqr_hash	_sk_second
+#define skqr_coll_nr	_sk_third
+
 /* xattr totl */
 #define skxt_a		_sk_first
 #define skxt_b		_sk_second
@@ -588,6 +593,7 @@ struct scoutfs_log_merge_freeing {
  */
 #define SCOUTFS_INODE_INDEX_ZONE		4
 #define SCOUTFS_ORPHAN_ZONE			8
+#define SCOUTFS_QUOTA_ZONE			10
 #define SCOUTFS_XATTR_TOTL_ZONE			12
 #define SCOUTFS_FS_ZONE				16
 #define SCOUTFS_LOCK_ZONE			20
@@ -611,6 +617,9 @@ struct scoutfs_log_merge_freeing {
 /* orphan zone, redundant type used for clarity */
 #define SCOUTFS_ORPHAN_TYPE			4
 
+/* quota zone */
+#define SCOUTFS_QUOTA_RULE_TYPE			4
+
 /* fs zone */
 #define SCOUTFS_INODE_TYPE			4
 #define SCOUTFS_XATTR_TYPE			8
@@ -664,6 +673,34 @@ struct scoutfs_xattr_totl_val {
 	__le64 count;
 };
 
+#define SQ_RF_TOTL_COUNT	(1 << 0)
+#define SQ_RF__UNKNOWN	(~((1 << 1) - 1))
+
+#define SQ_NS_LITERAL		0
+#define SQ_NS_PROJ		1
+#define SQ_NS_UID		2
+#define SQ_NS_GID		3
+#define SQ_NS__NR		4
+#define SQ_NS__NR_SELECT	(SQ_NS__NR - 1) /* !literal */
+
+#define SQ_NF_SELECT	(1 << 0)
+#define SQ_NF__UNKNOWN	(~((1 << 1) - 1))
+
+#define SQ_OP_INODE	0
+#define SQ_OP_DATA	1
+#define SQ_OP__NR	2
+
+struct scoutfs_quota_rule_val {
+	__le64 name_val[3];
+	__le64 limit;
+	__u8 prio;
+	__u8 op;
+	__u8 rule_flags;
+	__u8 name_source[3];
+	__u8 name_flags[3];
+	__u8 _pad[7];
+};
+
 /* XXX does this exist upstream somewhere? */
 #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
 
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index c99580d8..ceadf91f 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -45,6 +45,7 @@
 #include "attr_x.h"
 #include "totl.h"
 #include "wkic.h"
+#include "quota.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -1387,6 +1388,79 @@ out:
 	return ret;
 }
 
+static long scoutfs_ioc_get_quota_rules(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_get_quota_rules __user *ugqr = (void __user *)arg;
+	struct scoutfs_ioctl_get_quota_rules gqr;
+	struct scoutfs_ioctl_quota_rule __user *uirules;
+	struct scoutfs_ioctl_quota_rule *irules;
+	struct page *page = NULL;
+	int copied = 0;
+	int nr;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&gqr, ugqr, sizeof(gqr)))
+		return -EFAULT;
+
+	if (gqr.rules_nr == 0)
+		return 0;
+
+	uirules = (void __user *)gqr.rules_ptr;
+	/* limit rules copied per call */
+	gqr.rules_nr = min_t(u64, gqr.rules_nr, INT_MAX);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	irules = page_address(page);
+
+	while (copied < gqr.rules_nr) {
+		nr = min_t(u64, gqr.rules_nr - copied,
+				PAGE_SIZE / sizeof(struct scoutfs_ioctl_quota_rule));
+		ret = scoutfs_quota_get_rules(sb, gqr.iterator, page_address(page), nr);
+		if (ret <= 0)
+			goto out;
+
+		if (copy_to_user(&uirules[copied], irules, ret * sizeof(irules[0]))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		copied += ret;
+	}
+
+	ret = 0;
+out:
+	if (page)
+		__free_page(page);
+
+	if (ret == 0 && copy_to_user(ugqr->iterator, gqr.iterator, sizeof(gqr.iterator)))
+		ret = -EFAULT;
+
+	return ret ?: copied;
+}
+
+static long scoutfs_ioc_mod_quota_rule(struct file *file, unsigned long arg, bool is_add)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_quota_rule __user *uirule = (void __user *)arg;
+	struct scoutfs_ioctl_quota_rule irule;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&irule, uirule, sizeof(irule)))
+		return -EFAULT;
+
+	return scoutfs_quota_mod_rule(sb, is_add, &irule);
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1428,6 +1502,12 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_get_attr_x(file, arg);
 	case SCOUTFS_IOC_SET_ATTR_X:
 		return scoutfs_ioc_set_attr_x(file, arg);
+	case SCOUTFS_IOC_GET_QUOTA_RULES:
+		return scoutfs_ioc_get_quota_rules(file, arg);
+	case SCOUTFS_IOC_ADD_QUOTA_RULE:
+		return scoutfs_ioc_mod_quota_rule(file, arg, true);
+	case SCOUTFS_IOC_DEL_QUOTA_RULE:
+		return scoutfs_ioc_mod_quota_rule(file, arg, false);
 	}
 
 	return -ENOTTY;
diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h
index 9a6b9b30..69cf05c3 100644
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -750,4 +750,73 @@ struct scoutfs_ioctl_inode_attr_x {
 #define SCOUTFS_IOC_SET_ATTR_X \
 	_IOW(SCOUTFS_IOCTL_MAGIC, 19, struct scoutfs_ioctl_inode_attr_x)
 
+/*
+ * (These fields are documented in the order that they're displayed by
+ * the scoutfs cli utility which matches the sort order of the rules.)
+ *
+ * @prio: The priority of the rule.  Rules are sorted by their fields
+ * with prio at the highest magnitude.  When multiple rules match the
+ * rule with the highest sort order is enforced.  The priority field
+ * lets rules override the default field sort order.
+ *
+ * @name_val[3]: The three 64bit values that make up the name of the
+ * totl xattr whose total will be checked against the rule's limit to
+ * see if the quota rule has been exceeded.  The behavior of the values
+ * can be changed by their corresponding name_source and name_flags.
+ *
+ * @name_source[3]: The SQ_NS_ enums that control where the value comes
+ * from.  _LITERAL uses the value from name_val.  Inode attribute
+ * sources (_PROJ, _UID, _GID) are taken from the inode of the operation
+ * that is being checked against the rule.
+ *
+ * @name_flags[3]: The SQ_NF_ enums that alter the name values.  _SELECT
+ * makes the rule only match if the inode attribute of the operation
+ * matches the attribute value stored in name_val.  This lets rules
+ * match a specific value of an attribute rather than mapping all
+ * attribute values to totl names.
+ *
+ * @op: The SQ_OP_ enums which specify the operation that can't exceed
+ * the rule's limit.  _INODE checks inode creation and the inode
+ * attributes are taken from the inode that would be created.  _DATA
+ * checks file data block allocation and the inode fields come from the
+ * inode that is allocating the blocks.
+ *
+ * @limit: The 64bit value that is checked against the totl value
+ * described by the rule.  If the totl value is greater than or equal to
+ * this value of the matching rule then the operation will return
+ * -EDQUOT.
+ *
+ * @rule_flags: SQ_RF_TOTL_COUNT indicates that the rule's limit should
+ * be checked against the number of xattrs contributing to a totl value
+ * instead of the sum of the xattrs.
+ */
+struct scoutfs_ioctl_quota_rule {
+	__u64 name_val[3];
+	__u64 limit;
+	__u8 prio;
+	__u8 op;
+	__u8 rule_flags;
+	__u8 name_source[3];
+	__u8 name_flags[3];
+	__u8 _pad[7];
+};
+
+struct scoutfs_ioctl_get_quota_rules {
+	__u64 iterator[2];
+	__u64 rules_ptr;
+	__u64 rules_nr;
+};
+
+/*
+ * Rules are uniquely identified by their non-padded fields.  Addition will fail
+ * with -EEXIST if the specified rule already exists and deletion must find a rule
+ * with all matching fields to delete.
+ */
+#define SCOUTFS_IOC_GET_QUOTA_RULES \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 20, struct scoutfs_ioctl_get_quota_rules)
+#define SCOUTFS_IOC_ADD_QUOTA_RULE \
+	_IOW(SCOUTFS_IOCTL_MAGIC, 21, struct scoutfs_ioctl_quota_rule)
+#define SCOUTFS_IOC_DEL_QUOTA_RULE \
+	_IOW(SCOUTFS_IOCTL_MAGIC, 22, struct scoutfs_ioctl_quota_rule)
+
 #endif
diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index b3c6b957..2398f3d9 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -37,6 +37,7 @@
 #include "omap.h"
 #include "util.h"
 #include "totl.h"
+#include "quota.h"
 
 /*
  * scoutfs uses a lock service to manage item cache consistency between
@@ -186,6 +187,9 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 			return ret;
 	}
 
+	if (lock->start.sk_zone == SCOUTFS_QUOTA_ZONE && !lock_mode_can_read(mode))
+		scoutfs_quota_invalidate(sb);
+
 	/* have to invalidate if we're not in the only usable case */
 	if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
 retry:
@@ -1250,6 +1254,17 @@ int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode,
 	return lock_key_range(sb, mode, flags, &start, &end, lock);
 }
 
+int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+		       struct scoutfs_lock **lock)
+{
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+
+	scoutfs_quota_get_lock_range(&start, &end);
+
+	return lock_key_range(sb, mode, flags, &start, &end, lock);
+}
+
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
diff --git a/kmod/src/lock.h b/kmod/src/lock.h
index 4ba50bcd..1b49e534 100644
--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -86,6 +86,8 @@ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int
 		        u64 ino, struct scoutfs_lock **lock);
 int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			    struct scoutfs_lock **lock);
+int scoutfs_lock_quota(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+		       struct scoutfs_lock **lock);
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
 		    enum scoutfs_lock_mode mode);
 
diff --git a/kmod/src/quota.c b/kmod/src/quota.c
new file mode 100644
index 00000000..c3e3839b
--- /dev/null
+++ b/kmod/src/quota.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (C) 2023 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/time.h>
+#include <linux/rhashtable.h>
+#include <linux/random.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+
+#include "format.h"
+#include "super.h"
+#include "lock.h"
+#include "hash.h"
+#include "inode.h"
+#include "item.h"
+#include "ioctl.h"
+#include "cmp.h"
+#include "wkic.h"
+#include "xattr.h"
+#include "totl.h"
+#include "util.h"
+#include "quota.h"
+#include "scoutfs_trace.h"
+
+/*
+ * scoutfs quotas let userspace manage accounting and rules which
+ * specify when operations should fail because a quota is exceeded.
+ *
+ * Userspace is responsible for managing the .totl. xattrs that
+ * accumulate counts and totals that can be checked to enforce quotas.
+ * Userspace then builds quota rules that map operations to totl names
+ * and limits.  This puts userspace entirely in control of the quota
+ * policy.
+ *
+ * The quota checks are specifically allowed to use slightly stale data
+ * to avoid global locking bottlenecks.
+ *
+ * Rules are stored as items in the main fs btree and are subject to
+ * strict cluster locking consistency.  After any change to the rules
+ * all of them are read in again and processed for checking.
+ *
+ * The .totl. xattrs are not read under cluster locking to avoid lock
+ * contention.  They're read using the weak item cache which expires
+ * only on a timeout.  This leads to a regular background load of weak
+ * reads of the item totls as they're updated at the frequency of the
+ * cache expiration.
+ */
+
+#define CACHE_AGE_MS		(5 * MSEC_PER_SEC)
+
+/*
+ * Rules are stored in trees whose nodes are keyed by their input
+ * matching criteria.  The trees are not modified once they're visible
+ * to readers.  RCU is used to free the trees once all the readers have
+ * finished.
+ */
+struct squota_ruleset {
+	struct rcu_head rcu;
+	struct rb_root roots[SQ_NS__NR_SELECT];
+	struct squota_rule *defaults[SQ_OP__NR];
+};
+
+struct squota_info {
+	struct super_block *sb;
+	struct squota_ruleset __rcu *ruleset; /* ENOENT, EINVAL, EBUSY, or valid ptr */
+	struct rhashtable check_ht;
+	atomic64_t nr_checks;
+
+	struct rw_semaphore rwsem;
+	spinlock_t lock;
+	wait_queue_head_t waitq;
+	KC_DEFINE_SHRINKER(shrinker);
+	struct dentry *drop_dentry;
+};
+
+#define DECLARE_QUOTA_INFO(sb, name) \
+	struct squota_info *name = SCOUTFS_SB(sb)->squota_info
+
+static inline int quota_unsupported(struct super_block *sb)
+{
+	return scoutfs_fmt_vers_unsupported(sb, SCOUTFS_FORMAT_VERSION_FEAT_QUOTA);
+}
+
+struct squota_check {
+	struct rcu_head rcu;
+	struct rhash_head head;
+	struct squota_input inp;
+	ktime_t expiration;
+	int result;
+};
+
+static const struct rhashtable_params check_ht_params = {
+	.key_len = member_sizeof(struct squota_check, inp),
+	.key_offset = offsetof(struct squota_check, inp),
+	.head_offset = offsetof(struct squota_check, head),
+};
+
+static bool get_cached_check(struct squota_info *qtinf, struct squota_input *inp, int *result)
+{
+	struct squota_check *chk;
+	bool got;
+
+	if (WARN_ON_ONCE(!rcu_read_lock_held()))
+		return false;
+
+	chk = rhashtable_lookup(&qtinf->check_ht, inp, check_ht_params);
+	if (chk && ktime_after(chk->expiration, ktime_get_raw())) {
+		*result = chk->result;
+		got = true;
+	} else {
+		*result = 0;
+		got = false;
+	}
+
+	return got;
+}
+
+/*
+ * Insert a new cached check.  If a cached check already exists it's
+ * either timed out or was inserted very recently so either can be used.
+ * We abandon the insertion attempt on other errors, including
+ * allocation failures and insertion failure from a pending hash table
+ * resize.
+ */
+static void insert_cached_check(struct squota_info *qtinf, struct squota_input *inp, int result)
+{
+	struct squota_check *found;
+	struct squota_check *chk;
+	int ret;
+
+	/* zero full size for hash table memcmp */
+	chk = kzalloc(sizeof(struct squota_check), GFP_NOFS);
+	if (!chk)
+		return;
+
+	chk->inp = *inp;
+	chk->expiration = ktime_add_ms(ktime_get_raw(), CACHE_AGE_MS);
+	chk->result = result;
+
+	while (chk) {
+		ret = rhashtable_lookup_insert_fast(&qtinf->check_ht, &chk->head,
+						    check_ht_params);
+		if (ret == 0) {
+			atomic64_inc(&qtinf->nr_checks);
+			chk = NULL;
+
+		} else if (ret == -EEXIST) {
+			/* try to free older insertion or existing */
+			rcu_read_lock();
+			found = rhashtable_lookup(&qtinf->check_ht, inp, check_ht_params);
+			if (found) {
+				if (ktime_before(found->expiration, chk->expiration)) {
+					if (rhashtable_remove_fast(&qtinf->check_ht,
+								   &found->head,
+								   check_ht_params) == 0) {
+						kfree_rcu(found, rcu);
+						atomic64_dec(&qtinf->nr_checks);
+					}
+				} else {
+					kfree(chk);
+					chk = NULL;
+				}
+			}
+			rcu_read_unlock();
+
+		} else {
+			kfree(chk);
+			chk = NULL;
+		}
+	}
+}
+
+/*
+ * Return a random cached check from the hash table.  We sweep the
+ * buckets from a random starting point and return the first we find,
+ * continuing from the next table if it's resizing.  This is sort of
+ * like the _walk_ api but we can set the starting point and it doesn't
+ * return -EAGAIN while resizing.
+ */
+static struct squota_check *lookup_random_check(struct rhashtable *rht)
+{
+	struct bucket_table *tbl;
+	struct squota_check *chk;
+	struct rhash_head *pos;
+	unsigned long s;
+	unsigned long i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	tbl = rht_dereference_rcu(rht->tbl, rht);
+	do {
+		for (s = 0, i = prandom_u32_max(tbl->size);
+		     s < tbl->size;
+		     s++, i = (i + 1) % tbl->size) {
+			rht_for_each_entry_rcu(chk, pos, tbl, i, head) {
+				return chk;
+			}
+		}
+	} while (!IS_ERR_OR_NULL((tbl = rht_dereference_rcu(tbl->future_tbl, rht))));
+
+	return NULL;
+}
+
+static unsigned long count_cached_checks(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct squota_info *qtinf = KC_SHRINKER_CONTAINER_OF(shrink, struct squota_info);
+
+	return shrinker_min_long(atomic64_read(&qtinf->nr_checks));
+}
+
+/*
+ * We don't bother with any precise replacement mechanism.  We choose
+ * cached check results to drop at random.  If the cache is large then
+ * random choices are unlikely to have been used again.  If the cache is
+ * small then any choices end up blowing away most of the cache.
+ */
+static unsigned long scan_cached_checks(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct squota_info *qtinf = KC_SHRINKER_CONTAINER_OF(shrink, struct squota_info);
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int retries = 10;
+	unsigned long freed = 0;
+	struct squota_check *chk;
+	int err;
+
+	rcu_read_lock();
+
+	while (nr > 0 && retries > 0 && (chk = lookup_random_check(&qtinf->check_ht))) {
+		err = rhashtable_remove_fast(&qtinf->check_ht, &chk->head, check_ht_params);
+		if (err) {
+			retries--;
+			continue;
+		}
+
+		kfree_rcu(chk, rcu);
+		atomic64_dec(&qtinf->nr_checks);
+		freed++;
+		nr--;
+	}
+
+	rcu_read_unlock();
+
+	if (retries == 0 && freed == 0)
+		freed = SHRINK_STOP;
+
+	return freed;
+}
+
+static void shrink_all_cached_checks(struct squota_info *qtinf)
+{
+	struct shrink_control sc = { .nr_to_scan = LONG_MAX, };
+
+	scan_cached_checks(KC_SHRINKER_FN(&qtinf->shrinker), &sc);
+}
+
+static u8 ns_is_attr(u8 ns)
+{
+	switch (ns) {
+	case SQ_NS_PROJ:
+	case SQ_NS_UID:
+	case SQ_NS_GID:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* rule validation has made sure these derefs are safe */
+static u8 ns_to_attr(u8 ns)
+{
+	static u8 ind[] = {
+		[SQ_NS_PROJ] = 0,
+		[SQ_NS_UID] = 1,
+		[SQ_NS_GID] = 2,
+	};
+
+	return ind[ns];
+}
+
+static void rule_to_rule_val(struct scoutfs_quota_rule_val *rv, struct squota_rule *rule)
+{
+	rv->limit = cpu_to_le64(rule->limit);
+	rv->prio = rule->prio;
+	rv->op = rule->op;
+	rv->rule_flags = rule->rule_flags;
+	rv->name_val[0] = cpu_to_le64(rule->names[0].val);
+	rv->name_source[0] = rule->names[0].source;
+	rv->name_flags[0] = rule->names[0].flags;
+	rv->name_val[1] = cpu_to_le64(rule->names[1].val);
+	rv->name_source[1] = rule->names[1].source;
+	rv->name_flags[1] = rule->names[1].flags;
+	rv->name_val[2] = cpu_to_le64(rule->names[2].val);
+	rv->name_source[2] = rule->names[2].source;
+	rv->name_flags[2] = rule->names[2].flags;
+	memset(&rv->_pad, 0, sizeof(rv->_pad));
+}
+
+static void rule_to_irule(struct scoutfs_ioctl_quota_rule *irule, struct squota_rule *rule)
+{
+	irule->limit = rule->limit;
+	irule->prio = rule->prio;
+	irule->op = rule->op;
+	irule->rule_flags = rule->rule_flags;
+	irule->name_val[0] = rule->names[0].val;
+	irule->name_source[0] = rule->names[0].source;
+	irule->name_flags[0] = rule->names[0].flags;
+	irule->name_val[1] = rule->names[1].val;
+	irule->name_source[1] = rule->names[1].source;
+	irule->name_flags[1] = rule->names[1].flags;
+	irule->name_val[2] = rule->names[2].val;
+	irule->name_source[2] = rule->names[2].source;
+	irule->name_flags[2] = rule->names[2].flags;
+	memset(&irule->_pad, 0, sizeof(irule->_pad));
+}
+
+/*
+ * Rules from untrusted ioctls/storage are only used if this returns true.
+ */
+static bool valid_rule(struct squota_rule *rule)
+{
+	struct squota_rule_name *other;
+	struct squota_rule_name *name;
+	int i;
+	int j;
+
+	/* op indexes SQ_OP__NR sized arrays, so _NR itself is invalid */
+	if (rule->op >= SQ_OP__NR)
+		return false;
+
+	if (rule->rule_flags & SQ_RF__UNKNOWN)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(rule->names); i++) {
+		name = &rule->names[i];
+
+		/* unknown name flags */
+		if (name->flags & SQ_NF__UNKNOWN)
+			return false;
+
+		if ((name->flags & SQ_NF_SELECT)) {
+			/* can only select sources that are inode attributes */
+			if (!ns_is_attr(name->source))
+				return false;
+
+			for (j = 0; j < ARRAY_SIZE(rule->names); j++) {
+				if (i == j)
+					continue;
+				other = &rule->names[j];
+
+				/* can't select different values of same attr */
+				if ((other->flags & SQ_NF_SELECT) &&
+				    name->source == other->source &&
+				    name->val != other->val) {
+					return false;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
+static int rule_val_to_rule(struct squota_rule *rule, struct scoutfs_quota_rule_val *rv,
+			    int bytes)
+{
+	if (bytes != sizeof(struct scoutfs_quota_rule_val))
+		return -EIO;
+
+	rule->limit = le64_to_cpu(rv->limit);
+	rule->prio = rv->prio;
+	rule->op = rv->op;
+	rule->rule_flags = rv->rule_flags;
+	rule->names[0].val = le64_to_cpu(rv->name_val[0]);
+	rule->names[0].source = rv->name_source[0];
+	rule->names[0].flags = rv->name_flags[0];
+	rule->names[1].val = le64_to_cpu(rv->name_val[1]);
+	rule->names[1].source = rv->name_source[1];
+	rule->names[1].flags = rv->name_flags[1];
+	rule->names[2].val = le64_to_cpu(rv->name_val[2]);
+	rule->names[2].source = rv->name_source[2];
+	rule->names[2].flags = rv->name_flags[2];
+
+	if (!valid_rule(rule))
+		return -EIO;
+
+	return 0;
+}
+
+static int irule_to_rule(struct squota_rule *rule, struct scoutfs_ioctl_quota_rule *irule)
+{
+	rule->limit = irule->limit;
+	rule->prio = irule->prio;
+	rule->op = irule->op;
+	rule->rule_flags = irule->rule_flags;
+	rule->names[0].val = irule->name_val[0];
+	rule->names[0].source = irule->name_source[0];
+	rule->names[0].flags = irule->name_flags[0];
+	rule->names[1].val = irule->name_val[1];
+	rule->names[1].source = irule->name_source[1];
+	rule->names[1].flags = irule->name_flags[1];
+	rule->names[2].val = irule->name_val[2];
+	rule->names[2].source = irule->name_source[2];
+	rule->names[2].flags = irule->name_flags[2];
+
+	if (!valid_rule(rule))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void init_rule_key(struct scoutfs_key *key, u64 hash, u64 coll_nr)
+{
+	*key = (struct scoutfs_key) {
+		.sk_zone = SCOUTFS_QUOTA_ZONE,
+		.sk_type = SCOUTFS_QUOTA_RULE_TYPE,
+		.skqr_hash = cpu_to_le64(hash),
+		.skqr_coll_nr = cpu_to_le64(coll_nr),
+	};
+}
+
+static void rule_to_key(struct scoutfs_key *key, struct squota_rule *rule)
+{
+	struct scoutfs_quota_rule_val rv;
+
+	rule_to_rule_val(&rv, rule);
+	init_rule_key(key, scoutfs_hash64(&rv, sizeof(rv)), 0);
+}
+
+/*
+ * Callers specifically want to increase keys by increasing the
+ * collision nr, not just incing the key.
+ */
+static void inc_coll_nr(struct scoutfs_key *key)
+{
+	le64_add_cpu(&key->skqr_coll_nr, 1);
+	if (key->skqr_coll_nr == 0)
+		le64_add_cpu(&key->skqr_hash, 1);
+}
+
+/*
+ * Rules have a defined sort order that determines matching priority
+ * when multiple rules match an input.
+ */
+static int cmp_rules(struct squota_rule *a, struct squota_rule *b)
+{
+	return scoutfs_cmp(a->prio, b->prio) ?:
+	       scoutfs_cmp(a->names[0].val, b->names[0].val) ?:
+	       scoutfs_cmp(a->names[0].source, b->names[0].source) ?:
+	       scoutfs_cmp(a->names[0].flags, b->names[0].flags) ?:
+	       scoutfs_cmp(a->names[1].val, b->names[1].val) ?:
+	       scoutfs_cmp(a->names[1].source, b->names[1].source) ?:
+	       scoutfs_cmp(a->names[1].flags, b->names[1].flags) ?:
+	       scoutfs_cmp(a->names[2].val, b->names[2].val) ?:
+	       scoutfs_cmp(a->names[2].source, b->names[2].source) ?:
+	       scoutfs_cmp(a->names[2].flags, b->names[2].flags) ?:
+	       scoutfs_cmp(a->op, b->op) ?:
+	       scoutfs_cmp(a->limit, b->limit) ?:
+	       scoutfs_cmp(a->rule_flags, b->rule_flags);
+}
+
+static struct squota_rule *name_to_rule(struct squota_rule_name *name)
+{
+	return container_of(name, struct squota_rule, names[name->i]);
+}
+
+static bool unlinked_rule(struct squota_rule *rule)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(rule->names); i++) {
+		if (!RB_EMPTY_NODE(&rule->names[i].node))
+			return false;
+	}
+
+	return true;
+}
+
+static void free_ruleset(struct squota_ruleset *rs)
+{
+	struct squota_rule_name *name;
+	struct squota_rule_name *name_;
+	struct squota_rule *rule;
+	int i;
+
+	if (!IS_ERR_OR_NULL(rs)) {
+		for (i = 0; i < ARRAY_SIZE(rs->roots); i++) {
+			rbtree_postorder_for_each_entry_safe(name, name_, &rs->roots[i], node) {
+				RB_CLEAR_NODE(&name->node);
+
+				rule = name_to_rule(name);
+				if (unlinked_rule(rule))
+					kfree(rule);
+			}
+		}
+
+		for (i = 0; i < ARRAY_SIZE(rs->defaults); i++)
+			kfree(rs->defaults[i]);
+
+		kfree(rs);
+	}
+}
+
+static void free_ruleset_rcu(struct rcu_head *rcu)
+{
+	struct squota_ruleset *rs = container_of(rcu, struct squota_ruleset, rcu);
+
+	free_ruleset(rs);
+}
+
+static bool empty_ruleset(struct squota_ruleset *rs)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(rs->roots); i++) {
+		if (!RB_EMPTY_ROOT(&rs->roots[i]))
+			return false;
+	}
+	for (i = 0; i < ARRAY_SIZE(rs->defaults); i++) {
+		if (rs->defaults[i])
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Walk a rule tree for a given matching attr.  Each tree only contains
+ * names which select on the tree's attr so we only have to compare each
+ * name's value, not its flags or source.
+ *
+ * The tree allows multiple names with a given val.  The first match is
+ * found and callers can iterate through all matches with _next.
+ */
+static struct squota_rule_name *walk_rule_tree(struct rb_root *root, u64 val,
+					       struct squota_rule_name *ins)
+{
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct squota_rule_name *found = NULL;
+	struct squota_rule_name *name;
+	int cmp;
+
+	while (*node) {
+		parent = *node;
+		name = container_of(*node, struct squota_rule_name, node);
+
+		cmp = scoutfs_cmp(name->val, val);
+		if (cmp < 0) {
+			node = &(*node)->rb_left;
+		} else if (cmp > 0) {
+			node = &(*node)->rb_right;
+		} else {
+			found = name;
+			node = &(*node)->rb_left;
+		}
+	}
+
+	if (ins) {
+		rb_link_node(&ins->node, parent, node);
+		rb_insert_color(&ins->node, root);
+	}
+
+	return found;
+}
+
+/*
+ * Return the next name in the ruleset attr tree that matches the val.
+ * All the nodes match this attribute, so we only have to compare the
+ * val.
+ */
+static struct squota_rule_name *next_val_name(struct squota_rule_name *name)
+{
+	struct squota_rule_name *next;
+	struct rb_node *node;
+
+	if (!name || RB_EMPTY_NODE(&name->node))
+		return NULL;
+
+	node = rb_next(&name->node);
+	if (node) {
+		next = container_of(node, struct squota_rule_name, node);
+		if (next->val == name->val)
+			return next;
+	}
+
+	return NULL;
+}
+
+static bool ruleset_is_busy(struct squota_info *qtinf)
+{
+	bool busy;
+
+	rcu_read_lock();
+	busy = rcu_dereference(qtinf->ruleset) == ERR_PTR(-EBUSY);
+	rcu_read_unlock();
+
+	return busy;
+}
+
+/*
+ * The caller found that we didn't have a valid ruleset and wants us to
+ * read in a new ruleset.
+ *
+ * We get exclusive access to the rules by marking the ruleset pointer
+ * busy, possibly waiting for someone else to finish if they beat us to
+ * it.  If we get exclusive access then we walk all the rule items and
+ * build up a rule set and publish it for use.
+ */
+static int read_ruleset(struct super_block *sb, struct squota_info *qtinf)
+{
+	struct scoutfs_lock *lock = NULL;
+	struct squota_ruleset *rs = NULL;
+	struct scoutfs_quota_rule_val rv;
+	struct squota_rule *rule = NULL;
+	struct squota_rule_name *name;
+	struct scoutfs_key key;
+	struct scoutfs_key end;
+	bool reading = false;
+	int ret;
+	int i;
+
+	ret = scoutfs_lock_quota(sb, SCOUTFS_LOCK_READ, 0, &lock);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&qtinf->lock);
+	rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock));
+	if (rs == ERR_PTR(-EINVAL)) {
+		rs = ERR_PTR(-EBUSY);
+		rcu_assign_pointer(qtinf->ruleset, rs);
+		reading = true;
+	}
+	spin_unlock(&qtinf->lock);
+
+	if (!reading) {
+		wait_event(qtinf->waitq, !ruleset_is_busy(qtinf));
+		ret = 0;
+		goto out;
+	}
+
+	rs = kzalloc(sizeof(struct squota_ruleset), GFP_NOFS);
+	if (!rs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rs->roots); i++)
+		rs->roots[i] = RB_ROOT;
+
+	init_rule_key(&key, 0, 0);
+	init_rule_key(&end, U64_MAX, U64_MAX);
+
+	for (;;) {
+		if (!rule) {
+			rule = kmalloc(sizeof(struct squota_rule), GFP_NOFS);
+			if (!rule) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		ret = scoutfs_item_next(sb, &key, &end, &rv, sizeof(rv), lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			goto out;
+		}
+
+		ret = rule_val_to_rule(rule, &rv, ret);
+		if (ret < 0)
+			goto out;
+
+		/* insert rule into attr tree if any of its names select */
+		for (i = 0; i < ARRAY_SIZE(rule->names); i++) {
+			name = &rule->names[i];
+			name->i = i;
+
+			if (name->flags & SQ_NF_SELECT) {
+				walk_rule_tree(&rs->roots[ns_to_attr(name->source)],
+					       name->val, name);
+			} else {
+				RB_CLEAR_NODE(&name->node);
+			}
+		}
+
+		if (!unlinked_rule(rule))
+			rule = NULL;
+
+		/* remember highest priority unlinked (default) rule */
+		if (rule &&
+		    (!rs->defaults[rule->op] || cmp_rules(rule, rs->defaults[rule->op]) > 0)) {
+			kfree(rs->defaults[rule->op]); /* don't leak the displaced default */
+			rs->defaults[rule->op] = rule;
+			rule = NULL;
+		}
+
+		inc_coll_nr(&key);
+	}
+
+out:
+	if (reading) {
+		if (ret == 0 && empty_ruleset(rs)) {
+			free_ruleset(rs);
+			rs = ERR_PTR(-ENOENT);
+		}
+
+		if (ret < 0) {
+			free_ruleset(rs);
+			rs = ERR_PTR(-EINVAL);
+		}
+
+		spin_lock(&qtinf->lock);
+		rcu_assign_pointer(qtinf->ruleset, rs);
+		spin_unlock(&qtinf->lock);
+		wake_up(&qtinf->waitq);
+	}
+
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+
+	kfree(rule);
+
+	return ret;
+}
+
+/*
+ * Decide whether a rule applies to the caller's input.  The ops have to
+ * be equal and every name that selects has to equal the input's attr
+ * for its source -- names that don't select never reject the input.
+ */
+static bool rule_matches(struct squota_input *inp, struct squota_rule *rule)
+{
+	struct squota_rule_name *nm;
+	int n;
+
+	if (rule->op != inp->op)
+		return false;
+
+	for (n = 0; n < ARRAY_SIZE(rule->names); n++) {
+		nm = &rule->names[n];
+		if (!(nm->flags & SQ_NF_SELECT))
+			continue;
+		if (nm->val != inp->attrs[ns_to_attr(nm->source)])
+			return false;
+	}
+
+	return true;
+}
+
+struct squota_totl_check {
+	u64 totl[3];	/* totl xattr name values filled in by check_rules() */
+	u64 limit;	/* the matched rule's limit */
+	u8 rule_flags;	/* the matched rule's flags, tested for SQ_RF_TOTL_COUNT */
+};
+
+/*
+ * Find the rule that governs the caller's inputs.  The candidate starts
+ * as the op's highest priority default rule and is replaced by any
+ * higher priority matching rule that selects one of the input's attrs.
+ *
+ * Returns true after filling tc with the totl name, limit, and rule
+ * flags to check when a rule was found, false when no rule applies.
+ */
+static bool check_rules(struct squota_ruleset *rs, struct squota_input *inp,
+			struct squota_totl_check *tc)
+{
+	struct squota_rule_name *nm;
+	struct squota_rule *best;
+	struct squota_rule *cand;
+	int i;
+
+	if (WARN_ON_ONCE(!rcu_read_lock_held()))
+		return false;
+
+	best = rs->defaults[inp->op];
+
+	for (i = 0; i < SQ_NS__NR_SELECT; i++) {
+		for (nm = walk_rule_tree(&rs->roots[i], inp->attrs[i], NULL); nm;
+		     nm = next_val_name(nm)) {
+			cand = name_to_rule(nm);
+			if (!rule_matches(inp, cand))
+				continue;
+			if (!best || cmp_rules(cand, best) > 0)
+				best = cand;
+		}
+	}
+
+	if (!best)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(best->names); i++) {
+		nm = &best->names[i];
+
+		/* LITERAL is only non-attr source */
+		if (!ns_is_attr(nm->source))
+			tc->totl[i] = nm->val;
+		else
+			tc->totl[i] = inp->attrs[ns_to_attr(nm->source)];
+	}
+
+	tc->limit = best->limit;
+	tc->rule_flags = best->rule_flags;
+	return true;
+}
+
+static int check_totl_cb(struct scoutfs_key *key, void *val, unsigned int val_len, void *cb_arg)
+{
+	struct squota_totl_check *check = cb_arg;
+	struct scoutfs_xattr_totl_val *totl = val;
+	u64 used;
+
+	if (val_len != sizeof(*totl))
+		return -EIO;
+
+	/* the matched rule's flags choose between the item's count and total */
+	used = (check->rule_flags & SQ_RF_TOTL_COUNT) ? le64_to_cpu(totl->count) :
+							le64_to_cpu(totl->total);
+	if (used >= check->limit)
+		return -EDQUOT;
+	return 0;
+}
+
+/*
+ * Check that the operation described by the inputs can be performed.
+ * The rules are protected by cluster locking and re-read any time the
+ * lock is revoked.  The xattr totl items are read from the weak item
+ * cache and can be a little out of date.  Results of 0 and -EDQUOT are
+ * cached, so a cached result can be returned even when the current
+ * persistent items would produce a different one.
+ */
+static int check_inputs(struct super_block *sb, struct squota_input *inp)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs = NULL;
+	struct scoutfs_key range_start;
+	struct scoutfs_key range_end;
+	struct scoutfs_key key;
+	struct squota_totl_check tc;
+	bool found;
+	int ret;
+
+	rcu_read_lock();
+
+	/* quick fast path check when there are no quota rules */
+	rs = rcu_dereference(qtinf->ruleset);
+	if (rs == ERR_PTR(-ENOENT)) {
+		rcu_read_unlock();
+		ret = 0;
+		goto out;
+	}
+
+	/* see if we have a cached check result */
+	if (get_cached_check(qtinf, inp, &ret)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	/* get the current ruleset, blocking to lock+read if we need to read items */
+	while ((rs = rcu_dereference(qtinf->ruleset)),
+	       (rs == ERR_PTR(-EINVAL) || rs == ERR_PTR(-EBUSY))) {
+		rcu_read_unlock();
+
+		ret = read_ruleset(sb, qtinf);
+		if (ret < 0)
+			goto out;
+
+		rcu_read_lock();
+	}
+
+	/* see if we have a matching rule for our inputs */
+	if (!IS_ERR(rs))
+		found = check_rules(rs, inp, &tc);
+	else
+		found = false; /* only -ENOENT gets past the loop: there are no rules */
+
+	rcu_read_unlock();
+
+	/* check if the totl limit was exceeded if we found a rule */
+	if (found) {
+		scoutfs_totl_set_range(&range_start, &range_end);
+		scoutfs_xattr_init_totl_key(&key, tc.totl);
+
+		ret = scoutfs_wkic_iterate(sb, &key, &key, &range_start, &range_end,
+					   check_totl_cb, &tc);
+
+		trace_scoutfs_quota_totl_check(sb, inp, &key, tc.limit, ret);
+	} else {
+		ret = 0;
+	}
+	/* only definitive results are cached, other errors are not */
+	if (ret == 0 || ret == -EDQUOT)
+		insert_cached_check(qtinf, inp, ret);
+out:
+	trace_scoutfs_quota_check(sb, (long)rs, inp, ret);
+	return ret;
+}
+
+static void init_inp(struct squota_input *inp, u64 proj, u32 uid, u32 gid, u8 op)
+{
+	/* the hash table compares inputs with memcmp so define every byte */
+	memset(inp, 0, sizeof(*inp));
+
+	inp->op = op;
+	inp->attrs[ns_to_attr(SQ_NS_GID)] = gid;
+	inp->attrs[ns_to_attr(SQ_NS_UID)] = uid;
+	inp->attrs[ns_to_attr(SQ_NS_PROJ)] = proj;
+}
+
+/*
+ * Check creating an inode in dir.  The [ug]id selection mirrors
+ * init_inode_owner() rather than calling it, as it wants a live inode
+ * which our cluster lock and transaction layering makes awkward.
+ */
+int scoutfs_quota_check_inode(struct super_block *sb, struct inode *dir)
+{
+	struct squota_input inp;
+	u32 gid;
+
+	BUILD_BUG_ON(max(sizeof(uid_t), sizeof(gid_t)) > sizeof(u32));
+
+	if (quota_unsupported(sb))
+		return 0;
+
+	gid = (dir->i_mode & S_ISGID) ? i_gid_read(dir) :
+					from_kgid(&init_user_ns, current_fsgid());
+	init_inp(&inp, scoutfs_inode_get_proj(dir), from_kuid(&init_user_ns, current_fsuid()),
+		 gid, SQ_OP_INODE);
+	return check_inputs(sb, &inp);
+}
+
+int scoutfs_quota_check_data(struct super_block *sb, struct inode *inode)
+{
+	struct squota_input inp;
+
+	/* data ops are checked against the inode's own project, uid, and gid */
+	if (!quota_unsupported(sb)) {
+		init_inp(&inp, scoutfs_inode_get_proj(inode), i_uid_read(inode),
+			 i_gid_read(inode), SQ_OP_DATA);
+		return check_inputs(sb, &inp);
+	}
+	return 0;
+}
+
+/*
+ * Read up to nr rules starting from the iterator position into irules.
+ * The iterator is advanced past each rule copied so the caller can
+ * continue from it.  Returns the number copied or -errno on error.
+ */
+int scoutfs_quota_get_rules(struct super_block *sb, u64 *iterator,
+			    struct scoutfs_ioctl_quota_rule *irules, int nr)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct scoutfs_quota_rule_val rv;
+	struct scoutfs_lock *lock = NULL;
+	struct squota_rule rule;
+	struct scoutfs_key key;
+	struct scoutfs_key end;
+	int copied = 0;
+	int ret = 0;
+
+	if ((ret = quota_unsupported(sb)))
+		return ret;
+
+	if (nr == 0)
+		goto out;
+
+	ret = scoutfs_lock_quota(sb, SCOUTFS_LOCK_READ, 0, &lock);
+	if (ret < 0)
+		goto out;
+
+	down_read(&qtinf->rwsem);
+
+	init_rule_key(&key, iterator[0], iterator[1]);
+	init_rule_key(&end, U64_MAX, U64_MAX);
+
+	while (copied < nr) {
+		ret = scoutfs_item_next(sb, &key, &end, &rv, sizeof(rv), lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0; /* ran out of rule items, not an error */
+			break;
+		}
+
+		ret = rule_val_to_rule(&rule, &rv, ret);
+		if (ret < 0)
+			break;
+
+		rule_to_irule(&irules[copied], &rule);
+		copied++;
+		/* record the position after this rule for the next call */
+		inc_coll_nr(&key);
+		iterator[0] = le64_to_cpu(key.skqr_hash);
+		iterator[1] = le64_to_cpu(key.skqr_coll_nr);
+	}
+
+	up_read(&qtinf->rwsem);
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+out:
+	return ret ?: copied; /* an error takes precedence over the copied count */
+}
+
+/*
+ * Look for an existing item that holds the caller's rule by walking the
+ * items that share the rule's hash in collision nr order.  key_ret is
+ * set to the key of the rule we found or to the next unused collision
+ * nr.  Returns 0 if found, -ENOENT if not, and -errno for errors.
+ */
+static int find_rule(struct super_block *sb, struct squota_rule *rule, struct scoutfs_key *key_ret,
+		     struct scoutfs_lock *lock)
+{
+	struct scoutfs_quota_rule_val rv;
+	struct scoutfs_key last;
+	struct scoutfs_key pos;
+	struct squota_rule cur;
+	int ret;
+
+	/* search from the rule's key through the largest collision nr */
+	rule_to_key(&pos, rule);
+	last = pos;
+	last.skqr_coll_nr = cpu_to_le64(U64_MAX);
+
+	while ((ret = scoutfs_item_next(sb, &pos, &last, &rv, sizeof(rv), lock)) >= 0) {
+		ret = rule_val_to_rule(&cur, &rv, ret);
+		if (ret != 0)
+			break;
+
+		/* ret is already 0 for the caller when the rules are equal */
+		if (cmp_rules(&cur, rule) == 0)
+			break;
+
+		/* a different rule has this position, try the next collision nr */
+		inc_coll_nr(&pos);
+	}
+
+	/*
+	 * pos is the matching item's key or is past the last item examined
+	 */
+	*key_ret = pos;
+	return ret;
+}
+
+/*
+ * Add or delete a rule.  This only operates on the persistent items.
+ * It holds a write cluster lock so it invalidates all other rules used
+ * by other nodes and also marks the local rules invalid.  The next
+ * enforcement everywhere will re-read and process the full rule set.
+ * Adding an existing rule returns -EEXIST and deleting a missing rule
+ * returns -ENOENT.  Modification is expensive but should be rare.
+ */
+int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
+			   struct scoutfs_ioctl_quota_rule *irule)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct scoutfs_quota_rule_val rv;
+	struct scoutfs_lock *lock = NULL;
+	struct squota_rule rule = {0,}; /* traced at out even when irule_to_rule() fails */
+	struct scoutfs_key key;
+	int ret;
+
+	if ((ret = quota_unsupported(sb)))
+		return ret;
+
+	ret = irule_to_rule(&rule, irule);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_lock_quota(sb, SCOUTFS_LOCK_WRITE, 0, &lock);
+	if (ret < 0)
+		goto out;
+
+	down_write(&qtinf->rwsem);
+
+	if (is_add) {
+		ret = find_rule(sb, &rule, &key, lock);
+		if (ret == -ENOENT)
+			ret = 0;
+		else if (ret == 0)
+			ret = -EEXIST;
+		if (ret < 0)
+			goto unlock;
+		/* find_rule left key at the next unused collision nr */
+		rule_to_rule_val(&rv, &rule);
+		ret = scoutfs_item_create(sb, &key, &rv, sizeof(rv), lock);
+		if (ret < 0)
+			goto unlock;
+
+	} else {
+		ret = find_rule(sb, &rule, &key, lock) ?:
+		      scoutfs_item_delete(sb, &key, lock);
+		if (ret < 0)
+			goto unlock;
+	}
+
+	scoutfs_quota_invalidate(sb);
+	ret = 0;
+
+unlock:
+	up_write(&qtinf->rwsem);
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+
+out:
+	if (is_add)
+		trace_scoutfs_quota_add_rule(sb, &rule, ret);
+	else
+		trace_scoutfs_quota_del_rule(sb, &rule, ret);
+
+	return ret;
+}
+
+void scoutfs_quota_get_lock_range(struct scoutfs_key *start, struct scoutfs_key *end)
+{
+	/* span from the all-zeros key to the all-ones key within the quota zone */
+	scoutfs_key_set_zeros(start);
+	scoutfs_key_set_ones(end);
+
+	start->sk_zone = end->sk_zone = SCOUTFS_QUOTA_ZONE;
+}
+
+/*
+ * Called during cluster lock invalidation, and by scoutfs_quota_mod_rule()
+ * while it holds the write lock, to indicate that the ruleset might have
+ * been modified.  We mark the ruleset invalid, free the old one once all
+ * RCU readers drain, and shrink all the cached check results.  The next
+ * check will acquire the cluster lock and read the rules.  -EBUSY is only
+ * set by readers under a read cluster lock so we should never see it here.
+ */
+void scoutfs_quota_invalidate(struct super_block *sb)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs;
+
+	if (quota_unsupported(sb))
+		return;
+
+	rcu_read_lock();
+
+	spin_lock(&qtinf->lock);
+	rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock));
+	if (rs != ERR_PTR(-EINVAL))
+		rcu_assign_pointer(qtinf->ruleset, ERR_PTR(-EINVAL));
+	spin_unlock(&qtinf->lock);
+
+	/* cluster locking should have prevented this */
+	BUG_ON(rs == ERR_PTR(-EBUSY));
+	/* a real ruleset was just unpublished, free it after a grace period */
+	if (!IS_ERR(rs))
+		call_rcu(&rs->rcu, free_ruleset_rcu);
+
+	rcu_read_unlock();
+	/* cached check results were produced with the rules we just invalidated */
+	shrink_all_cached_checks(qtinf);
+}
+
+static ssize_t quota_drop_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	return 0; /* there is nothing to read, every read returns 0 bytes */
+}
+
+static ssize_t quota_drop_write(struct file *file, const char __user *buf, size_t size,
+				loff_t *ppos)
+{
+	/* the written bytes are never read, any write shrinks all cached checks */
+	shrink_all_cached_checks(file_inode(file)->i_private);
+
+	/* report the whole write as consumed */
+	return size;
+}
+
+static const struct file_operations quota_drop_fops = {
+	.read =		quota_drop_read, /* always returns 0 bytes */
+	.write =	quota_drop_write, /* shrinks all cached checks */
+};
+
+int scoutfs_quota_setup(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct squota_info *qtinf = NULL;
+	int ret;
+
+	if (quota_unsupported(sb))
+		return 0;
+
+	qtinf = kzalloc(sizeof(struct squota_info), GFP_KERNEL);
+	if (!qtinf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* finish initializing before debugfs or the shrinker can reach qtinf */
+	qtinf->sb = sb;
+	RCU_INIT_POINTER(qtinf->ruleset, ERR_PTR(-EINVAL));
+	atomic64_set(&qtinf->nr_checks, 0);
+	init_rwsem(&qtinf->rwsem);
+	spin_lock_init(&qtinf->lock);
+	init_waitqueue_head(&qtinf->waitq);
+
+	ret = rhashtable_init(&qtinf->check_ht, &check_ht_params);
+	if (ret < 0) {
+		kfree(qtinf);
+		goto out;
+	}
+
+	qtinf->drop_dentry = debugfs_create_file("drop_quota_check_cache", S_IFREG|S_IRUSR,
+						sbi->debug_root, qtinf, &quota_drop_fops);
+	if (!qtinf->drop_dentry) {
+		rhashtable_destroy(&qtinf->check_ht);
+		kfree(qtinf);
+		return -ENOMEM;
+	}
+
+	KC_INIT_SHRINKER_FUNCS(&qtinf->shrinker, count_cached_checks, scan_cached_checks);
+	KC_REGISTER_SHRINKER(&qtinf->shrinker);
+
+	sbi->squota_info = qtinf;
+	ret = 0;
+out:
+	return ret;
+}
+
+/* rhashtable_free_and_destroy() callback, arg is unused */
+static void free_cached_check(void *ptr, void *arg)
+{
+	/* ptr is a struct squota_check that was still in the table */
+	kfree(ptr);
+}
+
+void scoutfs_quota_destroy(struct super_block *sb)
+{
+	DECLARE_QUOTA_INFO(sb, qtinf);
+	struct squota_ruleset *rs;
+
+	/* nothing was set up, so there is nothing to tear down */
+	if (!qtinf)
+		return;
+
+	debugfs_remove(qtinf->drop_dentry);
+	KC_UNREGISTER_SHRINKER(&qtinf->shrinker);
+
+	spin_lock(&qtinf->lock);
+	rs = rcu_dereference_protected(qtinf->ruleset, lockdep_is_held(&qtinf->lock));
+	spin_unlock(&qtinf->lock);
+	if (!IS_ERR(rs))
+		free_ruleset(rs);
+
+	rhashtable_free_and_destroy(&qtinf->check_ht, free_cached_check, NULL);
+	kfree(qtinf);
+	SCOUTFS_SB(sb)->squota_info = NULL;
+}
diff --git a/kmod/src/quota.h b/kmod/src/quota.h
new file mode 100644
index 00000000..324d8a36
--- /dev/null
+++ b/kmod/src/quota.h
@@ -0,0 +1,48 @@
+#ifndef _SCOUTFS_QUOTA_H_
+#define _SCOUTFS_QUOTA_H_
+
+#include "ioctl.h"
+
+/*
+ * Each rule's name can be in the ruleset's rbtree associated with the
+ * source attr that it selects.  This lets checks only test rules that
+ * the inputs could match.  The 'i' field indicates which name is in the
+ * tree so we can find the containing rule.
+ *
+ * This is mostly private to quota.c but we expose it for tracing.
+ */
+struct squota_rule {
+	u64 limit;		/* totl use at or above this fails the check with -EDQUOT */
+	u8 prio;		/* NOTE(review): presumably ordered by cmp_rules() -- confirm */
+	u8 op;			/* has to equal the input's op for the rule to match */
+	u8 rule_flags;		/* SQ_RF_TOTL_COUNT checks the totl count, not the total */
+	struct squota_rule_name {
+		struct rb_node node;	/* cleared when the name doesn't select */
+		u64 val;		/* value to select on, or the literal totl name value */
+		u8 source;		/* SQ_NS_* source of the name's value */
+		u8 flags;		/* SQ_NF_SELECT when the name selects inputs */
+		u8 i;			/* index of this name in names[] */
+	} names[3];
+};
+
+/* private to quota.c, only here for tracing */
+struct squota_input {
+	u64 attrs[SQ_NS__NR_SELECT];	/* indexed by ns_to_attr() of SQ_NS_PROJ, _UID, _GID */
+	u8 op;				/* SQ_OP_INODE or SQ_OP_DATA */
+};
+
+int scoutfs_quota_check_inode(struct super_block *sb, struct inode *dir);
+int scoutfs_quota_check_data(struct super_block *sb, struct inode *inode);
+
+int scoutfs_quota_get_rules(struct super_block *sb, u64 *iterator,
+			    struct scoutfs_ioctl_quota_rule *irules, int nr);
+int scoutfs_quota_mod_rule(struct super_block *sb, bool is_add,
+			   struct scoutfs_ioctl_quota_rule *irule);
+
+void scoutfs_quota_get_lock_range(struct scoutfs_key *start, struct scoutfs_key *end);
+void scoutfs_quota_invalidate(struct super_block *sb);
+
+int scoutfs_quota_setup(struct super_block *sb);
+void scoutfs_quota_destroy(struct super_block *sb);
+
+#endif
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index da75b03b..3fd4821f 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -37,7 +37,9 @@
 #include "net.h"
 #include "data.h"
 #include "ext.h"
+#include "quota.h"
 
+#include "trace/quota.h"
 #include "trace/wkic.h"
 
 struct lock_info;
diff --git a/kmod/src/super.c b/kmod/src/super.c
index f38af5ce..0086d7fb 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -50,6 +50,7 @@
 #include "fence.h"
 #include "xattr.h"
 #include "wkic.h"
+#include "quota.h"
 #include "scoutfs_trace.h"
 
 static struct dentry *scoutfs_debugfs_root;
@@ -195,6 +196,7 @@ static void scoutfs_put_super(struct super_block *sb)
 	scoutfs_shutdown_trans(sb);
 	scoutfs_volopt_destroy(sb);
 	scoutfs_client_destroy(sb);
+	scoutfs_quota_destroy(sb);
 	scoutfs_inode_destroy(sb);
 	scoutfs_wkic_destroy(sb);
 	scoutfs_item_destroy(sb);
@@ -548,6 +550,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_item_setup(sb) ?:
 	      scoutfs_wkic_setup(sb) ?:
 	      scoutfs_inode_setup(sb) ?:
+	      scoutfs_quota_setup(sb) ?:
 	      scoutfs_data_setup(sb) ?:
 	      scoutfs_setup_trans(sb) ?:
 	      scoutfs_omap_setup(sb) ?:
diff --git a/kmod/src/super.h b/kmod/src/super.h
index bdbc1b81..03c6a6ea 100644
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -31,6 +31,7 @@ struct omap_info;
 struct volopt_info;
 struct fence_info;
 struct wkic_info;
+struct squota_info;
 
 struct scoutfs_sb_info {
 	struct super_block *sb;
@@ -57,6 +58,7 @@ struct scoutfs_sb_info {
 	struct volopt_info *volopt_info;
 	struct item_cache_info *item_cache_info;
 	struct wkic_info *wkic_info;
+	struct squota_info *squota_info;
 	struct fence_info *fence_info;
 
 	/* tracks tasks waiting for data extents */
diff --git a/kmod/src/trace/quota.h b/kmod/src/trace/quota.h
new file mode 100644
index 00000000..983b318c
--- /dev/null
+++ b/kmod/src/trace/quota.h
@@ -0,0 +1,143 @@
+
+/*
+ * Tracing squota_input, printed as [op attrs[0] attrs[1] attrs[2]]
+ */
+#define SQI_FMT "[%u %llu %llu %llu]"
+
+#define SQI_ARGS(i)						\
+	(i)->op, (i)->attrs[0], (i)->attrs[1], (i)->attrs[2]
+
+#define SQI_FIELDS(pref)					\
+	__array(__u64, pref##_attrs, SQ_NS__NR_SELECT)		\
+	__field(__u8, pref##_op)
+
+#define SQI_ASSIGN(pref, i)					\
+	__entry->pref##_attrs[0] = (i)->attrs[0];		\
+	__entry->pref##_attrs[1] = (i)->attrs[1];		\
+	__entry->pref##_attrs[2] = (i)->attrs[2];		\
+	__entry->pref##_op = (i)->op;
+
+#define SQI_ENTRY_ARGS(pref)					\
+	__entry->pref##_op, __entry->pref##_attrs[0],		\
+	__entry->pref##_attrs[1], __entry->pref##_attrs[2]
+
+/*
+ * Tracing squota_rule, SQR_ARGS() formats a struct squota_rule with SQR_FMT
+ */
+#define SQR_FMT "[%u %llu,%u,%x %llu,%u,%x %llu,%u,%x %u %llu]"
+
+#define SQR_ARGS(r)							\
+	(r)->prio,							\
+	(r)->names[0].val, (r)->names[0].source, (r)->names[0].flags,	\
+	(r)->names[1].val, (r)->names[1].source, (r)->names[1].flags,	\
+	(r)->names[2].val, (r)->names[2].source, (r)->names[2].flags,	\
+	(r)->op, (r)->limit
+
+#define SQR_FIELDS(pref)			\
+	__array(__u64, pref##_name_val, 3)	\
+	__field(__u64, pref##_limit)		\
+	__array(__u8, pref##_name_source, 3)	\
+	__array(__u8, pref##_name_flags, 3)	\
+	__field(__u8, pref##_prio)		\
+	__field(__u8, pref##_op)
+
+#define SQR_ASSIGN(pref, r)					\
+	__entry->pref##_name_val[0] = (r)->names[0].val;	\
+	__entry->pref##_name_val[1] = (r)->names[1].val;	\
+	__entry->pref##_name_val[2] = (r)->names[2].val;	\
+	__entry->pref##_limit = (r)->limit;			\
+	__entry->pref##_name_source[0] = (r)->names[0].source;	\
+	__entry->pref##_name_source[1] = (r)->names[1].source;	\
+	__entry->pref##_name_source[2] = (r)->names[2].source;	\
+	__entry->pref##_name_flags[0] = (r)->names[0].flags;	\
+	__entry->pref##_name_flags[1] = (r)->names[1].flags;	\
+	__entry->pref##_name_flags[2] = (r)->names[2].flags;	\
+	__entry->pref##_prio = (r)->prio;			\
+	__entry->pref##_op = (r)->op;
+
+#define SQR_ENTRY_ARGS(pref)						\
+	__entry->pref##_prio, __entry->pref##_name_val[0],		\
+	__entry->pref##_name_source[0], __entry->pref##_name_flags[0],	\
+	__entry->pref##_name_val[1], __entry->pref##_name_source[1],	\
+	__entry->pref##_name_flags[1], __entry->pref##_name_val[2],	\
+	__entry->pref##_name_source[2], __entry->pref##_name_flags[2],	\
+	__entry->pref##_op, __entry->pref##_limit
+
+TRACE_EVENT(scoutfs_quota_check,
+	TP_PROTO(struct super_block *sb, long rs_ptr, struct squota_input *inp, int ret),
+	/* rs_ptr is the ruleset pointer cast to long and can hold an ERR_PTR value */
+	TP_ARGS(sb, rs_ptr, inp, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(long, rs_ptr)
+		SQI_FIELDS(i)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->rs_ptr = rs_ptr;
+		SQI_ASSIGN(i, inp);
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" rs_ptr %ld ret %d inp "SQI_FMT,
+		  SCSB_TRACE_ARGS, __entry->rs_ptr, __entry->ret, SQI_ENTRY_ARGS(i))
+);
+
+DECLARE_EVENT_CLASS(scoutfs_quota_rule_op_class,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+	/* shared by the add and del rule events: the rule and the op's result */
+	TP_ARGS(sb, rule, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		SQR_FIELDS(r)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		SQR_ASSIGN(r, rule);
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" "SQR_FMT" ret %d",
+		  SCSB_TRACE_ARGS, SQR_ENTRY_ARGS(r), __entry->ret)
+);
+DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_add_rule,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+	TP_ARGS(sb, rule, ret)
+);
+DEFINE_EVENT(scoutfs_quota_rule_op_class, scoutfs_quota_del_rule,
+	TP_PROTO(struct super_block *sb, struct squota_rule *rule, int ret),
+	TP_ARGS(sb, rule, ret)
+);
+
+TRACE_EVENT(scoutfs_quota_totl_check,
+	TP_PROTO(struct super_block *sb, struct squota_input *inp, struct scoutfs_key *key,
+		 u64 limit, int ret),
+	/* key is the totl item key that was checked against limit */
+	TP_ARGS(sb, inp, key, limit, ret),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		SQI_FIELDS(i)
+		sk_trace_define(k)
+		__field(__u64, limit)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		SQI_ASSIGN(i, inp);
+		sk_trace_assign(k, key);
+		__entry->limit = limit;
+		__entry->ret = ret;
+	),
+
+	TP_printk(SCSBF" inp "SQI_FMT" key "SK_FMT" limit %llu ret %d",
+		  SCSB_TRACE_ARGS, SQI_ENTRY_ARGS(i), sk_trace_args(k), __entry->limit,
+		  __entry->ret)
+);