Compare commits

...

4 Commits

Author SHA1 Message Date
Zach Brown
9cdfd2cdf8 (wip) add some quick lock server stats 2021-05-25 16:35:30 -07:00
Zach Brown
f2c9f66385 Align first free ino to lock group
Currently the first inode number that can be allocated directly follows
the root inode.  This means the first batch of allocated inodes is in
the same lock group as the root inode.

The root inode is a bit special.  It is always hot as absolute path
lookups and inode-to-path resolution always read directory entries from
the root.

Let's try aligning the first free inode number to the next inode lock
group boundary.  This way work on those inodes doesn't necessarily
conflict with work on the root inode.

Signed-off-by: Zach Brown <zab@versity.com>
2021-05-24 13:23:46 -07:00
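A quick sketch of the alignment described above, as a standalone C program rather than scoutfs code: it assumes inodes map to lock groups in runs of SCOUTFS_LOCK_INODE_GROUP_NR consecutive numbers (as the defines in the diff below suggest) and assumes a root inode number of 1 purely for illustration. Rounding the first free inode up to the next group boundary moves it out of the root inode's group.

/* Standalone sketch, not scoutfs code: rounding the first free inode
 * number up to a lock group boundary separates it from the root
 * inode's group.  The root inode number of 1 is an assumption made
 * only for this example. */
#include <stdio.h>
#include <stdint.h>

#define SCOUTFS_LOCK_INODE_GROUP_NR	128ULL
#define SCOUTFS_LOCK_INODE_GROUP_MASK	(SCOUTFS_LOCK_INODE_GROUP_NR - 1)

/* hypothetical helper: which lock group an inode number falls in */
static uint64_t ino_lock_group(uint64_t ino)
{
	return ino / SCOUTFS_LOCK_INODE_GROUP_NR;
}

int main(void)
{
	uint64_t root_ino = 1;			/* assumed for illustration */
	uint64_t old_first = root_ino + 1;	/* directly follows the root */
	uint64_t new_first = (root_ino + 1 + SCOUTFS_LOCK_INODE_GROUP_MASK) &
			     ~SCOUTFS_LOCK_INODE_GROUP_MASK;

	printf("root group %llu, old first ino %llu (group %llu), "
	       "new first ino %llu (group %llu)\n",
	       (unsigned long long)ino_lock_group(root_ino),
	       (unsigned long long)old_first,
	       (unsigned long long)ino_lock_group(old_first),
	       (unsigned long long)new_first,
	       (unsigned long long)ino_lock_group(new_first));
	return 0;
}

The mkfs change at the end of this compare does the same rounding with round_up().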
Zach Brown
e6889b1882 Reduce inode lock group count to 128, from 1024
The number of inodes that each inode group lock covers is a balance
between lock overhead and contention.  With 128 inodes per lock we can
still do quite a bit of bulk creating, finding, and so on under each
lock, while reducing the risk of contention a bit.  Let's see how it
works out.

Signed-off-by: Zach Brown <zab@versity.com>
2021-05-24 13:23:46 -07:00
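A rough way to see the trade-off, again as a standalone sketch rather than scoutfs code: if a bulk operation walks a contiguous run of inode numbers, the group size determines how many inode group locks it spans. The 3000-file run mirrors the batch-commit test removed later in this series; the starting inode number is arbitrary.

/* Illustrative only, not scoutfs code: how many inode group locks a
 * contiguous run of inode numbers spans at 1024 vs 128 inodes per
 * group.  The 3000-inode run mirrors the removed batch-commit test;
 * the starting inode number is arbitrary. */
#include <stdio.h>

static unsigned long groups_spanned(unsigned long first_ino, unsigned long count,
				    unsigned long group_size)
{
	unsigned long first_group = first_ino / group_size;
	unsigned long last_group = (first_ino + count - 1) / group_size;

	return last_group - first_group + 1;
}

int main(void)
{
	unsigned long first_ino = 128;	/* arbitrary aligned starting point */
	unsigned long count = 3000;

	printf("1024 inodes/lock: %lu locks, 128 inodes/lock: %lu locks\n",
	       groups_spanned(first_ino, count, 1024),
	       groups_spanned(first_ino, count, 128));
	return 0;
}

Smaller groups batch less work under any single lock, but make it less likely that two mounts working on unrelated inodes land on the same lock.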
Zach Brown
c66f2e0a0e Remove lock grace period
We had some logic that tried to delay lock invalidation while the lock
was still actively in use.  It was meant to reduce the cost of
pathological lock conflict cases, but it had some severe fairness
problems.

It was first introduced to deal with bad patterns in userspace that no
longer exist, and it was built on top of the LSM transaction machinery
that also no longer exists.  It hasn't aged well.

Instead of introducing invalidation latency in the hope that it leads
to more batched work, which it can't always do, let's aim at reducing
latency in all parts of the write-invalidate-read path and at reducing
contention in the first place.

Signed-off-by: Zach Brown <zab@versity.com>
2021-05-24 13:23:46 -07:00
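A toy model of the fairness problem, not scoutfs code: if every local unlock pushes the invalidation deadline another grace period into the future, a steady stream of local lockers can keep a conflicting remote node waiting indefinitely. The 10ms grace period matches the GRACE_PERIOD_KT removed in the diff below; the 5ms unlock interval is an assumption made for the example.

/* Toy model, not scoutfs code: a steady stream of local unlocks, each
 * extending the grace deadline, starves invalidation.  10ms matches
 * the GRACE_PERIOD_KT removed below; the 5ms unlock interval is an
 * assumption for the example. */
#include <stdio.h>

#define GRACE_MS	10
#define UNLOCK_EVERY_MS	5

int main(void)
{
	long now_ms = 0;
	long deadline_ms = GRACE_MS;
	int unlocks = 0;

	/* invalidation may only run once the deadline has passed */
	while (now_ms < deadline_ms && unlocks < 1000) {
		now_ms += UNLOCK_EVERY_MS;		/* another local unlock... */
		deadline_ms = now_ms + GRACE_MS;	/* ...extends the deadline */
		unlocks++;
	}

	if (now_ms < deadline_ms)
		printf("invalidation still waiting after %d unlocks (%ld ms)\n",
		       unlocks, now_ms);
	else
		printf("invalidation ran after %ld ms\n", now_ms);
	return 0;
}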
9 changed files with 49 additions and 164 deletions

View File

@@ -112,9 +112,6 @@
EXPAND_COUNTER(item_write_dirty) \
EXPAND_COUNTER(lock_alloc) \
EXPAND_COUNTER(lock_free) \
-EXPAND_COUNTER(lock_grace_extended) \
-EXPAND_COUNTER(lock_grace_set) \
-EXPAND_COUNTER(lock_grace_wait) \
EXPAND_COUNTER(lock_grant_request) \
EXPAND_COUNTER(lock_grant_response) \
EXPAND_COUNTER(lock_grant_work) \

View File

@@ -757,9 +757,9 @@ enum scoutfs_dentry_type {
DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + name_len + val_len, \
(unsigned int)SCOUTFS_XATTR_MAX_PART_SIZE)
-#define SCOUTFS_LOCK_INODE_GROUP_NR 1024
+#define SCOUTFS_LOCK_INODE_GROUP_NR 128
#define SCOUTFS_LOCK_INODE_GROUP_MASK (SCOUTFS_LOCK_INODE_GROUP_NR - 1)
-#define SCOUTFS_LOCK_SEQ_GROUP_MASK ((1ULL << 10) - 1)
+#define SCOUTFS_LOCK_SEQ_GROUP_MASK ((1ULL << 7) - 1)
/*
* messages over the wire.
@@ -909,6 +909,7 @@ enum scoutfs_lock_trace {
SLT_INVALIDATE,
SLT_REQUEST,
SLT_RESPONSE,
+SLT_NR,
};
/*
@@ -961,7 +962,7 @@ enum scoutfs_corruption_sources {
#define SC_NR_LONGS DIV_ROUND_UP(SC_NR_SOURCES, BITS_PER_LONG)
-#define SCOUTFS_OPEN_INO_MAP_SHIFT 10
+#define SCOUTFS_OPEN_INO_MAP_SHIFT 7
#define SCOUTFS_OPEN_INO_MAP_BITS (1 << SCOUTFS_OPEN_INO_MAP_SHIFT)
#define SCOUTFS_OPEN_INO_MAP_MASK (SCOUTFS_OPEN_INO_MAP_BITS - 1)
#define SCOUTFS_OPEN_INO_MAP_LE64S (SCOUTFS_OPEN_INO_MAP_BITS / 64)

View File

@@ -66,8 +66,6 @@
* relative to that lock state we resend.
*/
-#define GRACE_PERIOD_KT ms_to_ktime(10)
/*
* allocated per-super, freed on unmount.
*/
@@ -84,7 +82,7 @@ struct lock_info {
struct workqueue_struct *workq;
struct work_struct grant_work;
struct list_head grant_list;
-struct delayed_work inv_dwork;
+struct work_struct inv_work;
struct list_head inv_list;
struct work_struct shrink_work;
struct list_head shrink_list;
@@ -364,23 +362,6 @@ static bool lock_counts_match(int granted, unsigned int *counts)
return true;
}
-/*
-* Returns true if there are any mode counts that match with the desired
-* mode. There can be other non-matching counts as well but we're only
-* testing for the existence of any matching counts.
-*/
-static bool lock_count_match_exists(int desired, unsigned int *counts)
-{
-enum scoutfs_lock_mode mode;
-for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
-if (counts[mode] && lock_modes_match(desired, mode))
-return true;
-}
-return false;
-}
/*
* An idle lock has nothing going on. It can be present in the lru and
* can be freed by the final put when it has a null mode.
@@ -597,24 +578,6 @@ static void put_lock(struct lock_info *linfo,struct scoutfs_lock *lock)
}
}
-/*
-* Locks have a grace period that extends after activity and prevents
-* invalidation. It's intended to let nodes do reasonable batches of
-* work as locks ping pong between nodes that are doing conflicting
-* work.
-*/
-static void extend_grace(struct super_block *sb, struct scoutfs_lock *lock)
-{
-ktime_t now = ktime_get();
-if (ktime_after(now, lock->grace_deadline))
-scoutfs_inc_counter(sb, lock_grace_set);
-else
-scoutfs_inc_counter(sb, lock_grace_extended);
-lock->grace_deadline = ktime_add(now, GRACE_PERIOD_KT);
-}
static void queue_grant_work(struct lock_info *linfo)
{
assert_spin_locked(&linfo->lock);
@@ -624,19 +587,15 @@ static void queue_grant_work(struct lock_info *linfo)
}
/*
-* We immediately queue work on the assumption that the caller might
-* have made a change (set a lock mode) which can let one of the
-* invalidating locks make forward progress, even if other locks are
-* waiting for their grace period to elapse. It's a trade-off between
-* invalidation latency and burning cpu repeatedly finding that locks
-* are still in their grace period.
+* The caller has made a change (set a lock mode) which can let one of the
+* invalidating locks make forward progress.
*/
static void queue_inv_work(struct lock_info *linfo)
{
assert_spin_locked(&linfo->lock);
if (!list_empty(&linfo->inv_list))
-mod_delayed_work(linfo->workq, &linfo->inv_dwork, 0);
+queue_work(linfo->workq, &linfo->inv_work);
}
/*
@@ -689,15 +648,6 @@ static void bug_on_inconsistent_grant_cache(struct super_block *sb,
* Grant responses can be reordered with incoming invalidation requests
* from the server so we have to be careful to only set the new mode
* once the old mode matches.
-*
-* We extend the grace period as we grant the lock if there is a waiting
-* locker who can use the lock. This stops invalidation from pulling
-* the granted lock out from under the requester, resulting in a lot of
-* churn with no forward progress. Using the grace period avoids having
-* to identify a specific waiter and give it an acquired lock. It's
-* also very similar to waking up the locker and having it win the race
-* against the invalidation. In that case they'd extend the grace
-* period anyway as they unlock.
*/
static void lock_grant_worker(struct work_struct *work)
{
@@ -732,9 +682,6 @@ static void lock_grant_worker(struct work_struct *work)
lock->mode = nl->new_mode;
lock->write_version = le64_to_cpu(nl->write_version);
-if (lock_count_match_exists(nl->new_mode, lock->waiters))
-extend_grace(sb, lock);
trace_scoutfs_lock_granted(sb, lock);
list_del_init(&lock->grant_head);
wake_up(&lock->waitq);
@@ -798,11 +745,6 @@ int scoutfs_lock_grant_response(struct super_block *sb,
* invalidate once the lock mode matches what the server told us to
* invalidate.
*
-* We delay invalidation processing until a grace period has elapsed
-* since the last unlock. The intent is to let users do a reasonable
-* batch of work before dropping the lock. Continuous unlocking can
-* continuously extend the deadline.
-*
* Before we start invalidating the lock we set the lock to the new
* mode, preventing further incompatible users of the old mode from
* using the lock while we're invalidating.
@@ -814,15 +756,11 @@ int scoutfs_lock_grant_response(struct super_block *sb,
*/
static void lock_invalidate_worker(struct work_struct *work)
{
-struct lock_info *linfo = container_of(work, struct lock_info,
-inv_dwork.work);
+struct lock_info *linfo = container_of(work, struct lock_info, inv_work);
struct super_block *sb = linfo->sb;
struct scoutfs_net_lock *nl;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
-unsigned long delay = MAX_JIFFY_OFFSET;
-ktime_t now = ktime_get();
-ktime_t deadline;
LIST_HEAD(ready);
u64 net_id;
int ret;
@@ -842,15 +780,6 @@ static void lock_invalidate_worker(struct work_struct *work)
if (!lock_counts_match(nl->new_mode, lock->users))
continue;
-/* skip if grace hasn't elapsed, record earliest */
-deadline = lock->grace_deadline;
-if (!linfo->shutdown && ktime_before(now, deadline)) {
-delay = min(delay,
-nsecs_to_jiffies(ktime_to_ns(
-ktime_sub(deadline, now))));
-scoutfs_inc_counter(linfo->sb, lock_grace_wait);
-continue;
-}
/* set the new mode, no incompatible users during inval */
lock->mode = nl->new_mode;
@@ -861,7 +790,7 @@ static void lock_invalidate_worker(struct work_struct *work)
spin_unlock(&linfo->lock);
if (list_empty(&ready))
-goto out;
+return;
/* invalidate once the lock is read */
list_for_each_entry(lock, &ready, inv_head) {
@@ -904,11 +833,6 @@ static void lock_invalidate_worker(struct work_struct *work)
/* grant might have been waiting for invalidate request */
queue_grant_work(linfo);
spin_unlock(&linfo->lock);
-out:
-/* queue delayed work if invalidations waiting on grace deadline */
-if (delay != MAX_JIFFY_OFFSET)
-queue_delayed_work(linfo->workq, &linfo->inv_dwork, delay);
}
/*
@@ -1374,10 +1298,6 @@ int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int fl
return lock_key_range(sb, mode, flags, &start, &end, lock);
}
-/*
-* As we unlock we always extend the grace period to give the caller
-* another pass at the lock before its invalidated.
-*/
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode)
{
DECLARE_LOCK_INFO(sb, linfo);
@@ -1390,7 +1310,6 @@ void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scou
spin_lock(&linfo->lock);
lock_dec_count(lock->users, mode);
-extend_grace(sb, lock);
if (lock_mode_can_write(mode))
lock->dirty_trans_seq = scoutfs_trans_sample_seq(sb);
@@ -1630,7 +1549,7 @@ void scoutfs_lock_unmount_begin(struct super_block *sb)
if (linfo) {
linfo->unmounting = true;
-flush_delayed_work(&linfo->inv_dwork);
+flush_work(&linfo->inv_work);
}
}
@@ -1723,8 +1642,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
spin_unlock(&linfo->lock);
if (linfo->workq) {
-/* pending grace work queues normal work */
-flush_workqueue(linfo->workq);
/* now all work won't queue itself */
destroy_workqueue(linfo->workq);
}
@@ -1785,7 +1702,7 @@ int scoutfs_lock_setup(struct super_block *sb)
INIT_LIST_HEAD(&linfo->lru_list);
INIT_WORK(&linfo->grant_work, lock_grant_worker);
INIT_LIST_HEAD(&linfo->grant_list);
-INIT_DELAYED_WORK(&linfo->inv_dwork, lock_invalidate_worker);
+INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
INIT_LIST_HEAD(&linfo->inv_list);
INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
INIT_LIST_HEAD(&linfo->shrink_list);

View File

@@ -27,7 +27,6 @@ struct scoutfs_lock {
u64 dirty_trans_seq;
struct list_head lru_head;
wait_queue_head_t waitq;
-ktime_t grace_deadline;
unsigned long request_pending:1,
invalidate_pending:1;

View File

@@ -78,6 +78,8 @@ struct lock_server_info {
struct scoutfs_tseq_tree tseq_tree;
struct dentry *tseq_dentry;
+struct scoutfs_tseq_tree stats_tseq_tree;
+struct dentry *stats_tseq_dentry;
struct scoutfs_alloc *alloc;
struct scoutfs_block_writer *wri;
@@ -109,6 +111,9 @@ struct server_lock_node {
struct list_head granted;
struct list_head requested;
struct list_head invalidated;
+struct scoutfs_tseq_entry stats_tseq_entry;
+u64 stats[SLT_NR];
};
/*
@@ -298,6 +303,8 @@ static struct server_lock_node *alloc_server_lock(struct lock_server_info *inf,
snode = get_server_lock(inf, key, ins, false);
if (snode != ins)
kfree(ins);
+else
+scoutfs_tseq_add(&inf->stats_tseq_tree, &snode->stats_tseq_entry);
}
}
@@ -327,8 +334,10 @@ static void put_server_lock(struct lock_server_info *inf,
mutex_unlock(&snode->mutex);
-if (should_free)
+if (should_free) {
+scoutfs_tseq_del(&inf->stats_tseq_tree, &snode->stats_tseq_entry);
kfree(snode);
+}
}
static struct client_lock_entry *find_entry(struct server_lock_node *snode,
@@ -390,6 +399,8 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
goto out;
}
+snode->stats[SLT_REQUEST]++;
clent->snode = snode;
add_client_entry(snode, &snode->requested, clent);
scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
@@ -430,6 +441,8 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
goto out;
}
+snode->stats[SLT_RESPONSE]++;
clent = find_entry(snode, &snode->invalidated, rid);
if (!clent) {
put_server_lock(inf, snode);
@@ -510,6 +523,7 @@ static int process_waiting_requests(struct super_block *sb,
trace_scoutfs_lock_message(sb, SLT_SERVER,
SLT_INVALIDATE, SLT_REQUEST,
gr->rid, 0, &nl);
+snode->stats[SLT_INVALIDATE]++;
add_client_entry(snode, &snode->invalidated, gr);
}
@@ -544,6 +558,7 @@ static int process_waiting_requests(struct super_block *sb,
trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT,
SLT_RESPONSE, req->rid,
req->net_id, &nl);
+snode->stats[SLT_GRANT]++;
/* don't track null client locks, track all else */
if (req->mode == SCOUTFS_LOCK_NULL)
@@ -794,6 +809,16 @@ static void lock_server_tseq_show(struct seq_file *m,
clent->net_id);
}
+static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
+{
+struct server_lock_node *snode = container_of(ent, struct server_lock_node,
+stats_tseq_entry);
+seq_printf(m, SK_FMT" req %llu inv %llu rsp %llu gr %llu\n",
+SK_ARG(&snode->key), snode->stats[SLT_REQUEST], snode->stats[SLT_INVALIDATE],
+snode->stats[SLT_RESPONSE], snode->stats[SLT_GRANT]);
+}
/*
* Setup the lock server. This is called before networking can deliver
* requests.
@@ -813,6 +838,7 @@ int scoutfs_lock_server_setup(struct super_block *sb,
spin_lock_init(&inf->lock);
inf->locks_root = RB_ROOT;
scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
+scoutfs_tseq_tree_init(&inf->stats_tseq_tree, stats_tseq_show);
inf->alloc = alloc;
inf->wri = wri;
atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */
@@ -824,6 +850,14 @@ int scoutfs_lock_server_setup(struct super_block *sb,
return -ENOMEM;
}
+inf->stats_tseq_dentry = scoutfs_tseq_create("tmp_lock_stats", sbi->debug_root,
+&inf->stats_tseq_tree);
+if (!inf->stats_tseq_dentry) {
+debugfs_remove(inf->tseq_dentry);
+kfree(inf);
+return -ENOMEM;
+}
sbi->lock_server_info = inf;
return 0;
@@ -845,6 +879,7 @@ void scoutfs_lock_server_destroy(struct super_block *sb)
if (inf) {
debugfs_remove(inf->tseq_dentry);
+debugfs_remove(inf->stats_tseq_dentry);
rbtree_postorder_for_each_entry_safe(snode, stmp,
&inf->locks_root, node) {

View File

@@ -1,4 +0,0 @@
-== create per mount files
-== time independent modification
-== time concurrent independent modification
-== time concurrent conflicting modification

View File

@@ -24,7 +24,6 @@ basic-posix-consistency.sh
dirent-consistency.sh
mkdir-rename-rmdir.sh
lock-ex-race-processes.sh
-lock-conflicting-batch-commit.sh
cross-mount-data-free.sh
persistent-item-vers.sh
setup-error-teardown.sh

View File

@@ -1,59 +0,0 @@
-#
-# If bulk work accidentally conflicts in the worst way we'd like to have
-# it not result in catastrophic performance. Make sure that each
-# instance of bulk work is given the opportunity to get as much as it
-# can into the transaction under a lock before the lock is revoked
-# and the transaction is committed.
-#
-t_require_commands setfattr
-t_require_mounts 2
-NR=3000
-echo "== create per mount files"
-for m in 0 1; do
-eval dir="\$T_D${m}/dir/$m"
-t_quiet mkdir -p "$dir"
-for a in $(seq 1 $NR); do touch "$dir/$a"; done
-done
-echo "== time independent modification"
-for m in 0 1; do
-eval dir="\$T_D${m}/dir/$m"
-START=$SECONDS
-for a in $(seq 1 $NR); do
-setfattr -n user.test_grace -v $a "$dir/$a"
-done
-echo "mount $m: $((SECONDS - START))" >> $T_TMP.log
-done
-echo "== time concurrent independent modification"
-START=$SECONDS
-for m in 0 1; do
-eval dir="\$T_D${m}/dir/$m"
-(for a in $(seq 1 $NR); do
-setfattr -n user.test_grace -v $a "$dir/$a";
-done) &
-done
-wait
-IND="$((SECONDS - START))"
-echo "ind: $IND" >> $T_TMP.log
-echo "== time concurrent conflicting modification"
-START=$SECONDS
-for m in 0 1; do
-eval dir="\$T_D${m}/dir/0"
-(for a in $(seq 1 $NR); do
-setfattr -n user.test_grace -v $a "$dir/$a";
-done) &
-done
-wait
-CONF="$((SECONDS - START))"
-echo "conf: $CONF" >> $T_TMP.log
-if [ "$CONF" -gt "$((IND * 5))" ]; then
-t_fail "conflicting $CONF secs is more than 5x independent $IND secs"
-fi
-t_pass

View File

@@ -197,7 +197,7 @@ static int do_mkfs(struct mkfs_args *args)
memset(super, 0, SCOUTFS_BLOCK_SM_SIZE);
super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
uuid_generate(super->uuid);
-super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
+super->next_ino = cpu_to_le64(round_up(SCOUTFS_ROOT_INO + 1, SCOUTFS_LOCK_INODE_GROUP_NR));
super->next_trans_seq = cpu_to_le64(1);
super->total_meta_blocks = cpu_to_le64(last_meta + 1);
super->first_meta_blkno = cpu_to_le64(next_meta);