Compare commits

...

16 Commits

Author SHA1 Message Date
Zach Brown
8ee41caa24 scoutfs: introduce CodingStyle.txt
Add a coding style document that tries to record the conventions that
the project uses.  It seemed more appropriate to put it up in the -kmod
git repo context rather than in src/ which would end up in fs/scoutfs/
upstream.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-31 10:55:18 -07:00
Andy Grover
09c879bcf1 Merge pull request #25 from versity/zab/client_greeting_items_exist
Zab/client greeting items exist
2021-03-16 15:57:55 -07:00
Zach Brown
3de703757f Fix weird comment editing error
That comment looked very weird indeed until I recognized that I must
have forgotten to delete the first two attempts at starting the
sentence.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 12:02:05 -07:00
Zach Brown
7d67489b0c Handle resent initial client greetings
The very first greeting a client sends is unique because it doesn't yet
have a server_term field set and tells the server to create items to
track the client.

A server processing this request can create the items and then shut down
before the client is able to receive the reply.  They'll resend the
greeting without server_term but then the next server will get -EEXIST
errors as it tries to create items for the client.  This causes the
connection to break, which the client tries to reestablish, and the
pattern repeats indefinitely.

The fix is to simply recognize that -EEXIST is acceptable during item
creation.  Server message handlers always have to address the case where
a resent message was already processed by a previous server but its
response didn't make it to the client.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 11:56:26 -07:00
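
The shape of the fix, condensed from the server-side diffs further down; this fragment is a simplified sketch of the pattern, not the complete handler:

	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
				   &super->mounted_clients, &key, &mcv,
				   sizeof(mcv));
	if (ret == -EEXIST)
		ret = 0;	/* a previous server already created the item */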
Zach Brown
73084462e9 Remove unused client greeting_umb
Remove an old client info field from the unmount barrier mechanism which
was removed a while ago.  It used to be compared to a super field to
decide to finish unmount without reconnecting but now we check for our
mounted_client item in the server's btree.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-16 10:04:42 -07:00
Zach Brown
8c81af2b9b Merge pull request #22 from agrover/ipv6
Reserve space in superblock for IPv6 addresses
2021-03-15 16:04:26 -07:00
Andy Grover
efe5d92458 Reserve space in superblock for IPv6 addresses
Define a family field, and add a union for IPv4 and v6 variants, although
v6 is not supported yet.

The family field is now used to determine the presence of an address in
a quorum slot, instead of checking whether addr is zero.

Signed-off-by: Andy Grover <agrover@versity.com>
2021-03-12 14:10:42 -08:00
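
A minimal sketch of the presence test after this change, adapted from the quorum diff further down (the real check is quorum_slot_present(); this standalone form is illustrative and assumes the structs from the format.h diff):

	static bool slot_present(struct scoutfs_quorum_slot *sl)
	{
		/* unconfigured slots are zeroed, so family reads as SCOUTFS_AF_NONE */
		return sl->addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
	}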
Andy Grover
d39e56d953 Merge pull request #24 from versity/zab/fix-block-stale-reads
Zab/fix block stale reads
2021-03-11 09:33:03 -08:00
Zach Brown
5661a1fb02 Fix block-stale-reads test
The block-stale-reads test was built from the ashes of a test that
used counters and triggers to work with the btree when it was
only used on the server.

The initial quick translation to try and trigger block cache retries
while the forest called the btree got a lot wrong.  It was still trying
to use a 'cl' variable that no longer referred to the client, the
trigger helpers now call statfs to find paths and can end up firing the
triggers themselves, and many more stale counter reads can happen
throughout the system while we're working -- not just the one from our
trigger.

This fixes it up to consistently use fs numbers instead of
the silly stale cl variable and be less sensitive to triggers firing and
counter differences.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:36:41 -08:00
Zach Brown
12fa289399 Add t_trigger_arm_silent
t_trigger_arm always output the value of the trigger after arming it,
on the premise that tests wanted to show that the trigger was armed.
In the process of showing the trigger it calls a bunch of t_ helpers
that build the path to the trigger file, using statfs_more to get the
rid of mounts.

If the trigger being armed is in the server's mount and that specific
trigger is fired by the server's statfs_more request processing, then
the trigger can fire before we read its value.  Tests can
inconsistently fail as the golden output shows the trigger being armed
or not, depending on whether it was in the server's mount.

t_trigger_arm_silent doesn't output the value of the armed trigger.  It
can be used for low level triggers that don't rely on reading the
trigger's value to discover that their effect has happened.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:36:34 -08:00
Zach Brown
75e8fab57c Add t_counter_diff_changed
Tests can use t_counter_diff to put a message in their golden output
when a specific change in counters is expected.  This adds
t_counter_diff_changed to output a message that indicates change or not,
for tests that want to see counters change but where the amount of
change doesn't need to be precisely known.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-10 12:32:04 -08:00
Zach Brown
513d6b2734 Merge pull request #20 from versity/zab/remove_trans_spinlock
Zab/remove trans spinlock
2021-03-04 13:59:07 -08:00
Zach Brown
f8d39610a2 Only get inode writeback_lock when adding inodes
Each transaction maintains a global list of inodes to sync.  It checks
the inode and adds it in each write_end call per OS page.  Locking and
unlocking the global spinlock was showing up in profiles.  At the very
least, we can take the lock only once per large file that's written
during a transaction.  This will reduce spinlock traffic on the lock by
the number of pages written per file.  We'll want a better solution in
the long run, but this helps for now.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-04 11:39:30 -08:00
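
A minimal sketch of the check-then-lock-then-recheck pattern the commit applies, using a plain list rather than the rbtree in the actual code; all names here are hypothetical:

	#include <linux/spinlock.h>
	#include <linux/list.h>

	struct sync_list {
		spinlock_t lock;
		struct list_head head;
	};

	struct tracked {
		struct list_head entry;	/* initialized empty, non-empty once tracked */
	};

	/* called for every page written; only the first call per object locks */
	static void track_once(struct sync_list *sl, struct tracked *obj)
	{
		if (!list_empty(&obj->entry))
			return;

		spin_lock(&sl->lock);
		/* recheck under the lock, the unlocked test can race with another adder */
		if (list_empty(&obj->entry))
			list_add_tail(&obj->entry, &sl->head);
		spin_unlock(&sl->lock);
	}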
Zach Brown
c470c1c9f6 Allow read-mostly _alloc_meta_low
Each transaction hold makes multiple calls to _alloc_meta_low to see if
the transaction should be committed to refill allocators before the
caller's hold is acquired and they can dirty blocks in the transaction.

_alloc_meta_low was using a spinlock to sample the allocator list_head
blocks to determine if there was space available.  The lock and unlock
stores were creating significant cacheline contention.

The _alloc_meta_low calls are higher frequency than allocations.  We can
use a seqlock to have exclusive writers and allow concurrent
_alloc_meta_low readers who retry if a writer intervenes.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-04 11:39:30 -08:00
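
A standalone sketch of the seqlock pattern described above, with illustrative names (the real change is in the alloc.c diff further down):

	#include <linux/seqlock.h>

	static DEFINE_SEQLOCK(counts_seqlock);
	static u32 avail_nr;

	/* rare writer: holds the sequence lock exclusively */
	static void consume_avail(u32 nr)
	{
		write_seqlock(&counts_seqlock);
		avail_nr -= nr;
		write_sequnlock(&counts_seqlock);
	}

	/* frequent reader: only loads, retries if a writer intervened */
	static bool avail_low(u32 nr)
	{
		unsigned int seq;
		bool lo;

		do {
			seq = read_seqbegin(&counts_seqlock);
			lo = avail_nr < nr;
		} while (read_seqretry(&counts_seqlock, seq));

		return lo;
	}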
Andy Grover
cad902b9cd Merge pull request #19 from versity/zab/block_crash_and_consistency
Zab/block crash and consistency
2021-03-04 10:57:27 -08:00
Zach Brown
e163f3b099 Use atomic holders instead of trans info lock
We saw the transaction info lock showing up in profiles.  We were doing
quite a lot of work with that lock held.  We can remove it entirely and
use an atomic.

Instead of a locked holders count and writer boolean we can use an
atomic holders and have a high bit indicate that the write_func is
pending.  This turns the lock/unlock pairs in hold and release into
atomic inc/cmpxchg/dec operations.

Then we were checking allocators under the trans lock.  Now that we have
an atomic holders count we can increment it to prevent the writer from
committing and release it after the checks if we need another commit
before the hold.

And finally, we were freeing our allocated reservation struct under the
lock.  We weren't actually doing anything with the reservation struct so
we can use journal_info as the nested hold counter instead of having it
point to an allocated and freed struct.

Signed-off-by: Zach Brown <zab@versity.com>
2021-03-01 14:18:04 -08:00
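
A condensed sketch of the holders scheme; the real masks and helpers are in the trans.c diff further down, and this standalone form is only illustrative:

	#include <linux/atomic.h>

	#define WRITE_FUNC_BIT	(1 << 30)	/* high bit marks the pending write_func */
	#define COUNT_MASK	(WRITE_FUNC_BIT - 1)

	static atomic_t holders = ATOMIC_INIT(0);

	/* hold attempt: count ourselves unless the writer has raised its bit */
	static bool inc_unless_writer(void)
	{
		int cur;

		do {
			cur = atomic_read(&holders);
			if (cur & WRITE_FUNC_BIT)
				return false;	/* caller waits for the commit to finish */
		} while (atomic_cmpxchg(&holders, cur, cur + 1) != cur);

		return true;
	}

	/* write func: raise the bit, then wait for the holder count to drain */
	static bool drained(void)
	{
		return (atomic_read(&holders) & COUNT_MASK) == 0;
	}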
19 changed files with 500 additions and 276 deletions

82
CodingStyle.txt Normal file
View File

@@ -0,0 +1,82 @@
We try to maintain a consistent coding style across the project. It's
admittedly arbitrary and is based on upstream's
Documentation/CodingStyle. Conventions are added here as they come up
during review. We'll demonstrate each stylistic preference with a diff
snippet.
== Try to make one exit point for reasonably long functions
{
- void *a;
- void *b;
+ void *a = NULL;
+ void *b = NULL;
+ int ret;
a = kalloc();
- if (!a)
- return 1;
+ if (!a) {
+ ret = 1;
+ goto out;
+ }
b = kalloc();
if (!b) {
- kfree(a);
- return 2;
+ ret = 2;
+ goto out;
}
- return 3
+ ret = 3;
+out:
+ kfree(a);
+ kfree(b);
+ return ret;
}
The idea is to initialize all state at the top of the function, modify
it throughout, and clean it all up at the end. Having one
exit point also gives us a place to add tracing of function exit.
== Multiple declarations on a line
- int i, j;
+ int i;
+ int j;
Declare function variables one per line. The verbose declarations
create pressure to think about excessive stack use or over-long
functions, make initializers clear, and leave room for comments.
== Balance braces
- if (IS_ERR(super_block))
+ if (IS_ERR(super_block)) {
return PTR_ERR(super_block);
- else {
+ } else {
*super_res = *super_block;
kfree(super_block);
return 0;
}
*nervous twitch*
== Cute variable definition waterfalls
+ struct block_device *meta_bdev;
struct scoutfs_sb_info *sbi;
struct mount_options opts;
- struct block_device *meta_bdev;
struct inode *inode;
This isn't strictly necessary, but it's nice to try and arrange
declarations in a pretty descending order of length. It often has the
accidental effect of sorting definitions by decreasing complexity. I
tend to group types when the name lengths are pretty close, even if
they're not strictly sorted, so that all the ints, u64s, keys, etc, are
all together.

View File

@@ -252,7 +252,7 @@ void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
{
memset(alloc, 0, sizeof(struct scoutfs_alloc));
spin_lock_init(&alloc->lock);
seqlock_init(&alloc->seqlock);
mutex_init(&alloc->mutex);
alloc->avail = *avail;
alloc->freed = *freed;
@@ -526,7 +526,8 @@ int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
goto out;
spin_lock(&alloc->lock);
write_seqlock(&alloc->seqlock);
lblk = alloc->dirty_avail_bl->data;
if (WARN_ON_ONCE(lblk->nr == 0)) {
/* shouldn't happen, transaction should commit first */
@@ -536,7 +537,8 @@ int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
list_block_remove(&alloc->avail, lblk, 1);
ret = 0;
}
spin_unlock(&alloc->lock);
write_sequnlock(&alloc->seqlock);
out:
if (ret < 0)
@@ -559,7 +561,8 @@ int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
goto out;
spin_lock(&alloc->lock);
write_seqlock(&alloc->seqlock);
lblk = alloc->dirty_freed_bl->data;
if (WARN_ON_ONCE(list_block_space(lblk->nr) == 0)) {
/* shouldn't happen, transaction should commit first */
@@ -568,7 +571,8 @@ int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
list_block_add(&alloc->freed, lblk, blkno);
ret = 0;
}
spin_unlock(&alloc->lock);
write_sequnlock(&alloc->seqlock);
out:
scoutfs_inc_counter(sb, alloc_free_meta);
@@ -1066,17 +1070,23 @@ out:
/*
* Returns true if meta avail and free don't have room for the given
* number of alloctions or frees.
* number of allocations or frees. This is called at a significantly
* higher frequency than allocations as writers try to enter
* transactions. This is the only reader of the seqlock which gives
* read-mostly sampling instead of bouncing a spinlock around all the
* cores.
*/
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr)
{
unsigned int seq;
bool lo;
spin_lock(&alloc->lock);
lo = le32_to_cpu(alloc->avail.first_nr) < nr ||
list_block_space(alloc->freed.first_nr) < nr;
spin_unlock(&alloc->lock);
do {
seq = read_seqbegin(&alloc->seqlock);
lo = le32_to_cpu(alloc->avail.first_nr) < nr ||
list_block_space(alloc->freed.first_nr) < nr;
} while (read_seqretry(&alloc->seqlock, seq));
return lo;
}

View File

@@ -72,7 +72,8 @@
* transaction.
*/
struct scoutfs_alloc {
spinlock_t lock;
/* writers rarely modify list_head avail/freed. readers often check in _alloc_meta_low */
seqlock_t seqlock;
struct mutex mutex;
struct scoutfs_block *dirty_avail_bl;
struct scoutfs_block *dirty_freed_bl;

View File

@@ -49,7 +49,6 @@ struct client_info {
struct delayed_work connect_dwork;
u64 server_term;
u64 greeting_umb;
bool sending_farewell;
int farewell_error;

View File

@@ -86,11 +86,33 @@ struct scoutfs_timespec {
__u8 __pad[4];
};
/* XXX ipv6 */
struct scoutfs_inet_addr {
__le32 addr;
enum scoutfs_inet_family {
SCOUTFS_AF_NONE = 0,
SCOUTFS_AF_IPV4 = 1,
SCOUTFS_AF_IPV6 = 2,
};
struct scoutfs_inet_addr4 {
__le16 family;
__le16 port;
__u8 __pad[2];
__le32 addr;
};
/*
* Not yet supported by code.
*/
struct scoutfs_inet_addr6 {
__le16 family;
__le16 port;
__u8 addr[16];
__le32 flow_info;
__le32 scope_id;
__u8 __pad[4];
};
union scoutfs_inet_addr {
struct scoutfs_inet_addr4 v4;
struct scoutfs_inet_addr6 v6;
};
/*
@@ -591,7 +613,7 @@ struct scoutfs_quorum_message {
struct scoutfs_quorum_config {
__le64 version;
struct scoutfs_quorum_slot {
struct scoutfs_inet_addr addr;
union scoutfs_inet_addr addr;
} slots[SCOUTFS_QUORUM_MAX_SLOTS];
};

View File

@@ -1621,19 +1621,28 @@ int scoutfs_orphan_inode(struct inode *inode)
}
/*
* Track an inode that could have dirty pages. Used to kick off writeback
* on all dirty pages during transaction commit without tying ourselves in
* knots trying to call through the high level vfs sync methods.
* Track an inode that could have dirty pages. Used to kick off
* writeback on all dirty pages during transaction commit without tying
* ourselves in knots trying to call through the high level vfs sync
* methods.
*
* This is called by writers who hold the inode and transaction. The
* inode's presence in the rbtree is removed by destroy_inode, prevented
* by the inode hold, and by committing the transaction, which is
* prevented by holding the transaction. The inode can only go from
* empty to on the rbtree while we're here.
*/
void scoutfs_inode_queue_writeback(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node)) {
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
}
}
/*

View File

@@ -586,7 +586,9 @@ static void init_lock_clients_key(struct scoutfs_key *key, u64 rid)
* the client had already talked to the server then we must find an
* existing record for it and should begin recovery. If it doesn't have
a record then it's timed out and we can't allow it to reconnect. If
it's connecting for the first time then we insert a new record. If
* we're creating a new record for a client we can see EEXIST if the
* greeting is resent to a new server after the record was committed but
* before the response was received by the client.
*
* This is running in concurrent client greeting processing contexts.
*/
@@ -611,6 +613,8 @@ int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
ret = scoutfs_btree_insert(sb, inf->alloc, inf->wri,
&super->lock_clients,
&key, NULL, 0);
if (ret == -EEXIST)
ret = 0;
}
mutex_unlock(&inf->mutex);

View File

@@ -1546,9 +1546,8 @@ void scoutfs_net_client_greeting(struct super_block *sb,
* response and they can disconnect cleanly.
*
* At this point our connection is idle except for send submissions and
* shutdown being queued. Once we shut down a We completely own a We
* have exclusive access to a previous conn once its shutdown and we set
* _freeing.
* shutdown being queued. We have exclusive access to the previous conn
* once it's shutdown and we set _freeing.
*/
void scoutfs_net_server_greeting(struct super_block *sb,
struct scoutfs_net_connection *conn,

View File

@@ -90,19 +90,13 @@ enum conn_flags {
#define SIN_ARG(sin) sin, be16_to_cpu((sin)->sin_port)
static inline void scoutfs_addr_to_sin(struct sockaddr_in *sin,
struct scoutfs_inet_addr *addr)
union scoutfs_inet_addr *addr)
{
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = cpu_to_be32(le32_to_cpu(addr->addr));
sin->sin_port = cpu_to_be16(le16_to_cpu(addr->port));
}
BUG_ON(addr->v4.family != cpu_to_le16(SCOUTFS_AF_IPV4));
static inline void scoutfs_addr_from_sin(struct scoutfs_inet_addr *addr,
struct sockaddr_in *sin)
{
addr->addr = be32_to_le32(sin->sin_addr.s_addr);
addr->port = be16_to_le16(sin->sin_port);
memset(addr->__pad, 0, sizeof(addr->__pad));
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = cpu_to_be32(le32_to_cpu(addr->v4.addr));
sin->sin_port = cpu_to_be16(le16_to_cpu(addr->v4.port));
}
struct scoutfs_net_connection *

View File

@@ -138,7 +138,7 @@ static bool quorum_slot_present(struct scoutfs_super_block *super, int i)
{
BUG_ON(i < 0 || i > SCOUTFS_QUORUM_MAX_SLOTS);
return super->qconf.slots[i].addr.addr != 0;
return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
}
static ktime_t election_timeout(void)
@@ -976,6 +976,9 @@ static int verify_quorum_slots(struct super_block *sb)
}
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
if (!quorum_slot_present(super, j))
continue;
scoutfs_quorum_slot_sin(super, j, &other);
if (sin.sin_addr.s_addr == other.sin_addr.s_addr &&

View File

@@ -423,61 +423,34 @@ TRACE_EVENT(scoutfs_trans_write_func,
TP_printk(SCSBF" dirty %lu", SCSB_TRACE_ARGS, __entry->dirty)
);
TRACE_EVENT(scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *rsv, unsigned int rsv_holders,
unsigned int tri_holders,
unsigned int tri_writing),
DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
TP_ARGS(sb, journal_info, holders),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, rsv)
__field(unsigned int, rsv_holders)
__field(unsigned int, tri_holders)
__field(unsigned int, tri_writing)
__field(unsigned long, journal_info)
__field(int, holders)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->rsv = rsv;
__entry->rsv_holders = rsv_holders;
__entry->tri_holders = tri_holders;
__entry->tri_writing = tri_writing;
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
),
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
__entry->tri_holders, __entry->tri_writing)
TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
);
TRACE_EVENT(scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb,
void *rsv, unsigned int rsv_holders,
unsigned int tri_holders,
unsigned int tri_writing),
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(void *, rsv)
__field(unsigned int, rsv_holders)
__field(unsigned int, tri_holders)
__field(unsigned int, tri_writing)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->rsv = rsv;
__entry->rsv_holders = rsv_holders;
__entry->tri_holders = tri_holders;
__entry->tri_writing = tri_writing;
),
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
__entry->tri_holders, __entry->tri_writing)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);
TRACE_EVENT(scoutfs_ioc_release,

View File

@@ -1024,6 +1024,12 @@ static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
};
}
/*
* Insert a new mounted client item for a client that is sending us a
* greeting that hasn't yet seen a response. The greeting can be
retransmitted to a new server after the previous server inserted the item so
* it's acceptable to see -EEXIST.
*/
static int insert_mounted_client(struct super_block *sb, u64 rid,
u64 gr_flags)
{
@@ -1042,6 +1048,8 @@ static int insert_mounted_client(struct super_block *sb, u64 rid,
ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
&super->mounted_clients, &key, &mcv,
sizeof(mcv));
if (ret == -EEXIST)
ret = 0;
mutex_unlock(&server->mounted_clients_mutex);
return ret;

View File

@@ -39,17 +39,15 @@
* track the relationships between dirty blocks so there's only ever one
* transaction being built.
*
* The copy of the on-disk super block in the fs sb info has its header
* sequence advanced so that new dirty blocks inherit this dirty
* sequence number. It's only advanced once all those dirty blocks are
* reachable after having first written them all out and then the new
* super with that seq. It's first incremented at mount.
* Committing the current dirty transaction can be triggered by sync, a
* regular background commit interval, reaching a dirty block threshold,
* or the transaction running out of its private allocator resources.
* Once all the current holders release the writing func writes out the
* dirty blocks while excluding holders until it finishes.
*
* Unfortunately writers can nest. We don't bother trying to special
* case holding a transaction that you're already holding because that
* requires per-task storage. We just let anyone hold transactions
* regardless of waiters waiting to write, which risks waiters waiting a
* very long time.
* Unfortunately writing holders can nest. We track nested hold callers
* with the per-task journal_info pointer to avoid deadlocks between
* holders that might otherwise wait for a pending commit.
*/
/* sync dirty data at least this often */
@@ -59,9 +57,7 @@
* XXX move the rest of the super trans_ fields here.
*/
struct trans_info {
spinlock_t lock;
unsigned holders;
bool writing;
atomic_t holders;
struct scoutfs_log_trees lt;
struct scoutfs_alloc alloc;
@@ -71,17 +67,9 @@ struct trans_info {
#define DECLARE_TRANS_INFO(sb, name) \
struct trans_info *name = SCOUTFS_SB(sb)->trans_info
static bool drained_holders(struct trans_info *tri)
{
bool drained;
spin_lock(&tri->lock);
tri->writing = true;
drained = tri->holders == 0;
spin_unlock(&tri->lock);
return drained;
}
/* avoid the high sign bit out of an abundance of caution */
#define TRANS_HOLDERS_WRITE_FUNC_BIT (1 << 30)
#define TRANS_HOLDERS_COUNT_MASK (TRANS_HOLDERS_WRITE_FUNC_BIT - 1)
static int commit_btrees(struct super_block *sb)
{
@@ -126,6 +114,36 @@ bool scoutfs_trans_has_dirty(struct super_block *sb)
return scoutfs_block_writer_has_dirty(sb, &tri->wri);
}
/*
* This is racing with wait_event conditions, make sure our atomic
* stores and waitqueue loads are ordered.
*/
static void sub_holders_and_wake(struct super_block *sb, int val)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
atomic_sub(val, &tri->holders);
smp_mb(); /* make sure sub is visible before we wake */
if (waitqueue_active(&sbi->trans_hold_wq))
wake_up(&sbi->trans_hold_wq);
}
/*
* called as a wait_event condition, needs to be careful to not change
* task state and is racing with waking paths that sub_return, test, and
* wake.
*/
static bool drained_holders(struct trans_info *tri)
{
int holders;
smp_mb(); /* make sure task in wait_event queue before atomic read */
holders = atomic_read(&tri->holders) & TRANS_HOLDERS_COUNT_MASK;
return holders == 0;
}
/*
* This work func is responsible for writing out all the dirty blocks
* that make up the current dirty transaction. It prevents writers from
@@ -162,6 +180,9 @@ void scoutfs_trans_write_func(struct work_struct *work)
sbi->trans_task = current;
/* mark that we're writing so holders wait for us to finish and clear our bit */
atomic_add(TRANS_HOLDERS_WRITE_FUNC_BIT, &tri->holders);
wait_event(sbi->trans_hold_wq, drained_holders(tri));
trace_scoutfs_trans_write_func(sb,
@@ -213,11 +234,8 @@ out:
spin_unlock(&sbi->trans_write_lock);
wake_up(&sbi->trans_write_wq);
spin_lock(&tri->lock);
tri->writing = false;
spin_unlock(&tri->lock);
wake_up(&sbi->trans_hold_wq);
/* we're done, wake waiting holders */
sub_holders_and_wake(sb, TRANS_HOLDERS_WRITE_FUNC_BIT);
sbi->trans_task = NULL;
@@ -309,53 +327,83 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
}
/*
* Each thread reserves space in the segment for their dirty items while
* they hold the transaction. This is calculated before the first
* transaction hold is acquired. It includes all the potential nested
* item manipulation that could happen with the transaction held.
* Including nested holds avoids having to deal with writing out partial
* transactions while a caller still holds the transaction.
* We store nested holders in the lower bits of journal_info. We use
* some higher bits as a magic value to detect if something goes
* horribly wrong and it gets clobbered.
*/
#define TRANS_JI_MAGIC 0xd5700000
#define TRANS_JI_MAGIC_MASK 0xfff00000
#define TRANS_JI_COUNT_MASK 0x000fffff
#define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
struct scoutfs_reservation {
unsigned magic;
unsigned holders;
};
/* returns true if a caller already had a holder counted in journal_info */
static bool inc_journal_info_holders(void)
{
unsigned long holders = (unsigned long)current->journal_info;
WARN_ON_ONCE(holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) != TRANS_JI_MAGIC));
if (holders == 0)
holders = TRANS_JI_MAGIC;
holders++;
current->journal_info = (void *)holders;
return (holders > (TRANS_JI_MAGIC | 1));
}
static void dec_journal_info_holders(void)
{
unsigned long holders = (unsigned long)current->journal_info;
WARN_ON_ONCE(holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) != TRANS_JI_MAGIC));
WARN_ON_ONCE((holders & TRANS_JI_COUNT_MASK) == 0);
holders--;
if (holders == TRANS_JI_MAGIC)
holders = 0;
current->journal_info = (void *)holders;
}
/*
* Try to hold the transaction. If a caller already holds the trans then
* we piggy back on their hold. We wait if the writer is trying to
write out the transaction. And if our items won't fit then we kick off
* a write.
* This is called as the wait_event condition for holding a transaction.
* Increment the holder count unless the writer is present. We return
* false to wait until the writer finishes and wakes us.
*
* This is called as a condition for wait_event. It is very limited in
* the locking (blocking) it can do because the caller has set the task
state before testing the condition to safely race with waking after
* setting the condition. Our checking the amount of dirty metadata
* blocks and free data blocks is racy, but we don't mind the risk of
* delaying or prematurely forcing commits.
* This can be racing with itself while there's no waiters. We retry
* the cmpxchg instead of returning and waiting.
*/
static bool acquired_hold(struct super_block *sb,
struct scoutfs_reservation *rsv)
static bool inc_holders_unless_writer(struct trans_info *tri)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired = false;
int holders;
spin_lock(&tri->lock);
do {
smp_mb(); /* make sure we read after wait puts task in queue */
holders = atomic_read(&tri->holders);
if (holders & TRANS_HOLDERS_WRITE_FUNC_BIT)
return false;
trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
tri->holders, tri->writing);
} while (atomic_cmpxchg(&tri->holders, holders, holders + 1) != holders);
/* use a caller's existing reservation */
if (rsv->holders)
goto hold;
return true;
}
/* wait until the writing thread is finished */
if (tri->writing)
goto out;
/*
* As we drop the last trans holder we try to wake a writing thread that
* was waiting for us to finish.
*/
static void release_holders(struct super_block *sb)
{
dec_journal_info_holders();
sub_holders_and_wake(sb, 1);
}
/*
* The caller has incremented holders so it is blocking commits. We
* make some quick checks to see if we need to trigger and wait for
* another commit before proceeding.
*/
static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
{
/*
* In theory each dirty item page could be straddling two full
* blocks, requiring 4 allocations for each item cache page.
@@ -365,11 +413,9 @@ static bool acquired_hold(struct super_block *sb,
* that it accounts for having to dirty parent blocks and
* whatever dirtying is done during the transaction hold.
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc,
scoutfs_item_dirty_pages(sb) * 2)) {
if (scoutfs_alloc_meta_low(sb, &tri->alloc, scoutfs_item_dirty_pages(sb) * 2)) {
scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
queue_trans_work(sbi);
goto out;
return true;
}
/*
@@ -381,57 +427,74 @@ static bool acquired_hold(struct super_block *sb,
*/
if (scoutfs_alloc_meta_low(sb, &tri->alloc, 16)) {
scoutfs_inc_counter(sb, trans_commit_meta_alloc_low);
queue_trans_work(sbi);
goto out;
return true;
}
/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
queue_trans_work(sbi);
return true;
}
return false;
}
static bool acquired_hold(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}
hold:
rsv->holders++;
tri->holders++;
/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}
/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}
trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
spin_unlock(&tri->lock);
return acquired;
}
/*
* Try to hold the transaction. Holding the transaction prevents it
* from being committed. If a transaction is currently being written
* then we'll block until it's done and our hold can be granted.
*
* If a caller already holds the trans then we unconditionally acquire
* our hold and return to avoid deadlocks with our caller, the writing
* thread, and us. We record nested holds in a call stack with the
* journal_info pointer in the task_struct.
*
* The writing thread marks itself as a global trans_task which
* short-circuits all the hold machinery so it can call code that would
* otherwise try to hold transactions while it is writing.
*/
int scoutfs_hold_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_reservation *rsv;
int ret;
if (current == sbi->trans_task)
return 0;
rsv = current->journal_info;
if (rsv == NULL) {
rsv = kzalloc(sizeof(struct scoutfs_reservation), GFP_NOFS);
if (!rsv)
return -ENOMEM;
rsv->magic = SCOUTFS_RESERVATION_MAGIC;
current->journal_info = rsv;
}
BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);
ret = wait_event_interruptible(sbi->trans_hold_wq,
acquired_hold(sb, rsv));
if (ret && rsv->holders == 0) {
current->journal_info = NULL;
kfree(rsv);
}
return ret;
return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
}
/*
@@ -441,50 +504,22 @@ int scoutfs_hold_trans(struct super_block *sb)
*/
bool scoutfs_trans_held(void)
{
struct scoutfs_reservation *rsv = current->journal_info;
unsigned long holders = (unsigned long)current->journal_info;
return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
return (holders != 0 && ((holders & TRANS_JI_MAGIC_MASK) == TRANS_JI_MAGIC));
}
/*
* As we drop the last hold in the reservation we try and wake other
* hold attempts that were waiting for space. As we drop the last trans
* holder we try to wake a writing thread that was waiting for us to
* finish.
*/
void scoutfs_release_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_reservation *rsv;
DECLARE_TRANS_INFO(sb, tri);
bool wake = false;
if (current == sbi->trans_task)
return;
rsv = current->journal_info;
BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
release_holders(sb);
spin_lock(&tri->lock);
trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);
BUG_ON(rsv->holders <= 0);
BUG_ON(tri->holders <= 0);
if (--rsv->holders == 0) {
current->journal_info = NULL;
kfree(rsv);
wake = true;
}
if (--tri->holders == 0)
wake = true;
spin_unlock(&tri->lock);
if (wake)
wake_up(&sbi->trans_hold_wq);
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
}
/*
@@ -513,7 +548,7 @@ int scoutfs_setup_trans(struct super_block *sb)
if (!tri)
return -ENOMEM;
spin_lock_init(&tri->lock);
atomic_set(&tri->holders, 0);
scoutfs_block_writer_init(sb, &tri->wri);
sbi->trans_write_workq = alloc_workqueue("scoutfs_trans",

View File

@@ -209,12 +209,19 @@ t_trigger_show() {
echo "trigger $which $string: $(t_trigger_get $which $nr)"
}
t_trigger_arm() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
echo 1 > "$path/$which"
}
t_trigger_arm() {
local which="$1"
local nr="$2"
t_trigger_arm_silent $which $nr
t_trigger_show $which armed $nr
}
@@ -229,16 +236,44 @@ t_counter() {
cat "$(t_sysfs_path $nr)/counters/$which"
}
#
# output the difference between the current value of a counter and the
# caller's provided previous value.
#
t_counter_diff_value() {
local which="$1"
local old="$2"
local nr="$3"
local new="$(t_counter $which $nr)"
echo "$((new - old))"
}
#
# output the value of the given counter for the given mount, defaulting
# to mount 0 if a mount isn't specified.
# to mount 0 if a mount isn't specified. For tests which expect a
# specific difference in counters.
#
t_counter_diff() {
local which="$1"
local old="$2"
local nr="$3"
local new
new="$(t_counter $which $nr)"
echo "counter $which diff $((new - old))"
echo "counter $which diff $(t_counter_diff_value $which $old $nr)"
}
#
# output a message indicating whether or not the counter value changed.
# For tests that expect a difference, or not, but the amount of
# difference isn't significant.
#
t_counter_diff_changed() {
local which="$1"
local old="$2"
local nr="$3"
local diff="$(t_counter_diff_value $which $old $nr)"
test "$diff" -eq 0 && \
echo "counter $which didn't change" ||
echo "counter $which changed"
}

View File

@@ -1,29 +1,52 @@
== create file for xattr ping pong
# file: /mnt/test/test/block-stale-reads/file
user.xat="initial"
== retry btree forest reads between mounts
trigger block_remove_stale armed: 0
== create shared test file
== set and get xattrs between mount pairs while retrying
# file: /mnt/test/test/block-stale-reads/file
user.xat="1"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 1
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="2"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 2
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="3"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 3
trigger block_remove_stale armed: 0
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="4"
trigger block_remove_stale after: 0
counter block_cache_remove_stale diff 4
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="5"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="6"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="7"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="8"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="9"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed
# file: /mnt/test/test/block-stale-reads/file
user.xat="10"
counter block_cache_remove_stale changed
counter block_cache_remove_stale changed

View File

@@ -1,5 +1,5 @@
#
# exercise stale block reading.
# Exercise stale block reading.
#
# It would be very difficult to manipulate the allocators, cache, and
# persistent blocks to create stale block reading scenarios. Instead
@@ -7,34 +7,55 @@
#
t_require_commands touch setfattr getfattr
t_require_mounts 2
inc_wrap_fs_nr()
{
local nr="$(($1 + 1))"
if [ "$nr" == "$T_NR_MOUNTS" ]; then
nr=0
fi
echo $nr
}
GETFATTR="getfattr --absolute-names"
SETFATTR="setfattr"
#
# force re-reading forest btree blocks as each mount reads the items
# written by the other.
#
set_file="$T_D0/file"
get_file="$T_D1/file"
echo "== create file for xattr ping pong"
touch "$set_file"
$SETFATTR -n user.xat -v initial "$set_file"
$GETFATTR -n user.xat "$get_file" 2>&1 | t_filter_fs
echo "== create shared test file"
touch "$T_D0/file"
$SETFATTR -n user.xat -v 0 "$T_D0/file"
echo "== retry btree forest reads between mounts"
for i in $(seq 1 4); do
tmp="$set_file"
set_file="$get_file"
get_file="$tmp"
#
# Trigger retries in the block cache as we bounce xattr values around
# between sequential pairs of mounts. This is a little silly because if
# either of the mounts is the server then they'll almost certainly have
# their trigger fired prematurely by message handling btree calls while
# working with the t_ helpers long before we work with the xattrs. But
# the block cache stale retry path is still being exercised.
#
echo "== set and get xattrs between mount pairs while retrying"
set_nr=0
get_nr=$(inc_wrap_fs_nr $set_nr)
for i in $(seq 1 10); do
eval set_file="\$T_D${set_nr}/file"
eval get_file="\$T_D${get_nr}/file"
old_set=$(t_counter block_cache_remove_stale $set_nr)
old_get=$(t_counter block_cache_remove_stale $get_nr)
t_trigger_arm_silent block_remove_stale $set_nr
t_trigger_arm_silent block_remove_stale $get_nr
$SETFATTR -n user.xat -v $i "$set_file"
t_trigger_arm block_remove_stale $cl
old=$(t_counter btree_stale_read $cl)
$GETFATTR -n user.xat "$get_file" 2>&1 | t_filter_fs
t_trigger_show block_remove_stale "after" $cl
t_counter_diff block_cache_remove_stale $old $cl
t_counter_diff_changed block_cache_remove_stale $old_set $set_nr
t_counter_diff_changed block_cache_remove_stale $old_get $get_nr
set_nr="$get_nr"
get_nr=$(inc_wrap_fs_nr $set_nr)
done
t_pass

View File

@@ -361,12 +361,12 @@ static int do_mkfs(struct mkfs_args *args)
struct scoutfs_quorum_slot *sl = &super->qconf.slots[i];
struct in_addr in;
if (sl->addr.addr == 0)
if (sl->addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
in.s_addr = htonl(le32_to_cpu(sl->addr.addr));
in.s_addr = htonl(le32_to_cpu(sl->addr.v4.addr));
printf("%s%u: %s:%u", indent,
i, inet_ntoa(in), le16_to_cpu(sl->addr.port));
i, inet_ntoa(in), le16_to_cpu(sl->addr.v4.port));
indent = "\n ";
}
printf("\n");
@@ -395,22 +395,28 @@ static bool valid_quorum_slots(struct scoutfs_quorum_slot *slots)
int j;
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (slots[i].addr.addr == 0)
if (slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_NONE))
continue;
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4)) {
fprintf(stderr, "quorum slot nr %u has invalid family %u\n",
i, le16_to_cpu(slots[i].addr.v4.family));
valid = false;
}
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
if (slots[j].addr.addr == 0)
if (slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
if (slots[i].addr.addr == slots[j].addr.addr &&
slots[i].addr.port == slots[j].addr.port) {
if (slots[i].addr.v4.addr == slots[j].addr.v4.addr &&
slots[i].addr.v4.port == slots[j].addr.v4.port) {
in.s_addr =
htonl(le32_to_cpu(slots[i].addr.addr));
htonl(le32_to_cpu(slots[i].addr.v4.addr));
addr = inet_ntoa(in);
fprintf(stderr, "quorum slot nr %u and %u have the same address %s:%u\n",
i, j, addr,
le16_to_cpu(slots[i].addr.port));
le16_to_cpu(slots[i].addr.v4.port));
valid = false;
}
}
@@ -430,7 +436,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
ret = parse_quorum_slot(&slot, arg);
if (ret < 0)
return ret;
if (args->slots[ret].addr.addr != 0)
if (args->slots[ret].addr.v4.family != cpu_to_le16(SCOUTFS_AF_NONE))
argp_error(state, "Quorum slot %u already specified before slot '%s'\n",
ret, arg);
args->slots[ret] = slot;

View File

@@ -213,7 +213,8 @@ int parse_quorum_slot(struct scoutfs_quorum_slot *slot, char *arg)
return -EINVAL;
}
slot->addr.addr = cpu_to_le32(htonl(in.s_addr));
slot->addr.port = cpu_to_le16(port);
slot->addr.v4.family = cpu_to_le16(SCOUTFS_AF_IPV4);
slot->addr.v4.addr = cpu_to_le32(htonl(in.s_addr));
slot->addr.v4.port = cpu_to_le16(port);
return nr;
}

View File

@@ -769,7 +769,7 @@ static int print_btree_leaf_items(int fd, struct scoutfs_super_block *super,
return 0;
}
static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
static char *alloc_addr_str(union scoutfs_inet_addr *ia)
{
struct in_addr addr;
char *quad;
@@ -777,12 +777,12 @@ static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
int len;
memset(&addr, 0, sizeof(addr));
addr.s_addr = htonl(le32_to_cpu(ia->addr));
addr.s_addr = htonl(le32_to_cpu(ia->v4.addr));
quad = inet_ntoa(addr);
if (quad == NULL)
return NULL;
len = snprintf(NULL, 0, "%s:%u", quad, le16_to_cpu(ia->port));
len = snprintf(NULL, 0, "%s:%u", quad, le16_to_cpu(ia->v4.port));
if (len < 1 || len > 22)
return NULL;
@@ -791,7 +791,7 @@ static char *alloc_addr_str(struct scoutfs_inet_addr *ia)
if (!str)
return NULL;
snprintf(str, len, "%s:%u", quad, le16_to_cpu(ia->port));
snprintf(str, len, "%s:%u", quad, le16_to_cpu(ia->v4.port));
return str;
}
@@ -915,8 +915,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
printf(" quorum config version %llu\n",
le64_to_cpu(super->qconf.version));
for (i = 0; i < array_size(super->qconf.slots); i++) {
if (!super->qconf.slots[i].addr.addr &&
!super->qconf.slots[i].addr.port)
if (super->qconf.slots[i].addr.v4.family != cpu_to_le16(SCOUTFS_AF_IPV4))
continue;
addr = alloc_addr_str(&super->qconf.slots[i].addr);