Disable mount-unmount-race test

The mount-unmount-race test is occasionally hanging. Disable it while we debug it so that we keep test coverage for unrelated work. Signed-off-by: Zach Brown <zab@versity.com>
Merge pull request #14 from agrover/fix-jira-202
2026-06-09 21:22:36 +00:00 · 2021-02-01 10:07:47 -08:00 · 2021-02-01 09:46:01 -08:00 · 2021-02-01 09:24:59 -08:00 · 2021-01-29 09:30:57 -08:00 · 2021-01-26 16:07:05 -08:00
34 changed files with 391 additions and 762 deletions
@@ -31,15 +31,9 @@ functionality hasn't been implemented.  It's appropriate for early
 adopters and interested developers, not for production use.

 In that vein, expect significant incompatible changes to both the format
-of network messages and persistent structures.  To avoid mistakes the
-implementation currently calculates a hash of the format and ioctl
-header files in the source tree.  The kernel module will refuse to mount
-a volume created by userspace utilities with a mismatched hash, and it
-will refuse to connect to a remote node with a mismatched hash.  This
-means having to unmount, mkfs, and remount everything across many
-functional changes.  Once the format is nailed down we'll wire up
-forward and back compat machinery and remove this temporary safety
-measure. 
+of network messages and persistent structures.  Now that format hash-checking
+has been removed in preparation for release, re-running mkfs is strongly
+recommended whenever there is any doubt about format compatibility.

 The current kernel module is developed against the RHEL/CentOS 7.x
 kernel to minimize the friction of developing and testing with partners'
@@ -16,11 +16,7 @@ SCOUTFS_GIT_DESCRIBE := \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)

-SCOUTFS_FORMAT_HASH := \
-	$(shell cat src/format.h src/ioctl.h | md5sum | cut -b1-16)
-
 SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
-		SCOUTFS_FORMAT_HASH=$(SCOUTFS_FORMAT_HASH) \
 		CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
 		EXTRA_CFLAGS="-Werror"

@@ -1,7 +1,6 @@
 obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o

-CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\" \
-		 -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
+CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\"

 CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include

@@ -770,8 +770,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	ret = 0;
 out:
 	if (ret < 0) {
+		/*
+		 * -ENOBUFS is a special return value meaning there wasn't
+		 * space to allocate from in this transaction.  It doesn't mean
+		 * the filesystem is completely full; callers may want to retry.
+		 */
 		if (ret == -ENOENT)
-			ret = -ENOSPC;
+			ret = -ENOBUFS;
 		*blkno_ret = 0;
 		*count_ret = 0;
 	} else {
@@ -121,16 +121,14 @@ int scoutfs_client_get_roots(struct super_block *sb,
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	__le64 before = cpu_to_le64p(seq);
-	__le64 after;
+	__le64 leseq;
 	int ret;

 	ret = scoutfs_net_sync_request(sb, client->conn,
 				       SCOUTFS_NET_CMD_ADVANCE_SEQ,
-				       &before, sizeof(before),
-				       &after, sizeof(after));
+				       NULL, 0, &leseq, sizeof(leseq));
 	if (ret == 0)
-		*seq = le64_to_cpu(after);
+		*seq = le64_to_cpu(leseq);

 	return ret;
 }
@@ -282,10 +280,10 @@ static int client_greeting(struct super_block *sb,
 		goto out;
 	}

-	if (gr->format_hash != super->format_hash) {
+	if (gr->version != super->version) {
 		scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
-			     le64_to_cpu(gr->format_hash),
-			     le64_to_cpu(super->format_hash));
+			     le64_to_cpu(gr->version),
+			     le64_to_cpu(super->version));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -394,7 +392,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)

 	/* send a greeting to verify endpoints of each connection */
 	greet.fsid = super->hdr.fsid;
-	greet.format_hash = super->format_hash;
+	greet.version = super->version;
 	greet.server_term = cpu_to_le64(client->server_term);
 	greet.unmount_barrier = cpu_to_le64(client->greeting_umb);
 	greet.rid = cpu_to_le64(sbi->rid);
@@ -1,315 +0,0 @@
-#ifndef _SCOUTFS_COUNT_H_
-#define _SCOUTFS_COUNT_H_
-
-/*
- * Our estimate of the space consumed while dirtying items is based on
- * the number of items and the size of their values.
- *
- * The estimate is still a read-only input to entering the transaction.
- * We'd like to use it as a clean rhs arg to hold_trans.  We define SIC_
- * functions which return the count struct.  This lets us have a single
- * arg and avoid bugs in initializing and passing in struct pointers
- * from callers.  The internal __count functions are used compose an
- * estimate out of the sets of items it manipulates.  We program in much
- * clearer C instead of in the preprocessor.
- *
- * Compilers are able to collapse the inlines into constants for the
- * constant estimates.
- */
-
-struct scoutfs_item_count {
-	signed items;
-	signed vals;
-};
-
-/* The caller knows exactly what they're doing. */
-static inline const struct scoutfs_item_count SIC_EXACT(signed items,
-							signed vals)
-{
-	struct scoutfs_item_count cnt = {
-		.items = items,
-		.vals = vals,
-	};
-
-	return cnt;
-}
-
-/*
- * Allocating an inode creates a new set of indexed items.
- */
-static inline void __count_alloc_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-/*
- * Dirtying an inode dirties the inode item and can delete and create
- * the full set of indexed items.
- */
-static inline void __count_dirty_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = 2 * SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-static inline const struct scoutfs_item_count SIC_ALLOC_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_alloc_inode(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_DIRTY_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Directory entries are stored in three items.
- */
-static inline void __count_dirents(struct scoutfs_item_count *cnt,
-				   unsigned name_len)
-{
-	cnt->items += 3;
-	cnt->vals += 3 * offsetof(struct scoutfs_dirent, name[name_len]);
-}
-
-static inline void __count_sym_target(struct scoutfs_item_count *cnt,
-				      unsigned size)
-{
-	unsigned nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE);
-
-	cnt->items += nr;
-	cnt->vals += size;
-}
-
-static inline void __count_orphan(struct scoutfs_item_count *cnt)
-{
-
-	cnt->items += 1;
-}
-
-static inline void __count_mknod(struct scoutfs_item_count *cnt,
-				 unsigned name_len)
-{
-	__count_alloc_inode(cnt);
-	__count_dirents(cnt, name_len);
-	__count_dirty_inode(cnt);
-}
-
-static inline const struct scoutfs_item_count SIC_MKNOD(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-
-	return cnt;
-}
-
-/*
- * Dropping the inode deletes all its items.  Potentially enormous numbers
- * of items (data mapping, xattrs) are deleted in their own transactions.
- */
-static inline const struct scoutfs_item_count SIC_DROP_INODE(int mode,
-							     u64 size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	if (S_ISLNK(mode))
-		__count_sym_target(&cnt, size);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	cnt.vals = 0;
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_LINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Unlink can add orphan items.
- */
-static inline const struct scoutfs_item_count SIC_UNLINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_SYMLINK(unsigned name_len,
-							  unsigned size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-	__count_sym_target(&cnt, size);
-
-	return cnt;
-}
-
-/*
- * This assumes the worst case of a rename between directories that
- * unlinks an existing target.  That'll be worse than the common case
- * by a few hundred bytes.
- */
-static inline const struct scoutfs_item_count SIC_RENAME(unsigned old_len,
-							 unsigned new_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	/* dirty dirs and inodes */
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	/* unlink old and new, link new */
-	__count_dirents(&cnt, old_len);
-	__count_dirents(&cnt, new_len);
-	__count_dirents(&cnt, new_len);
-
-	/* orphan the existing target */
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-/*
- * Creating an xattr results in a dirty set of items with values that
- * store the xattr header, name, and value.  There's always at least one
- * item with the header and name.  Any previously existing items are
- * deleted which dirties their key but removes their value.  The two
- * sets of items are indexed by different ids so their items don't
- * overlap.
- */
-static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
-							    bool creating,
-							    unsigned name_len,
-							    unsigned size)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int new_parts;
-
-	__count_dirty_inode(&cnt);
-
-	if (old_parts)
-		cnt.items += old_parts;
-
-	if (creating) {
-		new_parts = SCOUTFS_XATTR_NR_PARTS(name_len, size);
-
-		cnt.items += new_parts;
-		cnt.vals += sizeof(struct scoutfs_xattr) + name_len + size;
-	}
-
-	return cnt;
-}
-
-/*
- * write_begin can have to allocate all the blocks in the page and can
- * have to add a big allocation from the server to do so:
- *  - merge added free extents from the server
- *  - remove a free extent per block
- *  - remove an offline extent for every other block
- *  - add a file extent per block
- */
-static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned nr_free = (1 + SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
-	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCK_SM_PER_PAGE, 2) +
-			    SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Truncating an extent can:
- *  - delete existing file extent,
- *  - create two surrounding file extents,
- *  - add an offline file extent,
- *  - delete two existing free extents
- *  - create a merged free extent
- */
-static inline const struct scoutfs_item_count
-SIC_TRUNC_EXTENT(struct inode *inode)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_file = 1 + 2 + 1;
-	unsigned int nr_free = (2 + 1) * 2;
-
-	if (inode)
-		__count_dirty_inode(&cnt);
-
-	cnt.items += nr_file + nr_free;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Fallocating an extent can, at most:
- *  - allocate from the server: delete two free and insert merged
- *  - free an allocated extent: delete one and create two split
- *  - remove an unallocated file extent: delete one and create two split
- *  - add an fallocated flie extent: delete two and inset one merged
- */
-static inline const struct scoutfs_item_count SIC_FALLOCATE_ONE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_free = ((1 + 2) * 2) * 2;
-	unsigned int nr_file = (1 + 2) * 2;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * ioc_setattr_more can dirty the inode and add a single offline extent.
- */
-static inline const struct scoutfs_item_count SIC_SETATTR_MORE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items++;
-
-	return cnt;
-}
-
-#endif
@@ -58,6 +58,8 @@
 	EXPAND_COUNTER(corrupt_symlink_inode_size)		\
 	EXPAND_COUNTER(corrupt_symlink_missing_item)		\
 	EXPAND_COUNTER(corrupt_symlink_not_null_term)		\
+	EXPAND_COUNTER(data_fallocate_enobufs_retry)		\
+	EXPAND_COUNTER(data_write_begin_enobufs_retry)		\
 	EXPAND_COUNTER(dentry_revalidate_error)			\
 	EXPAND_COUNTER(dentry_revalidate_invalid)		\
 	EXPAND_COUNTER(dentry_revalidate_locked)		\
@@ -37,7 +37,6 @@
 #include "lock.h"
 #include "file.h"
 #include "msg.h"
-#include "count.h"
 #include "ext.h"
 #include "util.h"

@@ -291,7 +290,6 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 				u64 ino, u64 iblock, u64 last, bool offline,
 				struct scoutfs_lock *lock)
 {
-	struct scoutfs_item_count cnt = SIC_TRUNC_EXTENT(inode);
 	struct scoutfs_inode_info *si = NULL;
 	LIST_HEAD(ind_locks);
 	s64 ret = 0;
@@ -315,9 +313,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 	while (iblock <= last) {
 		if (inode)
 			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
-							    true, cnt);
+							    true);
 		else
-			ret = scoutfs_hold_trans(sb, cnt);
+			ret = scoutfs_hold_trans(sb);
 		if (ret)
 			break;

@@ -753,13 +751,13 @@ static int scoutfs_write_begin(struct file *file,
 		goto out;
 	}

+retry:
 	do {
 		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
 						  true) ?:
 		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
-							ind_seq,
-							SIC_WRITE_BEGIN());
+							ind_seq);
 	} while (ret > 0);
 	if (ret < 0)
 		goto out;
@@ -768,17 +766,22 @@ static int scoutfs_write_begin(struct file *file,
 	flags |= AOP_FLAG_NOFS;

 	/* generic write_end updates i_size and calls dirty_inode */
-	ret = scoutfs_dirty_inode_item(inode, wbd->lock);
-	if (ret == 0)
-		ret = block_write_begin(mapping, pos, len, flags, pagep,
-					scoutfs_get_block_write);
-	if (ret)
+	ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
+	      block_write_begin(mapping, pos, len, flags, pagep,
+				scoutfs_get_block_write);
+	if (ret < 0) {
 		scoutfs_release_trans(sb);
-out:
-	if (ret) {
 		scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
-		kfree(wbd);
+		if (ret == -ENOBUFS) {
+			/* Retry with a new transaction. */
+			scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
+			goto retry;
+		}
 	}
+
+out:
+	if (ret < 0)
+		kfree(wbd);
        return ret;
 }

@@ -1007,8 +1010,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

 	while(iblock <= last) {

-		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-						    SIC_FALLOCATE_ONE());
+		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 		if (ret)
 			goto out;

@@ -1026,6 +1028,12 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		scoutfs_release_trans(sb);
 		scoutfs_inode_index_unlock(sb, &ind_locks);

+		/* txn couldn't meet the request. Let's try with a new txn */
+		if (ret == -ENOBUFS) {
+			scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
+			continue;
+		}
+
 		if (ret <= 0)
 			goto out;

@@ -1078,8 +1086,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	}

 	/* we're updating meta_seq with offline block count */
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_SETATTR_MORE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret < 0)
 		goto out;

@@ -1224,8 +1231,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq,
-							SIC_EXACT(1, 1));
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
 		if (ret > 0)
 			continue;
 		if (ret < 0)
@@ -463,7 +463,18 @@ out:
 	else
 		inode = scoutfs_iget(sb, ino);

-	return d_splice_alias(inode, dentry);
+	/*
+	 * We can't splice dir aliases into the dcache.  Dir entries
+	 * might have changed on other nodes, so our dcache could still
+	 * contain stale aliases that were never moved by a local rename.
+	 * For dirs, we use d_materialise_unique to remove any existing
+	 * aliases, which must be stale.  Our inode numbers aren't reused
+	 * so inodes pointed to by entries can't change types.
+	 */
+	if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode))
+		return d_materialise_unique(dentry, inode);
+	else
+		return d_splice_alias(inode, dentry);
 }

 /*
@@ -655,7 +666,6 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 */
 static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      umode_t mode, dev_t rdev,
-				      const struct scoutfs_item_count cnt,
 				      struct scoutfs_lock **dir_lock,
 				      struct scoutfs_lock **inode_lock,
 				      struct list_head *ind_locks)
@@ -694,7 +704,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, cnt);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -741,7 +751,6 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
 	inode = lock_hold_create(dir, dentry, mode, rdev,
-				 SIC_MKNOD(dentry->d_name.len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -836,8 +845,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_LINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -918,8 +926,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_UNLINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1154,7 +1161,6 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 		return ret;

 	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-				 SIC_SYMLINK(dentry->d_name.len, name_len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -1586,9 +1592,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-					    SIC_RENAME(old_dentry->d_name.len,
-						       new_dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1,6 +1,9 @@
 #ifndef _SCOUTFS_FORMAT_H_
 #define _SCOUTFS_FORMAT_H_

+#define SCOUTFS_INTEROP_VERSION		0ULL
+#define SCOUTFS_INTEROP_VERSION_STR	__stringify(0)
+
 /* statfs(2) f_type */
 #define SCOUTFS_SUPER_MAGIC	0x554f4353		/* "SCOU" */

@@ -173,19 +176,6 @@ struct scoutfs_key {
 #define skfl_neglen	_sk_second
 #define skfl_blkno	_sk_third

-struct scoutfs_radix_block {
-	struct scoutfs_block_header hdr;
-	union {
-		struct scoutfs_radix_ref {
-			__le64 blkno;
-			__le64 seq;
-			__le64 sm_total;
-			__le64 lg_total;
-		} refs[0];
-		__le64 bits[0];
-	};
-};
-
 struct scoutfs_avl_root {
 	__le16 node;
 };
@@ -596,7 +586,7 @@ struct scoutfs_quorum_block {
 struct scoutfs_super_block {
 	struct scoutfs_block_header hdr;
 	__le64 id;
-	__le64 format_hash;
+	__le64 version;
 	__le64 flags;
 	__u8 uuid[SCOUTFS_UUID_BYTES];
 	__le64 next_ino;
@@ -759,7 +749,7 @@ enum scoutfs_dentry_type {
 */
 struct scoutfs_net_greeting {
 	__le64 fsid;
-	__le64 format_hash;
+	__le64 version;
 	__le64 server_term;
 	__le64 unmount_barrier;
 	__le64 rid;
@@ -343,8 +343,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
 	if (ret)
 		return ret;

@@ -371,8 +370,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 	LIST_HEAD(ind_locks);
 	int ret;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		return ret;

@@ -487,8 +485,7 @@ retry:
 		}
 	}

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		goto out;

@@ -1189,8 +1186,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
 * Returns > 0 if the seq changed and the locks should be retried.
 */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt)
+				      struct list_head *list, u64 seq)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1206,7 +1202,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 			goto out;
 	}

-	ret = scoutfs_hold_trans(sb, cnt);
+	ret = scoutfs_hold_trans(sb);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1220,8 +1216,7 @@ out:
 }

 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt)
+				  bool set_data_seq)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1231,7 +1226,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode,
 						  set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq, cnt);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
 	} while (ret > 0);

 	return ret;
@@ -1499,8 +1494,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_DROP_INODE(mode, size));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -4,7 +4,6 @@
 #include "key.h"
 #include "lock.h"
 #include "per_task.h"
-#include "count.h"
 #include "format.h"
 #include "data.h"

@@ -83,11 +82,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt);
+				      struct list_head *list, u64 seq);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt);
+				  bool set_data_seq);
 void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -674,8 +674,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq,
-					    SIC_SETATTR_MORE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
 	if (ret)
 		goto unlock;

@@ -1339,7 +1339,10 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 		/* split needs multiple items, sparse may not have enough */
 		if (!left)
 			return -ENOMEM;
+
 		compact_page_items(sb, pg, left);
+		found = item_rbtree_walk(&pg->item_root, key, NULL, &par,
+					 &pnode);
 	}

 	item = alloc_item(pg, key, liv, val, val_len);
@@ -1491,6 +1494,8 @@ retry:
 			rbtree_erase(&rd->node, &root);
 			rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
 			lru_accessed(sb, cinf, rd);
+			trace_scoutfs_item_read_page(sb, key, &rd->start,
+						     &rd->end);
 			continue;
 		}

@@ -2342,6 +2347,8 @@ retry:
 		write_lock(&pg->rwlock);

 		pgi = trim_page_intersection(sb, cinf, pg, right, start, end);
+		trace_scoutfs_item_invalidate_page(sb, start, end,
+						   &pg->start, &pg->end, pgi);
 		BUG_ON(pgi == PGI_DISJOINT); /* walk wouldn't ret disjoint */

 		if (pgi == PGI_INSIDE) {
@@ -2364,9 +2371,9 @@ retry:
 			/* inv was entirely inside page, done after bisect */
 			write_trylock_will_succeed(&right->rwlock);
 			rbtree_insert(&right->node, par, pnode, &cinf->pg_root);
+			lru_accessed(sb, cinf, right);
 			write_unlock(&right->rwlock);
 			write_unlock(&pg->rwlock);
-			lru_accessed(sb, cinf, right);
 			right = NULL;
 			break;
 		}
@@ -2396,7 +2403,6 @@ static int item_lru_shrink(struct shrinker *shrink,
 	struct active_reader *active;
 	struct cached_page *tmp;
 	struct cached_page *pg;
-	LIST_HEAD(list);
 	int nr;

 	if (sc->nr_to_scan == 0)
@@ -2433,21 +2439,17 @@ static int item_lru_shrink(struct shrinker *shrink,

 		__lru_remove(sb, cinf, pg);
 		rbtree_erase(&pg->node, &cinf->pg_root);
-		list_move_tail(&pg->lru_head, &list);
 		invalidate_pcpu_page(pg);
 		write_unlock(&pg->rwlock);

+		put_pg(sb, pg);
+
 		if (--nr == 0)
 			break;
 	}

 	write_unlock(&cinf->rwlock);
 	spin_unlock(&cinf->lru_lock);
-
-	list_for_each_entry_safe(pg, tmp, &list, lru_head) {
-		list_del_init(&pg->lru_head);
-		put_pg(sb, pg);
-	}
 out:
 	return min_t(unsigned long, cinf->lru_pages, INT_MAX);
 }
@@ -65,7 +65,7 @@
 * relative to that lock state we resend.
 */

-#define GRACE_PERIOD_KT	ms_to_ktime(2)
+#define GRACE_PERIOD_KT	ms_to_ktime(10)

 /*
 * allocated per-super, freed on unmount.
@@ -770,16 +770,6 @@ static void lock_invalidate_worker(struct work_struct *work)
 	list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) {
 		nl = &lock->inv_nl;

-		/* skip if grace hasn't elapsed, record earliest */
-		deadline = lock->grace_deadline;
-		if (ktime_before(now, deadline)) {
-			delay = min(delay,
-				    nsecs_to_jiffies(ktime_to_ns(
-						ktime_sub(deadline, now))));
-			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
-			continue;
-		}
-
 		/* wait for reordered grant to finish */
 		if (lock->mode != nl->old_mode)
 			continue;
@@ -788,6 +778,15 @@ static void lock_invalidate_worker(struct work_struct *work)
 		if (!lock_counts_match(nl->new_mode, lock->users))
 			continue;

+		/* skip if grace hasn't elapsed, record earliest */
+		deadline = lock->grace_deadline;
+		if (!linfo->shutdown && ktime_before(now, deadline)) {
+			delay = min(delay,
+				    nsecs_to_jiffies(ktime_to_ns(
+						ktime_sub(deadline, now))));
+			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
+			continue;
+		}
 		/* set the new mode, no incompatible users during inval */
 		lock->mode = nl->new_mode;

@@ -31,7 +31,6 @@
 #include "lock.h"
 #include "super.h"
 #include "ioctl.h"
-#include "count.h"
 #include "export.h"
 #include "dir.h"
 #include "server.h"
@@ -426,133 +425,59 @@ TRACE_EVENT(scoutfs_trans_write_func,

 TRACE_EVENT(scoutfs_release_trans,
 	TP_PROTO(struct super_block *sb, void *rsv, unsigned int rsv_holders,
-		 struct scoutfs_item_count *res,
-		 struct scoutfs_item_count *act, unsigned int tri_holders,
-		 unsigned int tri_writing, unsigned int tri_items,
-		 unsigned int tri_vals),
+		 unsigned int tri_holders,
+		 unsigned int tri_writing),

-	TP_ARGS(sb, rsv, rsv_holders, res, act, tri_holders, tri_writing,
-		tri_items, tri_vals),
+	TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(void *, rsv)
 		__field(unsigned int, rsv_holders)
-		__field(int, res_items)
-		__field(int, res_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
 		__field(unsigned int, tri_holders)
 		__field(unsigned int, tri_writing)
-		__field(unsigned int, tri_items)
-		__field(unsigned int, tri_vals)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->rsv = rsv;
 		__entry->rsv_holders = rsv_holders;
-		__entry->res_items = res->items;
-		__entry->res_vals = res->vals;
-		__entry->act_items = act->items;
-		__entry->act_vals = act->vals;
 		__entry->tri_holders = tri_holders;
 		__entry->tri_writing = tri_writing;
-		__entry->tri_items = tri_items;
-		__entry->tri_vals = tri_vals;
 	),

-	TP_printk(SCSBF" rsv %p holders %u reserved %u.%u actual "
-		  "%d.%d, trans holders %u writing %u reserved "
-		  "%u.%u", SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
-		  __entry->res_items, __entry->res_vals, __entry->act_items,
-		  __entry->act_vals, __entry->tri_holders, __entry->tri_writing,
-		  __entry->tri_items, __entry->tri_vals)
+	TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
+		  SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
+		  __entry->tri_holders, __entry->tri_writing)
 );

 TRACE_EVENT(scoutfs_trans_acquired_hold,
-	TP_PROTO(struct super_block *sb, const struct scoutfs_item_count *cnt,
+	TP_PROTO(struct super_block *sb,
 		 void *rsv, unsigned int rsv_holders,
-		 struct scoutfs_item_count *res,
-		 struct scoutfs_item_count *act, unsigned int tri_holders,
-		 unsigned int tri_writing, unsigned int tri_items,
-		 unsigned int tri_vals),
+		 unsigned int tri_holders,
+		 unsigned int tri_writing),

-	TP_ARGS(sb, cnt, rsv, rsv_holders, res, act, tri_holders, tri_writing,
-		tri_items, tri_vals),
+	TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(int, cnt_items)
-		__field(int, cnt_vals)
 		__field(void *, rsv)
 		__field(unsigned int, rsv_holders)
-		__field(int, res_items)
-		__field(int, res_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
 		__field(unsigned int, tri_holders)
 		__field(unsigned int, tri_writing)
-		__field(unsigned int, tri_items)
-		__field(unsigned int, tri_vals)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->cnt_items = cnt->items;
-		__entry->cnt_vals = cnt->vals;
 		__entry->rsv = rsv;
 		__entry->rsv_holders = rsv_holders;
-		__entry->res_items = res->items;
-		__entry->res_vals = res->vals;
-		__entry->act_items = act->items;
-		__entry->act_vals = act->vals;
 		__entry->tri_holders = tri_holders;
 		__entry->tri_writing = tri_writing;
-		__entry->tri_items = tri_items;
-		__entry->tri_vals = tri_vals;
 	),

-	TP_printk(SCSBF" cnt %u.%u, rsv %p holders %u reserved %u.%u "
-		  "actual %d.%d, trans holders %u writing %u reserved "
-		  "%u.%u", SCSB_TRACE_ARGS, __entry->cnt_items,
-		  __entry->cnt_vals, __entry->rsv, __entry->rsv_holders,
-		  __entry->res_items, __entry->res_vals, __entry->act_items,
-		  __entry->act_vals, __entry->tri_holders, __entry->tri_writing,
-		  __entry->tri_items, __entry->tri_vals)
-);
-
-TRACE_EVENT(scoutfs_trans_track_item,
-	TP_PROTO(struct super_block *sb, int delta_items, int delta_vals,
-		 int act_items, int act_vals, int res_items, int res_vals),
-
-	TP_ARGS(sb, delta_items, delta_vals, act_items, act_vals, res_items,
-		res_vals),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(int, delta_items)
-		__field(int, delta_vals)
-		__field(int, act_items)
-		__field(int, act_vals)
-		__field(int, res_items)
-		__field(int, res_vals)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->delta_items = delta_items;
-		__entry->delta_vals = delta_vals;
-		__entry->act_items = act_items;
-		__entry->act_vals = act_vals;
-		__entry->res_items = res_items;
-		__entry->res_vals = res_vals;
-	),
-
-	TP_printk(SCSBF" delta_items %d delta_vals %d act_items %d act_vals %d res_items %d res_vals %d",
-		  SCSB_TRACE_ARGS, __entry->delta_items, __entry->delta_vals,
-		  __entry->act_items, __entry->act_vals, __entry->res_items,
-		  __entry->res_vals)
+	TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
+		  SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
+		  __entry->tri_holders, __entry->tri_writing)
 );

 TRACE_EVENT(scoutfs_ioc_release,
@@ -2013,31 +1938,27 @@ DEFINE_EVENT(scoutfs_clock_sync_class, scoutfs_recv_clock_sync,
 );

 TRACE_EVENT(scoutfs_trans_seq_advance,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 prev_seq,
-		 u64 next_seq),
+	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

-	TP_ARGS(sb, rid, prev_seq, next_seq),
+	TP_ARGS(sb, rid, trans_seq),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(__u64, s_rid)
-		__field(__u64, prev_seq)
-		__field(__u64, next_seq)
+		__field(__u64, trans_seq)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->s_rid = rid;
-		__entry->prev_seq = prev_seq;
-		__entry->next_seq = next_seq;
+		__entry->trans_seq = trans_seq;
 	),

-	TP_printk(SCSBF" rid %016llx prev_seq %llu next_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->prev_seq,
-		  __entry->next_seq)
+	TP_printk(SCSBF" rid %016llx trans_seq %llu\n",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

-TRACE_EVENT(scoutfs_trans_seq_farewell,
+TRACE_EVENT(scoutfs_trans_seq_remove,
 	TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),

 	TP_ARGS(sb, rid, trans_seq),
@@ -2496,6 +2417,53 @@ TRACE_EVENT(scoutfs_alloc_move,
 		  __entry->ret)
 );

+TRACE_EVENT(scoutfs_item_read_page,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
+		 struct scoutfs_key *pg_start, struct scoutfs_key *pg_end),
+	TP_ARGS(sb, key, pg_start, pg_end),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		sk_trace_define(key)
+		sk_trace_define(pg_start)
+		sk_trace_define(pg_end)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(key, key);
+		sk_trace_assign(pg_start, pg_start);
+		sk_trace_assign(pg_end, pg_end);
+	),
+	TP_printk(SCSBF" key "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT,
+		  SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(pg_start),
+		  sk_trace_args(pg_end))
+);
+
+TRACE_EVENT(scoutfs_item_invalidate_page,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *start,
+		 struct scoutfs_key *end, struct scoutfs_key *pg_start,
+		 struct scoutfs_key *pg_end, int pgi),
+	TP_ARGS(sb, start, end, pg_start, pg_end, pgi),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		sk_trace_define(start)
+		sk_trace_define(end)
+		sk_trace_define(pg_start)
+		sk_trace_define(pg_end)
+		__field(int, pgi)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+		sk_trace_assign(pg_start, pg_start);
+		sk_trace_assign(pg_end, pg_end);
+		__entry->pgi = pgi;
+	),
+	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT" pgi %d",
+		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
+		  sk_trace_args(pg_start), sk_trace_args(pg_end), __entry->pgi)
+);
+
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
@@ -649,79 +649,10 @@ static void init_trans_seq_key(struct scoutfs_key *key, u64 seq, u64 rid)
 }

 /*
- * Give the client the next sequence number for their transaction.  They
- * provide their previous transaction sequence number that they've
- * committed.
- *
- * We track the sequence numbers of transactions that clients have open.
- * This limits the transaction sequence numbers that can be returned in
- * the index of inodes by meta and data transaction numbers.  We
- * communicate the largest possible sequence number to clients via an
- * rpc.
- *
- * The transaction sequence tracking is stored in a btree so it is
- * shared across servers.  Final entries are removed when processing a
- * client's farewell or when it's removed.
+ * Remove all trans_seq items owned by the client rid.  The caller holds
+ * the seq_rwsem.
 */
-static int server_advance_seq(struct super_block *sb,
-			      struct scoutfs_net_connection *conn,
-			      u8 cmd, u64 id, void *arg, u16 arg_len)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
-	__le64 their_seq;
-	__le64 next_seq;
-	u64 rid = scoutfs_net_client_rid(conn);
-	struct scoutfs_key key;
-	int ret;
-
-	if (arg_len != sizeof(__le64)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	memcpy(&their_seq, arg, sizeof(their_seq));
-
-	ret = scoutfs_server_hold_commit(sb);
-	if (ret)
-		goto out;
-
-	down_write(&server->seq_rwsem);
-
-	if (their_seq != 0) {
-		init_trans_seq_key(&key, le64_to_cpu(their_seq), rid);
-		ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
-					   &super->trans_seqs, &key);
-		if (ret < 0 && ret != -ENOENT)
-			goto unlock;
-	}
-
-	next_seq = super->next_trans_seq;
-	le64_add_cpu(&super->next_trans_seq, 1);
-
-	trace_scoutfs_trans_seq_advance(sb, rid, le64_to_cpu(their_seq),
-					le64_to_cpu(next_seq));
-
-	init_trans_seq_key(&key, le64_to_cpu(next_seq), rid);
-	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
-				   &super->trans_seqs, &key, NULL, 0);
-unlock:
-	up_write(&server->seq_rwsem);
-	ret = scoutfs_server_apply_commit(sb, ret);
-
-out:
-	return scoutfs_net_response(sb, conn, cmd, id, ret,
-				    &next_seq, sizeof(next_seq));
-}
-
-/*
- * Remove any transaction sequences owned by the client.  They must have
- * committed any final transaction by the time they get here via sending
- * their farewell message.  This can be called multiple times as the
- * client's farewell is retransmitted so it's OK to not find any
- * entries.  This is called with the server commit rwsem held.
- */
-static int remove_trans_seq(struct super_block *sb, u64 rid)
+static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
 {
 	DECLARE_SERVER_INFO(sb, server);
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -730,8 +661,6 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 	struct scoutfs_key key;
 	int ret = 0;

-	down_write(&server->seq_rwsem);
-
 	init_trans_seq_key(&key, 0, 0);

 	for (;;) {
@@ -746,17 +675,102 @@ static int remove_trans_seq(struct super_block *sb, u64 rid)
 		scoutfs_btree_put_iref(&iref);

 		if (le64_to_cpu(key.skts_rid) == rid) {
-			trace_scoutfs_trans_seq_farewell(sb, rid,
+			trace_scoutfs_trans_seq_remove(sb, rid,
 					le64_to_cpu(key.skts_trans_seq));
 			ret = scoutfs_btree_delete(sb, &server->alloc,
 						   &server->wri,
 						   &super->trans_seqs, &key);
-			break;
+			if (ret < 0)
+				break;
 		}

 		scoutfs_key_inc(&key);
 	}

+	return ret;
+}
+
+/*
+ * Give the client the next sequence number for the transaction that
+ * they're opening.
+ *
+ * We track the sequence numbers of transactions that clients have open.
+ * This limits the transaction sequence numbers that can be returned in
+ * the index of inodes by meta and data transaction numbers.  We
+ * communicate the largest possible sequence number to clients via an
+ * rpc.
+ *
+ * The transaction sequence tracking is stored in a btree so it is
+ * shared across servers.  Final entries are removed when processing a
+ * client's farewell or when it's removed.  We can be processing a
+ * resent request that was committed by a previous server before the
+ * reply was lost.  At this point the client has no transactions open
+ * and may or may not have just finished one.  To keep it simple we
+ * always remove any previous seq items, if there are any, and then
+ * insert a new item for the client at the next greatest seq.
+ */
+static int server_advance_seq(struct super_block *sb,
+			      struct scoutfs_net_connection *conn,
+			      u8 cmd, u64 id, void *arg, u16 arg_len)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_super_block *super = &sbi->super;
+	u64 rid = scoutfs_net_client_rid(conn);
+	struct scoutfs_key key;
+	__le64 leseq = 0;
+	u64 seq;
+	int ret;
+
+	if (arg_len != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_server_hold_commit(sb);
+	if (ret)
+		goto out;
+
+	down_write(&server->seq_rwsem);
+
+	ret = remove_trans_seq_locked(sb, rid);
+	if (ret < 0)
+		goto unlock;
+
+	seq = le64_to_cpu(super->next_trans_seq);
+	le64_add_cpu(&super->next_trans_seq, 1);
+
+	trace_scoutfs_trans_seq_advance(sb, rid, seq);
+
+	init_trans_seq_key(&key, seq, rid);
+	ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
+				   &super->trans_seqs, &key, NULL, 0);
+	if (ret == 0)
+		leseq = cpu_to_le64(seq);
+unlock:
+	up_write(&server->seq_rwsem);
+	ret = scoutfs_server_apply_commit(sb, ret);
+
+out:
+	return scoutfs_net_response(sb, conn, cmd, id, ret,
+				    &leseq, sizeof(leseq));
+}
+
+/*
+ * Remove any transaction sequences owned by the client who's sent a
+ * farewell.  They must have committed any final transaction by the time
+ * they get here via sending their farewell message.  This can be called
+ * multiple times as the client's farewell is retransmitted so it's OK
+ * to not find any entries.  This is called with the server commit rwsem
+ * held.
+ */
+static int remove_trans_seq(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	int ret = 0;
+
+	down_write(&server->seq_rwsem);
+	ret = remove_trans_seq_locked(sb, rid);
 	up_write(&server->seq_rwsem);

 	return ret;
@@ -1096,6 +1110,20 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
 	return ret;
 }

+/*
+ * Farewell processing is async to the request processing work.  Shutdown
+ * waits for request processing to finish and then tears down the connection.
+ * We don't want to queue farewell processing once we start shutting down
+ * so that we don't have farewell processing racing with the connection
+ * being shut down.  If a mount's farewell message is dropped by a server
+ * it will be processed by the next server.
+ */
+static void queue_farewell_work(struct server_info *server)
+{
+	if (!server->shutting_down)
+		queue_work(server->wq, &server->farewell_work);
+}
+
 /*
 * Process an incoming greeting request in the server from the client.
 * We try to send responses to failed greetings so that the sender can
@@ -1141,10 +1169,10 @@ static int server_greeting(struct super_block *sb,
 		goto send_err;
 	}

-	if (gr->format_hash != super->format_hash) {
+	if (gr->version != super->version) {
 		scoutfs_warn(sb, "client sent format 0x%llx, server has 0x%llx",
-			     le64_to_cpu(gr->format_hash),
-			     le64_to_cpu(super->format_hash));
+			     le64_to_cpu(gr->version),
+			     le64_to_cpu(super->version));
 		ret = -EINVAL;
 		goto send_err;
 	}
@@ -1173,7 +1201,7 @@ send_err:
 	err = ret;

 	greet.fsid = super->hdr.fsid;
-	greet.format_hash = super->format_hash;
+	greet.version = super->version;
 	greet.server_term = cpu_to_le64(server->term);
 	greet.unmount_barrier = umb;
 	greet.rid = gr->rid;
@@ -1400,8 +1428,8 @@ out:

 	if (ret < 0)
 		stop_server(server);
-	else if (more_reqs && !server->shutting_down)
-		queue_work(server->wq, &server->farewell_work);
+	else if (more_reqs)
+		queue_farewell_work(server);
 }

 static void free_farewell_requests(struct super_block *sb, u64 rid)
@@ -1455,7 +1483,7 @@ static int server_farewell(struct super_block *sb,
 	list_add_tail(&fw->entry, &server->farewell_requests);
 	mutex_unlock(&server->farewell_mutex);

-	queue_work(server->wq, &server->farewell_work);
+	queue_farewell_work(server);

 	/* response will be sent later */
 	return 0;
@@ -1618,12 +1646,17 @@ static void scoutfs_server_worker(struct work_struct *work)

 shutdown:
 	scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin));
-	/* wait for request processing */
+
+	/* wait for farewell to finish sending messages */
+	flush_work(&server->farewell_work);
+
+	/* wait for requests to finish, no more requests */
 	scoutfs_net_shutdown(sb, conn);
-	/* wait for commit queued by request processing */
-	flush_work(&server->commit_work);
 	server->conn = NULL;

+	/* wait for extra queues by requests, won't find waiters */
+	flush_work(&server->commit_work);
+
 	scoutfs_lock_server_destroy(sb);

 out:
@@ -1696,8 +1729,9 @@ void scoutfs_server_stop(struct super_block *sb)
 	DECLARE_SERVER_INFO(sb, server);

 	stop_server(server);
-	/* XXX not sure both are needed */
+
 	cancel_work_sync(&server->work);
+	cancel_work_sync(&server->farewell_work);
 	cancel_work_sync(&server->commit_work);
 }

@@ -1752,11 +1786,12 @@ void scoutfs_server_destroy(struct super_block *sb)

 		/* wait for server work to wait for everything to shut down */
 		cancel_work_sync(&server->work);
+		/* farewell work triggers commits */
+		cancel_work_sync(&server->farewell_work);
 		/* recv work/compaction could have left commit_work queued */
 		cancel_work_sync(&server->commit_work);

 		/* pending farewell requests are another server's problem */
-		cancel_work_sync(&server->farewell_work);
 		free_farewell_requests(sb, 0);

 		trace_scoutfs_server_workqueue_destroy(sb, 0, 0);
@@ -1198,14 +1198,10 @@ int scoutfs_srch_get_compact(struct super_block *sb,

 	for (;;scoutfs_key_inc(&key)) {
 		ret = scoutfs_btree_next(sb, root, &key, &iref);
-		if (ret == -ENOENT) {
-			ret = 0;
-			sc->nr = 0;
-			goto out;
-		}
-
 		if (ret == 0) {
-			if (iref.val_len == sizeof(struct scoutfs_srch_file)) {
+			if (iref.key->sk_type != type) {
+				ret = -ENOENT;
+			} else if (iref.val_len == sizeof(sfl)) {
 				key = *iref.key;
 				memcpy(&sfl, iref.val, iref.val_len);
 			} else {
@@ -1213,24 +1209,25 @@ int scoutfs_srch_get_compact(struct super_block *sb,
 			}
 			scoutfs_btree_put_iref(&iref);
 		}
-		if (ret < 0)
+		if (ret < 0) {
+			/* see if we ran out of log files or files entirely */
+			if (ret == -ENOENT) {
+				sc->nr = 0;
+				if (type == SCOUTFS_SRCH_LOG_TYPE) {
+					type = SCOUTFS_SRCH_BLOCKS_TYPE;
+					init_srch_key(&key, type, 0, 0);
+					continue;
+				} else {
+					ret = 0;
+				}
+			}
 			goto out;
+		}

 		/* skip any files already being compacted */
 		if (scoutfs_spbm_test(&busy, le64_to_cpu(sfl.ref.blkno)))
 			continue;

-		/* see if we ran out of log files or files entirely */
-		if (key.sk_type != type) {
-			sc->nr = 0;
-			if (key.sk_type == SCOUTFS_SRCH_BLOCKS_TYPE) {
-				type = SCOUTFS_SRCH_BLOCKS_TYPE;
-			} else {
-				ret = 0;
-				goto out;
-			}
-		}
-
 		/* reset if we iterated into the next size category */
 		if (type == SCOUTFS_SRCH_BLOCKS_TYPE) {
 			order = fls64(le64_to_cpu(sfl.blocks)) /
@@ -352,10 +352,10 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,
 	}


-	if (super->format_hash != cpu_to_le64(SCOUTFS_FORMAT_HASH)) {
-		scoutfs_err(sb, "super block has invalid format hash 0x%llx, expected 0x%llx",
-			    le64_to_cpu(super->format_hash),
-			    SCOUTFS_FORMAT_HASH);
+	if (super->version != cpu_to_le64(SCOUTFS_INTEROP_VERSION)) {
+		scoutfs_err(sb, "super block has invalid version %llu, expected %llu",
+			    le64_to_cpu(super->version),
+			    SCOUTFS_INTEROP_VERSION);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -682,6 +682,10 @@ static int __init scoutfs_module_init(void)
 		".section	.note.git_describe,\"a\"\n"
 		".string	\""SCOUTFS_GIT_DESCRIBE"\\n\"\n"
 		".previous\n");
+	__asm__ __volatile__ (
+		".section	.note.scoutfs_interop_version,\"a\"\n"
+		".string	\""SCOUTFS_INTEROP_VERSION_STR"\\n\"\n"
+		".previous\n");

 	scoutfs_init_counters();

@@ -714,3 +718,4 @@ module_exit(scoutfs_module_exit)
 MODULE_AUTHOR("Zach Brown <zab@versity.com>");
 MODULE_LICENSE("GPL");
 MODULE_INFO(git_describe, SCOUTFS_GIT_DESCRIBE);
+MODULE_INFO(scoutfs_interop_version, SCOUTFS_INTEROP_VERSION_STR);
@@ -60,8 +60,6 @@
 */
 struct trans_info {
 	spinlock_t lock;
-	unsigned reserved_items;
-	unsigned reserved_vals;
 	unsigned holders;
 	bool writing;

@@ -318,12 +316,11 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
 * Including nested holds avoids having to deal with writing out partial
 * transactions while a caller still holds the transaction.
 */
+
 #define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
 struct scoutfs_reservation {
 	unsigned magic;
 	unsigned holders;
-	struct scoutfs_item_count reserved;
-	struct scoutfs_item_count actual;
 };

 /*
@@ -340,22 +337,16 @@ struct scoutfs_reservation {
 * delaying or prematurely forcing commits.
 */
 static bool acquired_hold(struct super_block *sb,
-			  struct scoutfs_reservation *rsv,
-			  const struct scoutfs_item_count *cnt)
+			  struct scoutfs_reservation *rsv)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_TRANS_INFO(sb, tri);
 	bool acquired = false;
-	unsigned items;
-	unsigned vals;

 	spin_lock(&tri->lock);

-	trace_scoutfs_trans_acquired_hold(sb, cnt, rsv, rsv->holders,
-					  &rsv->reserved, &rsv->actual,
-					  tri->holders, tri->writing,
-					  tri->reserved_items,
-					  tri->reserved_vals);
+	trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
+					  tri->holders, tri->writing);

 	/* use a caller's existing reservation */
 	if (rsv->holders)
@@ -365,10 +356,6 @@ static bool acquired_hold(struct super_block *sb,
 	if (tri->writing)
 		goto out;

-	/* see if we can reserve space for our item count */
-	items = tri->reserved_items + cnt->items;
-	vals = tri->reserved_vals + cnt->vals;
-
 	/*
 	 * In theory each dirty item page could be straddling two full
 	 * blocks, requiring 4 allocations for each item cache page.
@@ -405,12 +392,6 @@ static bool acquired_hold(struct super_block *sb,
 		goto out;
 	}

-	tri->reserved_items = items;
-	tri->reserved_vals = vals;
-
-	rsv->reserved.items = cnt->items;
-	rsv->reserved.vals = cnt->vals;
-
 hold:
 	rsv->holders++;
 	tri->holders++;
@@ -423,20 +404,12 @@ out:
 	return acquired;
 }

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt)
+int scoutfs_hold_trans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_reservation *rsv;
 	int ret;

-	/*
-	 * Caller shouldn't provide garbage counts, nor counts that
-	 * can't fit in segments by themselves.
-	 */
-	if (WARN_ON_ONCE(cnt.items <= 0 || cnt.vals < 0))
-		return -EINVAL;
-
 	if (current == sbi->trans_task)
 		return 0;

@@ -453,7 +426,7 @@ int scoutfs_hold_trans(struct super_block *sb,
 	BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);

 	ret = wait_event_interruptible(sbi->trans_hold_wq,
-				       acquired_hold(sb, rsv, &cnt));
+				       acquired_hold(sb, rsv));
 	if (ret && rsv->holders == 0) {
 		current->journal_info = NULL;
 		kfree(rsv);
@@ -473,38 +446,6 @@ bool scoutfs_trans_held(void)
 	return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
 }

-/*
- * Record a transaction holder's individual contribution to the dirty
- * items in the current transaction.  We're making sure that the
- * reservation matches the possible item manipulations while they hold
- * the reservation.
- *
- * It is possible and legitimate for an individual contribution to be
- * negative if they delete dirty items.  The item cache makes sure that
- * the total dirty item count doesn't fall below zero.
- */
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_reservation *rsv = current->journal_info;
-
-	if (current == sbi->trans_task)
-		return;
-
-	BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
-
-	rsv->actual.items += items;
-	rsv->actual.vals += vals;
-
-	trace_scoutfs_trans_track_item(sb, items, vals, rsv->actual.items,
-				       rsv->actual.vals, rsv->reserved.items,
-				       rsv->reserved.vals);
-
-	WARN_ON_ONCE(rsv->actual.items > rsv->reserved.items);
-	WARN_ON_ONCE(rsv->actual.vals > rsv->reserved.vals);
-}
-
 /*
 * As we drop the last hold in the reservation we try and wake other
 * hold attempts that were waiting for space.  As we drop the last trans
@@ -526,16 +467,12 @@ void scoutfs_release_trans(struct super_block *sb)

 	spin_lock(&tri->lock);

-	trace_scoutfs_release_trans(sb, rsv, rsv->holders, &rsv->reserved,
-				    &rsv->actual, tri->holders, tri->writing,
-				    tri->reserved_items, tri->reserved_vals);
+	trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);

 	BUG_ON(rsv->holders <= 0);
 	BUG_ON(tri->holders <= 0);

 	if (--rsv->holders == 0) {
-		tri->reserved_items -= rsv->reserved.items;
-		tri->reserved_vals -= rsv->reserved.vals;
 		current->journal_info = NULL;
 		kfree(rsv);
 		wake = true;
@@ -6,21 +6,16 @@
 /* the client will force commits if data allocators get too low */
 #define SCOUTFS_TRANS_DATA_ALLOC_LWM	(256ULL * 1024 * 1024)

-#include "count.h"
-
 void scoutfs_trans_write_func(struct work_struct *work);
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt);
+int scoutfs_hold_trans(struct super_block *sb);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
 u64 scoutfs_trans_sample_seq(struct super_block *sb);
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals);

 int scoutfs_trans_get_log_trees(struct super_block *sb);
 bool scoutfs_trans_has_dirty(struct super_block *sb);
@@ -577,10 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_XATTR_SET(found_parts,
-							      value != NULL,
-							      name_len, size));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -781,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 					     &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));

-		ret = scoutfs_hold_trans(sb, SIC_EXACT(2, 0));
+		ret = scoutfs_hold_trans(sb);
 		if (ret < 0)
 			break;
 		release = true;
@@ -59,5 +59,8 @@ t_filter_dmesg()
 	# some tests mount w/o options
 	re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"

+	# in debugging kernels we can slow things down a bit
+	re="$re|hrtimer: interrupt took .*"
+
 	egrep -v "($re)" 
 }
@@ -23,3 +23,18 @@ t_require_mounts() {
 	test "$T_NR_MOUNTS" -ge "$req" || \
 		t_skip "$req mounts required, only have $T_NR_MOUNTS"
 }
+
+#
+# Require that the meta device be at least the size string argument, as
+# parsed by numfmt using single char base 2 suffixes (iec), e.g. 64G.
+#
+t_require_meta_size() {
+	local dev="$T_META_DEVICE"
+	local req_iec="$1"
+	local req_bytes=$(numfmt --from=iec --to=none $req_iec)
+	local dev_bytes=$(blockdev --getsize64 $dev)
+	local dev_iec=$(numfmt --from=auto --to=iec $dev_bytes)
+
+	test "$dev_bytes" -ge "$req_bytes" || \
+		t_skip "$dev must be at least $req_iec, is $dev_iec"
+}
@@ -0,0 +1,3 @@
+== create per mount files
+== 30s of racing random mount/umount
+== mounting any unmounted
@@ -53,7 +53,7 @@ $(basename $0) options:
    -m        | Run mkfs on the device before mounting and running
              | tests.  Implies unmounting existing mounts first.
    -n        | The number of devices and mounts to test.
-    -P        | Output trace events with printk as they're generated.
+    -P        | Enable trace_printk.
    -p        | Exit script after preparing mounts only, don't run tests.
    -q <nr>   | Specify the quorum count needed to mount.  This is
              | used when running mkfs and is needed by a few tests.
@@ -62,6 +62,7 @@ $(basename $0) options:
              | exist.  Previous results will be deleted as each test runs.
    -s        | Skip git repo checkouts.
    -t        | Enabled trace events that match the given glob argument.
+              | Multiple options enable multiple globbed events.
    -X        | xfstests git repo. Used by tests/xfstests.sh.
    -x        | xfstests git branch to checkout and track.
    -y        | xfstests ./check additional args
@@ -77,6 +78,9 @@ done
 T_TRACE_DUMP="0"
 T_TRACE_PRINTK="0"

+# array declarations to be able to use array ops
+declare -a T_TRACE_GLOB
+
 while true; do
 	case $1 in
 	-a)
@@ -147,7 +151,7 @@ while true; do
 		;;
 	-t)
 		test -n "$2" || die "-t must have trace glob argument"
-		T_TRACE_GLOB="$2"
+		T_TRACE_GLOB+=("$2")
 		shift
 		;;
 	-X)
@@ -314,23 +318,37 @@ if [ -n "$T_INSMOD" ]; then
 	cmd insmod "$T_KMOD/src/scoutfs.ko"
 fi

-if [ -n "$T_TRACE_GLOB" ]; then
-	msg "enabling trace events"
+nr_globs=${#T_TRACE_GLOB[@]}
+if [ $nr_globs -gt 0 ]; then
 	echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
-	for g in $T_TRACE_GLOB; do
+
+	for g in "${T_TRACE_GLOB[@]}"; do
 		for e in /sys/kernel/debug/tracing/events/scoutfs/$g/enable; do
-			echo 1 > $e
+			if test -w "$e"; then
+				echo 1 > "$e"
+			else
+				die "-t glob '$g' matched no scoutfs events"
+			fi
 		done
 	done

-	echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops
-	echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
-
-	cmd cat /sys/kernel/debug/tracing/set_event
-	cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
-		    /proc/sys/kernel/ftrace_dump_on_oops
+	nr_events=$(cat /sys/kernel/debug/tracing/set_event | wc -l)
+	msg "enabled $nr_events trace events from $nr_globs -t globs"
 fi

+if [ -n "$T_TRACE_PRINTK" ]; then
+	echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
+fi
+
+if [ -n "$T_TRACE_DUMP" ]; then
+	echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops
+fi
+
+# always describe tracing in the logs
+cmd cat /sys/kernel/debug/tracing/set_event
+cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
+	    /proc/sys/kernel/ftrace_dump_on_oops
+
 #
 # mount concurrently so that a quorum is present to elect the leader and
 # start a server.
@@ -434,7 +452,7 @@ for t in $tests; do

 	# get stats from previous pass
 	last="$T_RESULTS/last-passed-test-stats"
-	stats=$(grep -s "^$test_name" "$last" | cut -d " " -f 2-)
+	stats=$(grep -s "^$test_name " "$last" | cut -d " " -f 2-)
 	test -n "$stats" && stats="last: $stats"

 	printf "  %-30s $stats" "$test_name"
@@ -497,7 +515,7 @@ for t in $tests; do
 		echo "  passed: $stats"
 		((passed++))
 		# save stats for passed test
-		grep -s -v "^$test_name" "$last" > "$last.tmp"
+		grep -s -v "^$test_name " "$last" > "$last.tmp"
 		echo "$test_name $stats" >> "$last.tmp"
 		mv -f "$last.tmp" "$last"
 	elif [ "$sts" == "$T_SKIP_STATUS" ]; then
@@ -515,23 +533,24 @@ done

 msg "all tests run: $passed passed, $skipped skipped, $failed failed"

-unmount_all

-if [ -n "$T_TRACE_GLOB" ]; then
+if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then
 	msg "saving traces and disabling tracing"
 	echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
+	echo 0 > /sys/kernel/debug/tracing/options/trace_printk
 	cat /sys/kernel/debug/tracing/trace > "$T_RESULTS/traces"
 fi

 if [ "$skipped" == 0 -a "$failed" == 0 ]; then
 	msg "all tests passed"
+	unmount_all
 	exit 0
 fi

 if [ "$skipped" != 0 ]; then
-	msg "$skipped tests skipped, check skip.log"
+	msg "$skipped tests skipped, check skip.log, still mounted"
 fi
 if [ "$failed" != 0 ]; then
-	msg "$failed tests failed, check fail.log"
+	msg "$failed tests failed, check fail.log, still mounted"
 fi
 exit 1
@@ -25,7 +25,8 @@ lock-conflicting-batch-commit.sh
 cross-mount-data-free.sh
 persistent-item-vers.sh
 setup-error-teardown.sh
-mount-unmount-race.sh
+# failing in jenkins pr runners, zab's working on it
+#mount-unmount-race.sh
 createmany-parallel-mounts.sh
 archive-light-cycle.sh
 stale-btree-read.sh
@@ -160,8 +160,8 @@ for i in $(seq 1 1); do
 		mkdir -p $(dirname $lnk)
 		ln "$T_D0/file" $lnk

-		scoutfs ino-path $ino "$T_M0" > "$T_TMP.0"
-		scoutfs ino-path $ino "$T_M1" > "$T_TMP.1"
+		scoutfs ino-path -p "$T_M0" $ino > "$T_TMP.0"
+		scoutfs ino-path -p "$T_M1" $ino > "$T_TMP.1"
 		diff -u "$T_TMP.0" "$T_TMP.1"
 	done
 done
@@ -50,7 +50,7 @@ for m in 0 1; do
 done
 wait
 CONF="$((SECONDS - START))"
-echo "conf: $IND" >> $T_TMP.log
+echo "conf: $CONF" >> $T_TMP.log

 if [ "$CONF" -gt "$((IND * 5))" ]; then
 	t_fail "conflicting $CONF secs is more than 5x independent $IND secs"
@@ -83,7 +83,7 @@ generic/375	# utils output change?  update branch?
 EOF

 t_restore_output
-echo "(showing output of xfstests)"
+echo "  (showing output of xfstests)"

 args="-E local.exclude ${T_XFSTESTS_ARGS:--g quick}"
 ./check $args
@@ -1,23 +1,11 @@
 #
-# The userspace utils and kernel module share definitions of physical
-# structures and ioctls.  If we're in the repo we include the kmod
-# headers directly, and hash them directly to calculate the format hash.
-#
 # If we're creating a standalone tarball for distribution we copy the
 # headers out of the kmod dir into the tarball.  And then when we're
 # building in that tarball we use the headers in src/ directly.
 #
 FMTIOC_H := format.h ioctl.h
-FMTIOC_DIST := $(addprefix src/,$(FMTIOC_H))
 FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))

-ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
-HASH_FILES := $(FMTIOC_KMOD)
-else
-HASH_FILES := $(FMTIOC_DIST)
-endif
-SCOUTFS_FORMAT_HASH := $(shell cat $(HASH_FILES) | md5sum | cut -b1-16)
-
 CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
 	-fno-strict-aliasing \
 	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
@@ -205,7 +205,7 @@ static int do_mkfs(struct mkfs_args *args)
 	pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
 	super->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_SUPER);
 	super->hdr.seq = cpu_to_le64(1);
-	super->format_hash = cpu_to_le64(SCOUTFS_FORMAT_HASH);
+	super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
 	uuid_generate(super->uuid);
 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
 	super->next_trans_seq = cpu_to_le64(1);
@@ -352,7 +352,7 @@ static int do_mkfs(struct mkfs_args *args)
 	       "  meta device path:     %s\n"
 	       "  data device path:     %s\n"
 	       "  fsid:                 %llx\n"
-	       "  format hash:          %llx\n"
+	       "  version:              %llx\n"
 	       "  uuid:                 %s\n"
 	       "  64KB metadata blocks: "SIZE_FMT"\n"
 	       "  4KB data blocks:      "SIZE_FMT"\n"
@@ -360,7 +360,7 @@ static int do_mkfs(struct mkfs_args *args)
 		args->meta_device,
 	        args->data_device,
 		le64_to_cpu(super->hdr.fsid),
-		le64_to_cpu(super->format_hash),
+		le64_to_cpu(super->version),
 		uuid_str,
 		SIZE_ARGS(le64_to_cpu(super->total_meta_blocks),
 			  SCOUTFS_BLOCK_LG_SIZE),
@@ -860,8 +860,8 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)

 	printf("super blkno %llu\n", blkno);
 	print_block_header(&super->hdr, SCOUTFS_BLOCK_SM_SIZE);
-	printf("  format_hash %llx uuid %s\n",
-	       le64_to_cpu(super->format_hash), uuid_str);
+	printf("  version %llx uuid %s\n",
+	       le64_to_cpu(super->version), uuid_str);
 	printf("  flags: 0x%016llx\n", le64_to_cpu(super->flags));

 	server_addr = alloc_addr_str(&super->server_addr);
@@ -8,7 +8,6 @@
 #include <errno.h>
 #include <string.h>
 #include <limits.h>
-#include <assert.h>
 #include <argp.h>

 #include "sparse.h"
@@ -208,9 +207,6 @@ static int do_release(struct release_args *args)
 		return ret;
 	}

-	assert(args->offset % SCOUTFS_BLOCK_SM_SIZE == 0);
-	assert(args->length % SCOUTFS_BLOCK_SM_SIZE == 0);
-
 	ioctl_args.offset = args->offset;
 	ioctl_args.length = args->length;
 	ioctl_args.data_version = args->data_version;
Author	SHA1	Message	Date
Zach Brown	6ad18769cb	Disable mount-unmount-race test The mount-unmount-race test is occasionally hanging, disable it while we debug it and have test coverage for unrelated work. Signed-off-by: Zach Brown <zab@versity.com>	2021-02-01 10:07:47 -08:00
Zach Brown	49d82fcaaf	Merge pull request #14 from agrover/fix-jira-202 utils: Do not assert if release is given unaligned offset or length	2021-02-01 09:46:01 -08:00
Zach Brown	e4e12c1968	Merge pull request #15 from agrover/radix-block Remove unused radix_block struct	2021-02-01 09:24:59 -08:00
Andy Grover	15fd2ccc02	utils: Do not assert if release is given unaligned offset or length This is checked for by the kernel ioctl code, so giving unaligned values will return an error, instead of aborting with an assert. Signed-off-by: Andy Grover <agrover@versity.com>	2021-01-29 09:30:57 -08:00
Andy Grover	eea95357d3	Remove unused radix_block struct Signed-off-by: Andy Grover <agrover@versity.com>	2021-01-26 16:07:05 -08:00
Andy Grover	9842c5d13e	Merge pull request #13 from versity/zab/multi_mount_test_fixes Zab/multi mount test fixes	2021-01-26 15:56:33 -08:00
Zach Brown	ade539217e	Handle advance_seq being replayed in new server As a core principle, all server message processing needs to be safe to replay as servers shut down and requests are resent to new servers. The advance_seq handler got this wrong. It would only try to remove a trans_seq item for the seq sent by the client before inserting a new item for the next seq. This change could be committed before the reply was lost as the server shuts down. The next server would process the resent request but wouldn't find the old item for the seq that the client sent, and would ignore the new item that the previous server inserted. It would then insert another greater seq for the same client. This would leave behind a stale old trans_seq that would be returned as the last_seq which would forever limit the results that could be returned from the seq index walks. This fix is to always remove all previous seq items for the client before inserting a new one. This creates O(clients) server work, but it's minimal. This manifested as occasional simple-inode-index test failures (say 1 in 5?) which would trigger if the unmounts during previous tests would happen to have advance_seq resent across server shutdowns. With this change the test now reliably passes. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	5a90234c94	Use terminated test name when saving passed stats We've grown some test names that are prefixes of others (createmany-parallel, createmany-parallel-mounts). When we're searching for lines with the test name we have to search for the exact test name, by terminating the name with a space, instead of searching for a line that starts with the test name. This fixes strange output and saved passed stats for the names that share a prefix. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	f81e4cb98a	Add whitespace to xfstests output message The message indicating that xfstests output was now being shown was mashed up against the previous passed stats and it was gross and I hated it. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	1fc706bf3f	Filter hrtimer slow messages from dmesg When running in debug kernels in guests we can really bog down things enough to trigger hrtimer warnings. I don't think there's much we can reasonably do about that. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	e9c3aa6501	More carefully cancel server farewell work Farewell work is queued by farewell message processing. Server shutdown didn't properly wait for pending farewell work to finish before tearing down. As the server work destroyed the server's connection the farewell work could still be running and try to send responses down the socket. We make the server more carefully avoid queueing farewell work if it's in the process of shutting down and wait for farewell work to finish before destroying the server's resources. This fixed all manner of crashes that were seen in testing when a bunch of nodes unmounted, creating farewell work on the server as it itself unmounted and destroyed the server. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	d39268bbc1	Fix spurious EIO from scoutfs_srch_get_compact scoutfs_srch_get_compact() is building up a compaction request which has a list of srch files to read and sort and write into a new srch file. It finds input files by searching for a sufficient number of similar files: first any unsorted log files and then sorted log files that are around the same size. It finds the files by using btree next on the srch zone which has types for unsorted srch log files, sorted srch files, but also pending and busy compaction items. It was being far too cute about iterating over different key types. It was trying to adapt to finding the next key and was making assumptions about the order of key types. It didn't notice that the pending and busy key types followed log and sorted and would generate EIO when it ran into them and found their value length didn't match what it was expecting. Rework the next item ref parsing so that it returns -ENOENT if it gets an unexpected key type, then look for the next key type when checking enoent. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	35ed1a2438	Add t_require_meta_size function Add a function that tests can use to skip when the metadata device isn't large enough. I thought we needed to avoid enospc in a particular test, but it turns out the test's failure was unrelated. So this isn't used for now but it seems nice to keep around. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	32e7978a6e	Extend lock invalidate grace period The grace period is intended to let lock holders squeeze in more bulk work before another node pulls the lock out from under them. The length of the delay is a balance between getting more work done per lock hold and adding latency to ping-ponging workloads. The current grace period was too short. To do work in the conflicting case you often have to read the result that the other mount wrote as you invalidated their lock. The test was written in the LSM world where we'd effectively read a single level 0 1MB segment. In the btree world we're checking bloom blocks and reading the other mount's btree. It has more dependent read latency. So we turn up the grace period to let conflicting readers squeeze in more work before pulling the lock out from under them. This value was chosen to make lock-conflicting-batch-commit pass in guests sharing nvme metadata devices in debugging kernels. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	8123b8fc35	fix lock-conflicting-batch-commit conf output The test had a silly typo in the label it put on the time it took mounts to perform conflicting metadata changes. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	da5911c311	Use d_materialise_unique to splice dir dentries When we're splicing in dentries in lookup we can be splicing the result of changes on other nodes into a stale dcache. The stale dcache might contain dir entries and the dcache does not allow aliased directories. Use d_materialise_unique() to splice in dir inodes so that we remove all aliased dentries which must be stale. We can still use d_splice_alias() for all other inode types. Any existing stale dentries will fail revalidation before they're used. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	098fc420be	Add some item cache page tracing Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	7a96537210	Leave mounts mounted if run-tests fails We can lose interesting state if the mounts are unmounted as tests fail, only unmount if all the tests pass. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	0607dfdac8	Enable and collect trace_printk Weirdly, run-tests was treating trace_printk not as an option to enable trace_printk() traces but as an option to print trace events to the console with printk? That's not a thing. Make -P really enable trace_printk tracing and collect it as it would for enabled trace events. It needs to be treated separately from the -t options that enable trace events. While we're at it treat the -P trace dumping option as a stand-alone option that works without -t arguments. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	0354bb64c5	More carefully enable tracing in run-tests run-tests.sh has a -t argument which takes a whitespace separated string of globs of events to enable. This was hard to use and made it very easy to accidentally expand the globs at the wrong place in the script. This makes each -t argument specify a single word glob which is stored in an array so the glob isn't expanded until it's applied to the trace event path. We also add an error for -t globs that didn't match any events and add a message with the count of -t arguments and enabled events. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	631801c45c	Don't queue lock invalidation work during shutdown The lock invalidation work function needs to be careful not to requeue itself while we're shutting down or we can be left with invalidation functions racing with shutdown. Invalidation calls igrab so we can end up with unmount warning that there are still inodes in use. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:46:07 -08:00
Zach Brown	47a1ac92f7	Update ino-path args in basic-posix-consistency The ino-path calls in basic-posix-consistency weren't updated for the recent change to scoutfs cli args. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-26 14:45:23 -08:00
Zach Brown	004f693af3	Add golden output for mount-unmount-race test Signed-off-by: Zach Brown <zab@versity.com>	2021-01-25 14:19:35 -08:00
Andy Grover	f271a5d140	Merge pull request #12 from versity/zab/andys_fallocate_fix_minor_cleanup Retry if transaction cannot alloc for fallocate or write	2021-01-25 12:52:14 -08:00
Andy Grover	355eac79d2	Retry if transaction cannot alloc for fallocate or write Add a new distinguishable return value (ENOBUFS) from allocator for if the transaction cannot alloc space. This doesn't mean the filesystem is full -- opening a new transaction may result in forward progress. Alter fallocate and get_blocks code to check for this err val and retry with a new transaction. Handling actual ENOSPC can still happen, of course. Add counter called "alloc_trans_retry" and increment it from both spots. Signed-off-by: Andy Grover <agrover@versity.com> [zab@versity.com: fixed up write_begin error paths]	2021-01-25 09:32:01 -08:00
Zach Brown	d8b4e94854	Merge pull request #10 from agrover/rm-item-accounting Remove item accounting	2021-01-21 09:57:53 -08:00
Andy Grover	bed33c7ffd	Remove item accounting Remove kmod/src/count.h Remove scoutfs_trans_track_item() Remove reserved/actual fields from scoutfs_reservation Signed-off-by: Andy Grover <agrover@versity.com>	2021-01-20 17:01:08 -08:00
Andy Grover	b370730029	Merge pull request #11 from versity/zab/item_cache_memory_corruption Fix item cache page memory corruption	2021-01-20 10:27:20 -08:00
Zach Brown	d64dd89ead	Fix item cache page memory corruption The item cache page life cycle is tricky. There are no proper page reference counts, everything is done by nesting the page rwlock inside item_cache_info rwlock. The intent is that you can only reference pages while you hold the rwlocks appropriately. The per-cpu page references are outside that locking regime so they add a reference count. Now there are reference counts for the main cache index reference and for each per-cpu reference. The end result of all this is that you can only reference pages outside of locks if you're protected by references. Lock invalidation messed this up by trying to add its right split page to the lru after it was unlocked. Its page reference wasn't protected at this point. Shrinking could be freeing that page, and so it could be putting a freed page's memory back on the lru. Shrinking had a little bug that it was using list_move to move an initialized lru_head list_head. It turns out to be harmless (list_del will just follow pointers to itself and set itself as next and prev all over again), but boy does it catch one's eye. Let's remove all confusion and drop the reference while holding the cinf->rwlock instead of trying to optimize freeing outside locks. Finally, the big one: inserting a read item after compacting the page to make room was inserting into stale parent pointers into the old pre-compacted page, rather than the new page that was swapped in by compaction. This left references to a freed page in the page rbtree and hilarity ensued. Signed-off-by: Zach Brown <zab@versity.com>	2021-01-20 09:02:29 -08:00
Zach Brown	8d81196e01	Merge pull request #7 from agrover/versioning Filesystem version instead of format hash check	2021-01-19 11:55:32 -08:00
Andy Grover	d731c1577e	Filesystem version instead of format hash check Instead of hashing headers, define an interop version. Do not mount superblocks that have a different version, either higher or lower. Since this is pretty much the same as the format hash except it's a constant, minimal code changes are needed. Initial dev version is 0, with the intent that version will be bumped to 1 immediately prior to tagging initial release version. Update README. Fix comments. Add interop version to notes and modinfo. Signed-off-by: Andy Grover <agrover@versity.com>	2021-01-15 10:53:00 -08:00