Block cache use rht _lookup_ insert for EEXIST

The sneaky rhashtable_insert_fast() can't return -EEXIST despite the last line of the function *REALLY* making it look like it can. It just inserts new objects at the head of the bucket lists without comparing the insertion with existing objects. The block cache was relying on insertion to resolve duplicate racing allocated blocks. Because it couldn't return -EEXIST we could get duplicate cached blocks present in the hash table. rhashtable_lookup_insert_fast() fixes this by actually comparing the inserted objects key with the objects found in the insertion bucket. A racing allocator trying to insert a duplicate cached block will get an error, drop their allocated block, and retry their lookup. Signed-off-by: Zach Brown <zab@versity.com>
Wait for rhashtable to rehash on insert EBUSY
2026-06-09 21:22:36 +00:00 · 2021-04-13 09:24:23 -07:00 · 2021-04-13 09:24:23 -07:00 · 2021-04-08 09:03:12 -07:00 · 2021-04-07 12:50:50 -07:00 · 2021-04-07 12:27:00 -07:00
28 changed files with 387 additions and 97 deletions
@@ -286,10 +286,16 @@ static int block_insert(struct super_block *sb, struct block_private *bp)

 	WARN_ON_ONCE(atomic_read(&bp->refcount) & BLOCK_REF_INSERTED);

+retry:
 	atomic_add(BLOCK_REF_INSERTED, &bp->refcount);
-	ret = rhashtable_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
+	ret = rhashtable_lookup_insert_fast(&binf->ht, &bp->ht_head, block_ht_params);
 	if (ret < 0) {
 		atomic_sub(BLOCK_REF_INSERTED, &bp->refcount);
+		if (ret == -EBUSY) {
+			/* wait for pending rebalance to finish */
+			synchronize_rcu();
+			goto retry;
+		}
 	} else {
 		atomic_inc(&binf->total_inserted);
 		TRACE_BLOCK(insert, bp);
@@ -396,6 +402,7 @@ static void block_remove_all(struct super_block *sb)

 		if (block_get_if_inserted(bp)) {
 			block_remove(sb, bp);
+			WARN_ON_ONCE(atomic_read(&bp->refcount) != 1);
 			block_put(sb, bp);
 		}
 	}
@@ -1073,10 +1080,11 @@ restart:
 		if (bp == NULL)
 			break;
 		if (bp == ERR_PTR(-EAGAIN)) {
-			/* hard reset to not hold rcu grace period across retries */
+			/* hard exit to wait for rcu rebalance to finish */
 			rhashtable_walk_stop(&iter);
 			rhashtable_walk_exit(&iter);
 			scoutfs_inc_counter(sb, block_cache_shrink_restart);
+			synchronize_rcu();
 			goto restart;
 		}

@@ -150,7 +150,7 @@ static int client_lock_response(struct super_block *sb,
 				void *resp, unsigned int resp_len,
 				int error, void *data)
 {
-	if (resp_len != sizeof(struct scoutfs_net_lock_grant_response))
+	if (resp_len != sizeof(struct scoutfs_net_lock))
 		return -EINVAL;

 	/* XXX error? */
@@ -1135,7 +1135,8 @@ static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
 */
 #define MOVE_DATA_EXTENTS_PER_HOLD 16
 int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
-			     u64 byte_len, struct inode *to, u64 to_off)
+			     u64 byte_len, struct inode *to, u64 to_off, bool is_stage,
+			     u64 data_version)
 {
 	struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
 	struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
@@ -1145,6 +1146,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 	struct data_ext_args from_args;
 	struct data_ext_args to_args;
 	struct scoutfs_extent ext;
+	struct timespec cur_time;
 	LIST_HEAD(locks);
 	bool done = false;
 	loff_t from_size;
@@ -1180,6 +1182,11 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		goto out;
 	}

+	if (is_stage && (data_version != SCOUTFS_I(to)->data_version)) {
+		ret = -ESTALE;
+		goto out;
+	}
+
 	from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
 	count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
 	to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
@@ -1202,7 +1209,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 	/* can't stage once data_version changes */
 	scoutfs_inode_get_onoff(from, &junk, &from_offline);
 	scoutfs_inode_get_onoff(to, &junk, &to_offline);
-	if (from_offline || to_offline) {
+	if (from_offline || (to_offline && !is_stage)) {
 		ret = -ENODATA;
 		goto out;
 	}
@@ -1246,6 +1253,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,

 		/* arbitrarily limit the number of extents per trans hold */
 		for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
+			struct scoutfs_extent off_ext;
+
 			/* find the next extent to move */
 			ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
 					       from_iblock, 1, &ext);
@@ -1274,10 +1283,27 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,

 			to_start = to_iblock + (from_start - from_iblock);

-			/* insert the new, fails if it overlaps */
-			ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
-						 to_start, len,
-						 map, ext.flags);
+			if (is_stage) {
+				ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
+						       to_iblock, 1, &off_ext);
+				if (ret)
+					break;
+
+				if (!scoutfs_ext_inside(to_start, len, &off_ext) ||
+				    !(off_ext.flags & SEF_OFFLINE)) {
+					ret = -EINVAL;
+					break;
+				}
+
+				ret = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
+							 to_start, len,
+							 map, ext.flags);
+			} else {
+				/* insert the new, fails if it overlaps */
+				ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
+							 to_start, len,
+							 map, ext.flags);
+			}
 			if (ret < 0)
 				break;

@@ -1285,10 +1311,18 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 			ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
 					      from_start, len, 0, 0);
 			if (ret < 0) {
-				/* remove inserted new on err */
-				err = scoutfs_ext_remove(sb, &data_ext_ops,
-							 &to_args, to_start,
-							 len);
+				if (is_stage) {
+					/* re-mark dest range as offline */
+					WARN_ON_ONCE(!(off_ext.flags & SEF_OFFLINE));
+					err = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
+							      to_start, len,
+							      0, off_ext.flags);
+				} else {
+					/* remove inserted new on err */
+					err = scoutfs_ext_remove(sb, &data_ext_ops,
+								 &to_args, to_start,
+								 len);
+				}
 				BUG_ON(err); /* XXX inconsistent */
 				break;
 			}
@@ -1316,12 +1350,15 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		up_write(&from_si->extent_sem);
 		up_write(&to_si->extent_sem);

-		from->i_ctime = from->i_mtime =
-			to->i_ctime = to->i_mtime = CURRENT_TIME;
+		cur_time = CURRENT_TIME;
+		if (!is_stage) {
+			to->i_ctime = to->i_mtime = cur_time;
+			scoutfs_inode_inc_data_version(to);
+			scoutfs_inode_set_data_seq(to);
+		}
+		from->i_ctime = from->i_mtime = cur_time;
 		scoutfs_inode_inc_data_version(from);
-		scoutfs_inode_inc_data_version(to);
 		scoutfs_inode_set_data_seq(from);
-		scoutfs_inode_set_data_seq(to);

 		scoutfs_update_inode_item(from, from_lock, &locks);
 		scoutfs_update_inode_item(to, to_lock, &locks);
@@ -59,7 +59,8 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock);
 int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
-			     u64 byte_len, struct inode *to, u64 to_off);
+			     u64 byte_len, struct inode *to, u64 to_off, bool to_stage,
+			     u64 data_version);

 int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
@@ -813,6 +813,7 @@ static int scoutfs_link(struct dentry *old_dentry,
 	struct scoutfs_lock *dir_lock;
 	struct scoutfs_lock *inode_lock = NULL;
 	LIST_HEAD(ind_locks);
+	bool del_orphan;
 	u64 dir_size;
 	u64 ind_seq;
 	u64 hash;
@@ -841,6 +842,8 @@ static int scoutfs_link(struct dentry *old_dentry,
 		goto out_unlock;

 	dir_size = i_size_read(dir) + dentry->d_name.len;
+	del_orphan = (inode->i_nlink == 0);
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
@@ -855,6 +858,12 @@ retry:
 	if (ret)
 		goto out;

+	if (del_orphan) {
+		ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
+		if (ret)
+			goto out;
+	}
+
 	pos = SCOUTFS_I(dir)->next_readdir_pos++;

 	ret = add_entry_items(sb, scoutfs_ino(dir), hash, pos,
@@ -870,6 +879,11 @@ retry:
 	inode->i_ctime = dir->i_mtime;
 	inc_nlink(inode);

+	if (del_orphan) {
+		ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
+		WARN_ON_ONCE(ret);
+	}
+
 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);

@@ -1760,6 +1774,42 @@ static int scoutfs_dir_open(struct inode *inode, struct file *file)
 }
 #endif

+static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = NULL;
+	struct scoutfs_lock *dir_lock = NULL;
+	struct scoutfs_lock *inode_lock = NULL;
+	LIST_HEAD(ind_locks);
+	int ret;
+
+	if (dentry->d_name.len > SCOUTFS_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	inode = lock_hold_create(dir, dentry, mode, 0,
+				 &dir_lock, &inode_lock, &ind_locks);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	insert_inode_hash(inode);
+	d_tmpfile(dentry, inode);
+
+	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
+	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
+	scoutfs_inode_index_unlock(sb, &ind_locks); /* NOTE(review): unlocked again below after release_trans; confirm a repeated unlock of ind_locks is a harmless no-op */
+
+	ret = scoutfs_orphan_inode(inode);
+	WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
+
+	scoutfs_release_trans(sb);
+	scoutfs_inode_index_unlock(sb, &ind_locks);
+	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+
+	return ret;
+}
+
 const struct file_operations scoutfs_dir_fops = {
 	.KC_FOP_READDIR	= scoutfs_readdir,
 #ifdef KC_FMODE_KABI_ITERATE
@@ -1770,7 +1820,10 @@ const struct file_operations scoutfs_dir_fops = {
 	.llseek		= generic_file_llseek,
 };

-const struct inode_operations scoutfs_dir_iops = {
+
+
+const struct inode_operations_wrapper scoutfs_dir_iops = {
+	.ops = {
 	.lookup		= scoutfs_lookup,
 	.mknod		= scoutfs_mknod,
 	.create		= scoutfs_create,
@@ -1787,6 +1840,8 @@ const struct inode_operations scoutfs_dir_iops = {
 	.removexattr	= scoutfs_removexattr,
 	.symlink	= scoutfs_symlink,
 	.permission	= scoutfs_permission,
+	},
+	.tmpfile	= scoutfs_tmpfile,
 };

 void scoutfs_dir_exit(void)
@@ -5,7 +5,7 @@
 #include "lock.h"

 extern const struct file_operations scoutfs_dir_fops;
-extern const struct inode_operations scoutfs_dir_iops;
+extern const struct inode_operations_wrapper scoutfs_dir_iops;
 extern const struct inode_operations scoutfs_symlink_iops;

 struct scoutfs_link_backref_entry {
@@ -14,7 +14,7 @@ struct scoutfs_link_backref_entry {
 	u64 dir_pos;
 	u16 name_len;
 	struct scoutfs_dirent dent;
-	/* the full name is allocated and stored in dent.name[0] */
+	/* the full name is allocated and stored in dent.name[] */
 };

 int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino,
@@ -38,7 +38,7 @@ static bool ext_overlap(struct scoutfs_extent *ext, u64 start, u64 len)
 	return !(e_end < start || ext->start > end);
 }

-static bool ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
+bool scoutfs_ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
 {
 	u64 in_end = start + len - 1;
 	u64 out_end = out->start + out->len - 1;
@@ -241,7 +241,7 @@ int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
 		goto out;

 	/* removed extent must be entirely within found */
-	if (!ext_inside(start, len, &found)) {
+	if (!scoutfs_ext_inside(start, len, &found)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -341,7 +341,7 @@ int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,

 	if (ret == 0 && ext_overlap(&found, start, len)) {
 		/* set extent must be entirely within found */
-		if (!ext_inside(start, len, &found)) {
+		if (!scoutfs_ext_inside(start, len, &found)) {
 			ret = -EINVAL;
 			goto out;
 		}
@@ -31,5 +31,6 @@ int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
 		      struct scoutfs_extent *ext);
 int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
 		    void *arg, u64 start, u64 len, u64 map, u8 flags);
+bool scoutfs_ext_inside(u64 start, u64 len, struct scoutfs_extent *out);

 #endif
@@ -276,7 +276,6 @@ int scoutfs_forest_read_items(struct super_block *sb,
 	scoutfs_inc_counter(sb, forest_read_items);
 	calc_bloom_nrs(&bloom, &lock->start);

-	roots = lock->roots;
 retry:
 	ret = scoutfs_client_get_roots(sb, &roots);
 	if (ret)
@@ -349,15 +348,9 @@ retry:
 	ret = 0;
 out:
 	if (ret == -ESTALE) {
-		if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0) {
-			ret = -EIO;
-			goto out;
-		}
+		if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0)
+			return -EIO;
 		prev_refs = refs;
-
-		ret = scoutfs_client_get_roots(sb, &roots);
-		if (ret)
-			goto out;
 		goto retry;
 	}

@@ -259,7 +259,7 @@ struct scoutfs_btree_block {
 	__le16 mid_free_len;
 	__u8 level;
 	__u8 __pad[7];
-	struct scoutfs_btree_item items[0];
+	struct scoutfs_btree_item items[];
 	/* leaf blocks have a fixed size item offset hash table at the end */
 };

@@ -307,7 +307,7 @@ struct scoutfs_alloc_list_block {
 	struct scoutfs_block_ref next;
 	__le32 start;
 	__le32 nr;
-	__le64 blknos[0]; /* naturally aligned for sorting */
+	__le64 blknos[]; /* naturally aligned for sorting */
 };

 #define SCOUTFS_ALLOC_LIST_MAX_BLOCKS					      \
@@ -362,7 +362,7 @@ struct scoutfs_srch_file {

 struct scoutfs_srch_parent {
 	struct scoutfs_block_header hdr;
-	struct scoutfs_block_ref refs[0];
+	struct scoutfs_block_ref refs[];
 };

 #define SCOUTFS_SRCH_PARENT_REFS				\
@@ -377,7 +377,7 @@ struct scoutfs_srch_block {
 	struct scoutfs_srch_entry tail;
 	__le32 entry_nr;
 	__le32 entry_bytes;
-	__u8 entries[0];
+	__u8 entries[];
 };

 /*
@@ -452,7 +452,7 @@ struct scoutfs_log_item_value {
 	__le64 vers;
 	__u8 flags;
 	__u8 __pad[7];
-	__u8 data[0];
+	__u8 data[];
 };

 /*
@@ -467,7 +467,7 @@ struct scoutfs_log_item_value {
 struct scoutfs_bloom_block {
 	struct scoutfs_block_header hdr;
 	__le64 total_set;
-	__le64 bits[0];
+	__le64 bits[];
 };

 /*
@@ -549,7 +549,7 @@ struct scoutfs_xattr {
 	__le16 val_len;
 	__u8 name_len;
 	__u8 __pad[5];
-	__u8 name[0];
+	__u8 name[];
 };


@@ -729,7 +729,7 @@ struct scoutfs_dirent {
 	__le64 pos;
 	__u8 type;
 	__u8 __pad[7];
-	__u8 name[0];
+	__u8 name[];
 };

 #define SCOUTFS_NAME_LEN 255
@@ -827,7 +827,7 @@ struct scoutfs_net_header {
 	__u8 flags;
 	__u8 error;
 	__u8 __pad[3];
-	__u8 data[0];
+	__u8 data[];
 };

 #define SCOUTFS_NET_FLAG_RESPONSE	(1 << 0)
@@ -895,15 +895,10 @@ struct scoutfs_net_lock {
 	__u8 __pad[6];
 };

-struct scoutfs_net_lock_grant_response {
-	struct scoutfs_net_lock nl;
-	struct scoutfs_net_roots roots;
-};
-
 struct scoutfs_net_lock_recover {
 	__le16 nr;
 	__u8 __pad[6];
-	struct scoutfs_net_lock locks[0];
+	struct scoutfs_net_lock locks[];
 };

 #define SCOUTFS_NET_LOCK_MAX_RECOVER_NR					       \
@@ -182,7 +182,8 @@ static void set_inode_ops(struct inode *inode)
 		inode->i_fop = &scoutfs_file_fops;
 		break;
 	case S_IFDIR:
-		inode->i_op = &scoutfs_dir_iops;
+		inode->i_op = &scoutfs_dir_iops.ops;
+		inode->i_flags |= S_IOPS_WRAPPER;
 		inode->i_fop = &scoutfs_dir_fops;
 		break;
 	case S_IFLNK:
@@ -1417,7 +1418,18 @@ static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
 	};
 }

-static int remove_orphan_item(struct super_block *sb, u64 ino)
+int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_lock *lock = sbi->rid_lock;
+	struct scoutfs_key key;
+
+	init_orphan_key(&key, sbi->rid, ino);
+
+	return scoutfs_item_dirty(sb, &key, lock);
+}
+
+int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_lock *lock = sbi->rid_lock;
@@ -1516,7 +1528,7 @@ retry:
 	if (ret)
 		goto out;

-	ret = remove_orphan_item(sb, ino);
+	ret = scoutfs_orphan_delete(sb, ino);
 out:
 	if (release)
 		scoutfs_release_trans(sb);
@@ -114,6 +114,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);

 int scoutfs_scan_orphans(struct super_block *sb);
+int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
+int scoutfs_orphan_delete(struct super_block *sb, u64 ino);

 void scoutfs_inode_queue_writeback(struct inode *inode);
 int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -972,12 +972,18 @@ static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
 		goto out;
 	}

+	if (mb.flags & SCOUTFS_IOC_MB_UNKNOWN) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	ret = mnt_want_write_file(file);
 	if (ret < 0)
 		goto out;

 	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
-				       to, mb.to_off);
+				       to, mb.to_off, !!(mb.flags & SCOUTFS_IOC_MB_STAGE),
+				       mb.data_version);
 	mnt_drop_write_file(file);
 out:
 	fput(from_file);
@@ -163,7 +163,7 @@ struct scoutfs_ioctl_ino_path_result {
 	__u64 dir_pos;
 	__u16 path_bytes;
 	__u8  _pad[6];
-	__u8  path[0];
+	__u8  path[];
 };

 /* Get a single path from the root to the given inode number */
@@ -259,7 +259,7 @@ struct scoutfs_ioctl_data_waiting {
 	__u8 _pad[6];
 };

-#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN		(U8_MAX << 0)
+#define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN		(U64_MAX << 0)

 #define SCOUTFS_IOC_DATA_WAITING _IOR(SCOUTFS_IOCTL_MAGIC, 6, \
 				      struct scoutfs_ioctl_data_waiting)
@@ -279,7 +279,7 @@ struct scoutfs_ioctl_setattr_more {
 };

 #define SCOUTFS_IOC_SETATTR_MORE_OFFLINE		(1 << 0)
-#define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN		(U8_MAX << 1)
+#define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN		(U64_MAX << 1)

 #define SCOUTFS_IOC_SETATTR_MORE _IOW(SCOUTFS_IOCTL_MAGIC, 7, \
 				      struct scoutfs_ioctl_setattr_more)
@@ -418,12 +418,13 @@ struct scoutfs_ioctl_alloc_detail_entry {
 * on the same file system.
 *
 * from_fd specifies the source file and the ioctl is called on the
- * destination file.  Both files must have write access.  from_off
- * specifies the byte offset in the source, to_off is the byte offset in
- * the destination, and len is the number of bytes in the region to
- * move.   All of the offsets and lengths must be in multiples of 4KB,
- * except in the case where the from_off + len ends at the i_size of the
- * source file.
+ * destination file.  Both files must have write access.  from_off specifies
+ * the byte offset in the source, to_off is the byte offset in the
+ * destination, and len is the number of bytes in the region to move.  All of
+ * the offsets and lengths must be in multiples of 4KB, except in the case
+ * where the from_off + len ends at the i_size of the source
+ * file. data_version is only used when STAGE flag is set (see below).  flags
+ * field is currently only used to optionally specify STAGE behavior.
 *
 * This interface only moves extents which are block granular, it does
 * not perform RMW of sub-block byte extents and it does not overwrite
@@ -435,30 +436,41 @@ struct scoutfs_ioctl_alloc_detail_entry {
 * i_size.  The i_size update will maintain final partial blocks in the
 * source.
 *
- * It will return an error if either of the files have offline extents.
- * It will return 0 when all of the extents in the source region have
- * been moved to the destination.  Moving extents updates the ctime,
- * mtime, meta_seq, data_seq, and data_version fields of both the source
- * and destination inodes.  If an error is returned then partial
+ * If STAGE flag is not set, it will return an error if either of the files
+ * have offline extents.  It will return 0 when all of the extents in the
+ * source region have been moved to the destination.  Moving extents updates
+ * the ctime, mtime, meta_seq, data_seq, and data_version fields of both the
+ * source and destination inodes.  If an error is returned then partial
 * progress may have been made and inode fields may have been updated.
 *
+ * If STAGE flag is set, as above except destination range must be in an
+ * offline extent. Fields are updated only for source inode.
+ *
 * Errors specific to this interface include:
 *
 * EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
 *	   and destination files are the same inode; either the source or
 *	   destination is not a regular file; the destination file has
- *	   an existing overlapping extent.
+ *	   an existing overlapping extent (if STAGE flag not set); the
+ *	   destination range is not in an offline extent (if STAGE set).
 * EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
 * EBADF: from_fd isn't a valid open file descriptor.
 * EXDEV: the source and destination files are in different filesystems.
 * EISDIR: either the source or destination is a directory.
- * ENODATA: either the source or destination file have offline extents.
+ * ENODATA: either the source or destination file have offline extents and
+ *	    STAGE flag is not set.
+ * ESTALE: data_version does not match destination data_version.
 */
+#define SCOUTFS_IOC_MB_STAGE		(1 << 0)
+#define SCOUTFS_IOC_MB_UNKNOWN		(U64_MAX << 1)
+
 struct scoutfs_ioctl_move_blocks {
 	__u64 from_fd;
 	__u64 from_off;
 	__u64 len;
 	__u64 to_off;
+	__u64 data_version;
+	__u64 flags;
 };

 #define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
@@ -638,7 +638,6 @@ static void lock_grant_worker(struct work_struct *work)
 	struct lock_info *linfo = container_of(work, struct lock_info,
 					       grant_work);
 	struct super_block *sb = linfo->sb;
-	struct scoutfs_net_lock_grant_response *gr;
 	struct scoutfs_net_lock *nl;
 	struct scoutfs_lock *lock;
 	struct scoutfs_lock *tmp;
@@ -648,8 +647,7 @@ static void lock_grant_worker(struct work_struct *work)
 	spin_lock(&linfo->lock);

 	list_for_each_entry_safe(lock, tmp, &linfo->grant_list, grant_head) {
-		gr = &lock->grant_resp;
-		nl = &lock->grant_resp.nl;
+		nl = &lock->grant_nl;

 		/* wait for reordered invalidation to finish */
 		if (lock->mode != nl->old_mode)
@@ -667,7 +665,6 @@ static void lock_grant_worker(struct work_struct *work)
 		lock->request_pending = 0;
 		lock->mode = nl->new_mode;
 		lock->write_version = le64_to_cpu(nl->write_version);
-		lock->roots = gr->roots;

 		if (lock_count_match_exists(nl->new_mode, lock->waiters))
 			extend_grace(sb, lock);
@@ -689,9 +686,8 @@ static void lock_grant_worker(struct work_struct *work)
 * work to process.
 */
 int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock_grant_response *gr)
+				struct scoutfs_net_lock *nl)
 {
-	struct scoutfs_net_lock *nl = &gr->nl;
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;

@@ -705,7 +701,7 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 	trace_scoutfs_lock_grant_response(sb, lock);
 	BUG_ON(!lock->request_pending);

-	lock->grant_resp = *gr;
+	lock->grant_nl = *nl;
 	list_add_tail(&lock->grant_head, &linfo->grant_list);
 	queue_grant_work(linfo);

@@ -23,7 +23,6 @@ struct scoutfs_lock {
 	u64 refresh_gen;
 	u64 write_version;
 	u64 dirty_trans_seq;
-	struct scoutfs_net_roots roots;
 	struct list_head lru_head;
 	wait_queue_head_t waitq;
 	ktime_t grace_deadline;
@@ -31,7 +30,7 @@ struct scoutfs_lock {
 		      invalidate_pending:1;

 	struct list_head grant_head;
-	struct scoutfs_net_lock_grant_response grant_resp;
+	struct scoutfs_net_lock grant_nl;
 	struct list_head inv_head;
 	struct scoutfs_net_lock inv_nl;
 	u64 inv_net_id;
@@ -57,7 +56,7 @@ struct scoutfs_lock_coverage {
 };

 int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock_grant_response *gr);
+				struct scoutfs_net_lock *nl);
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl);
 int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
@@ -484,7 +484,6 @@ static int process_waiting_requests(struct super_block *sb,
 				    struct server_lock_node *snode)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct scoutfs_net_lock_grant_response gres;
 	struct scoutfs_net_lock nl;
 	struct client_lock_entry *req;
 	struct client_lock_entry *req_tmp;
@@ -547,11 +546,8 @@ static int process_waiting_requests(struct super_block *sb,
 			nl.write_version = cpu_to_le64(wv);
 		}

-		gres.nl = nl;
-		scoutfs_server_get_roots(sb, &gres.roots);
-
 		ret = scoutfs_server_lock_response(sb, req->rid,
-						   req->net_id, &gres);
+						   req->net_id, &nl);
 		if (ret)
 			goto out;

@@ -944,7 +944,6 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
 	struct scoutfs_net_connection *acc_conn;
 	DECLARE_WAIT_QUEUE_HEAD(waitq);
 	struct socket *acc_sock;
-	LIST_HEAD(conn_list);
 	int ret;

 	trace_scoutfs_net_listen_work_enter(sb, 0, 0);
@@ -182,7 +182,7 @@ int scoutfs_server_apply_commit(struct super_block *sb, int err)
 	return err;
 }

-void scoutfs_server_get_roots(struct super_block *sb,
+static void get_roots(struct super_block *sb,
 			      struct scoutfs_net_roots *roots)
 {
 	DECLARE_SERVER_INFO(sb, server);
@@ -556,7 +556,7 @@ static int server_get_roots(struct super_block *sb,
 		memset(&roots, 0, sizeof(roots));
 		ret = -EINVAL;
 	}  else {
-		scoutfs_server_get_roots(sb, &roots);
+		get_roots(sb, &roots);
 		ret = 0;
 	}

@@ -862,13 +862,13 @@ int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 }

 int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
-				 struct scoutfs_net_lock_grant_response *gr)
+				 struct scoutfs_net_lock *nl)
 {
 	struct server_info *server = SCOUTFS_SB(sb)->server_info;

 	return scoutfs_net_response_node(sb, server->conn, rid,
 					 SCOUTFS_NET_CMD_LOCK, id, 0,
-					 gr, sizeof(*gr));
+					 nl, sizeof(*nl));
 }

 static bool invalid_recover(struct scoutfs_net_lock_recover *nlr,
@@ -1551,7 +1551,6 @@ static void scoutfs_server_worker(struct work_struct *work)
 	struct scoutfs_net_connection *conn = NULL;
 	DECLARE_WAIT_QUEUE_HEAD(waitq);
 	struct sockaddr_in sin;
-	LIST_HEAD(conn_list);
 	u64 max_vers;
 	int ret;

@@ -59,11 +59,9 @@ do {								\
 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 				struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
-				 struct scoutfs_net_lock_grant_response *gr);
+				 struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
 					struct scoutfs_key *key);
-void scoutfs_server_get_roots(struct super_block *sb,
-			      struct scoutfs_net_roots *roots);
 int scoutfs_server_hold_commit(struct super_block *sb);
 int scoutfs_server_apply_commit(struct super_block *sb, int err);

@@ -2156,7 +2156,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	if (ret < 0)
 		goto commit;

-	ret = scoutfs_block_writer_write(sb, &wri);
+	ret = scoutfs_alloc_prepare_commit(sb, &alloc, &wri) ?:
+	      scoutfs_block_writer_write(sb, &wri);
 commit:
 	/* the server won't use our partial compact if _ERROR is set */
 	sc->meta_avail = alloc.avail;
@@ -1,4 +1,4 @@
-CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing 
+CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing -I ../kmod/src
 SHELL := /usr/bin/bash

 # each binary command is built from a single .c file
@@ -6,6 +6,7 @@ BIN := src/createmany			\
 	src/dumb_setxattr		\
 	src/handle_cat			\
 	src/bulk_create_paths		\
+	src/stage_tmpfile		\
 	src/find_xattrs

 DEPS := $(wildcard src/*.d)
@@ -0,0 +1,18 @@
+total file size 33669120
+00000000  41 41 41 41 41 41 41 41  41 41 41 41 41 41 41 41  |AAAAAAAAAAAAAAAA|
+*
+00400000  42 42 42 42 42 42 42 42  42 42 42 42 42 42 42 42  |BBBBBBBBBBBBBBBB|
+*
+00801000  43 43 43 43 43 43 43 43  43 43 43 43 43 43 43 43  |CCCCCCCCCCCCCCCC|
+*
+00c03000  44 44 44 44 44 44 44 44  44 44 44 44 44 44 44 44  |DDDDDDDDDDDDDDDD|
+*
+01006000  45 45 45 45 45 45 45 45  45 45 45 45 45 45 45 45  |EEEEEEEEEEEEEEEE|
+*
+0140a000  46 46 46 46 46 46 46 46  46 46 46 46 46 46 46 46  |FFFFFFFFFFFFFFFF|
+*
+0180f000  47 47 47 47 47 47 47 47  47 47 47 47 47 47 47 47  |GGGGGGGGGGGGGGGG|
+*
+01c15000  48 48 48 48 48 48 48 48  48 48 48 48 48 48 48 48  |HHHHHHHHHHHHHHHH|
+*
+0201c000
@@ -1,6 +1,7 @@
 Ran:
 generic/001
 generic/002
+generic/004
 generic/005
 generic/006
 generic/007
@@ -73,7 +74,6 @@ generic/376
 generic/377
 Not
 run:
-generic/004
 generic/008
 generic/009
 generic/012
@@ -278,4 +278,4 @@ shared/004
 shared/032
 shared/051
 shared/289
-Passed all 72 tests
+Passed all 73 tests
@@ -18,6 +18,7 @@ createmany-large-names.sh
 createmany-rename-large-dir.sh
 stage-release-race-alloc.sh
 stage-multi-part.sh
+stage-tmpfile.sh
 basic-posix-consistency.sh
 dirent-consistency.sh
 lock-ex-race-processes.sh
@@ -0,0 +1,145 @@
+/*
+ * Exercise O_TMPFILE creation as well as staging from tmpfiles into
+ * a released destination file.
+ *
+ * Copyright (C) 2021 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <linux/types.h>
+#include <assert.h>
+
+#include "ioctl.h"
+
+#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))
+
+/*
+ * Write known data into 8 tmpfiles.
+ * Make a new file X and release it
+ * Move contents of 8 tmpfiles into X.
+ */
+
+struct sub_tmp_info {
+	int fd;
+	unsigned int offset;
+	unsigned int length;
+};
+
+#define SZ	4096
+char buf[SZ];
+
+int main(int argc, char **argv)
+{
+	struct scoutfs_ioctl_release ioctl_args = {0};
+	struct scoutfs_ioctl_move_blocks mb;
+	struct sub_tmp_info sub_tmps[8];
+	int tot_size = 0;
+	char *dest_file;
+	int dest_fd;
+	char *mnt;
+	int ret;
+	int i;
+
+	if (argc < 3) {
+		printf("%s <mountpoint> <dest_file>\n", argv[0]);
+		return 1;
+	}
+
+	mnt = argv[1];
+	dest_file = argv[2];
+
+	for (i = 0; i < array_size(sub_tmps); i++) {
+		struct sub_tmp_info *sub_tmp = &sub_tmps[i];
+		int remaining;
+
+		sub_tmp->fd = open(mnt, O_RDWR | O_TMPFILE, S_IRUSR | S_IWUSR);
+		if (sub_tmp->fd < 0) {
+			perror("error");
+			exit(1);
+		}
+
+		sub_tmp->offset = tot_size;
+
+		/* First tmp file is 4MB */
+		/* Each is 4k bigger than last */
+		sub_tmp->length = (i + 1024) * sizeof(buf);
+
+		remaining = sub_tmp->length;
+
+		/* Each sub tmpfile written with 'A', 'B', etc. */
+		memset(buf, 'A' + i, sizeof(buf));
+		while (remaining) {
+			int written;
+
+			written = write(sub_tmp->fd, buf, sizeof(buf));
+			assert(written == sizeof(buf));
+			tot_size += sizeof(buf);
+			remaining -= written;
+		}
+	}
+
+	printf("total file size %d\n", tot_size);
+
+	dest_fd = open(dest_file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+	if (dest_fd == -1) {
+		perror("error");
+		exit(1);
+	}
+
+	// make dest file big; posix_fallocate returns the error number, errno is not set
+	ret = posix_fallocate(dest_fd, 0, tot_size);
+	if (ret) {
+		fprintf(stderr, "posix_fallocate: %s\n", strerror(ret));
+		exit(1);
+	}
+
+	// release everything in dest file
+	ioctl_args.offset = 0;
+	ioctl_args.length = tot_size;
+	ioctl_args.data_version = 0;
+
+	ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &ioctl_args);
+	if (ret < 0) {
+		perror("error");
+		exit(1);
+	}
+
+	// move contents into dest in reverse order
+	for (i = array_size(sub_tmps) - 1; i >= 0 ; i--) {
+		struct sub_tmp_info *sub_tmp = &sub_tmps[i];
+
+		mb.from_fd = sub_tmp->fd;
+		mb.from_off = 0;
+		mb.len = sub_tmp->length;
+		mb.to_off = sub_tmp->offset;
+		mb.data_version = 0;
+		mb.flags = SCOUTFS_IOC_MB_STAGE;
+
+		ret = ioctl(dest_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
+		if (ret < 0) {
+			perror("error");
+			exit(1);
+		}
+
+	}
+
+	return 0;
+}
@@ -0,0 +1,15 @@
+#
+# Run stage_tmpfile against the first mount and check the resulting dest file contents with hexdump.
+#
+
+t_require_commands stage_tmpfile hexdump
+
+DEST_FILE="$T_D0/dest_file"
+
+stage_tmpfile "$T_D0" "$DEST_FILE"
+
+hexdump -C "$DEST_FILE"
+
+rm -fr "$DEST_FILE"
+
+t_pass
@@ -32,7 +32,7 @@ struct move_blocks_args {

 static int do_move_blocks(struct move_blocks_args *args)
 {
-	struct scoutfs_ioctl_move_blocks mb;
+	struct scoutfs_ioctl_move_blocks mb = {0};
 	int from_fd = -1;
 	int to_fd = -1;
 	int ret;
Author	SHA1	Message	Date
Zach Brown	c3290771a0	Block cache use rht _lookup_ insert for EEXIST The sneaky rhashtable_insert_fast() can't return -EEXIST despite the last line of the function REALLY making it look like it can. It just inserts new objects at the head of the bucket lists without comparing the insertion with existing objects. The block cache was relying on insertion to resolve duplicate racing allocated blocks. Because it couldn't return -EEXIST we could get duplicate cached blocks present in the hash table. rhashtable_lookup_insert_fast() fixes this by actually comparing the inserted objects key with the objects found in the insertion bucket. A racing allocator trying to insert a duplicate cached block will get an error, drop their allocated block, and retry their lookup. Signed-off-by: Zach Brown <zab@versity.com>	2021-04-13 09:24:23 -07:00
Zach Brown	cf3cb3f197	Wait for rhashtable to rehash on insert EBUSY The rhashtable can return EBUSY if you insert fast enough to trigger an expansion of the next table size that is waiting to be rehashed in an rcu callback. If we get EBUSY from rhashtable_insert we call synchronize_rcu to wait for the rehash to complete before trying again. This was hit in testing restores of a very large namespace and took a few hours to hit. Signed-off-by: Zach Brown <zab@versity.com>	2021-04-13 09:24:23 -07:00
Andy Grover	cb4ed98b3c	Merge pull request #31 from versity/zab/block_shrink_wait_for_rebalance Block cache shrink restart waits for rcu callbacks	2021-04-08 09:03:12 -07:00
Zach Brown	9ee7f7b9dc	Block cache shrink restart waits for rcu callbacks We're seeing cpu livelocks in block shrinking where counters show that a single block cache shrink call is only getting EAGAIN from repeated rhashtable walk attempts. It occurred to me that the running task might be preventing an RCU grace period from ending by never blocking. The hope of this commit is that by waiting for rcu callbacks to run we'll ensure that any pending rebalance callback runs before we retry the rhashtable walk again. I haven't been able to reproduce this easily so this is a stab in the dark. Signed-off-by: Zach Brown <zab@versity.com>	2021-04-07 12:50:50 -07:00
Zach Brown	300791ecfa	Merge pull request #29 from agrover/cleanup Cleanup	2021-04-07 12:27:00 -07:00
Andy Grover	4630b77b45	cleanup: Use flexible array members instead of 0-length arrays See Documentation/process/deprecated.rst:217, items[] now preferred over items[0]. Signed-off-by: Andy Grover <agrover@versity.com>	2021-04-07 10:14:47 -07:00
Andy Grover	bdc43ca634	cleanup: Fix ESTALE handling in forest_read_items Kinda weird to goto back to the out label and then out the bottom. Just return -EIO, like forest_next_hint() does. Don't call client_get_roots() right before retry, since it is the first thing retry does. Signed-off-by: Andy Grover <agrover@versity.com>	2021-04-07 10:14:04 -07:00
Andy Grover	6406f05350	cleanup: Remove struct net_lock_grant_response We're not using the roots member of this struct, so we can just use struct scoutfs_net_lock directly. Signed-off-by: Andy Grover <agrover@versity.com>	2021-04-07 10:13:56 -07:00
Andy Grover	820b7295f0	cleanup: Unused LIST_HEADs Signed-off-by: Andy Grover <agrover@versity.com>	2021-04-05 16:23:41 -07:00
Zach Brown	b3611103ee	Merge pull request #26 from agrover/tmpfile Support O_TMPFILE and allow MOVE_BLOCKS into released extents	2021-04-05 15:23:41 -07:00
Andy Grover	0deb232d3f	Support O_TMPFILE and allow MOVE_BLOCKS into released extents Support O_TMPFILE: Create an unlinked file and put it on the orphan list. If it ever gains a link, take it off the orphan list. Change MOVE_BLOCKS ioctl to allow moving blocks into offline extent ranges. Ioctl callers must set a new flag to enable this operation mode. RH-compat: tmpfile support is actually backported by RH into 3.10 kernel. We need to use some of their kabi-maintaining wrappers to use it: use a struct inode_operations_wrapper instead of base struct inode_operations, set S_IOPS_WRAPPER flag in i_flags. This lets RH's modified vfs_tmpfile() find our tmpfile fn pointer. Add a test that tests both creating tmpfiles as well as moving their contents into a destination file via MOVE_BLOCKS. xfstests common/004 now runs because tmpfile is supported. Signed-off-by: Andy Grover <agrover@versity.com>	2021-04-05 14:23:44 -07:00
Andy Grover	1366e254f9	Merge pull request #30 from versity/zab/srch_block_ref_leak Zab/srch block ref leak	2021-04-01 16:50:34 -07:00
Zach Brown	1259f899a3	srch compaction needs to prepare alloc for commit The srch client compaction work initializes allocators, dirties blocks, and writes them out as its transaction. It forgot to call the pre-commit allocator prepare function. The prepare function drops block references used by the meta allocator during the transaction. This leaked block references which kept blocks from being freed by the shrinker under memory pressure. Eventually memory was full of leaked blocks and the shrinker walked all of them looking for blocks to free, resulting in an effective livelock that ground the system to a crawl. Signed-off-by: Zach Brown <zab@versity.com>	2021-04-01 13:04:40 -07:00
Zach Brown	2d393f435b	Warn on leaked block refs on unmount By the time we get to destroying the block cache we should have put all our block references. Warn as we tear down the blocks if we see any blocks that still have references, implying a ref leak. This caught a leak caused by srch compaction forgetting to put allocator list block refs. Signed-off-by: Zach Brown <zab@versity.com>	2021-04-01 13:04:06 -07:00