diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 0d7df458..21d29b6b 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -840,7 +840,7 @@ retry: goto out; if (del_orphan) { - ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret) goto out; } @@ -852,7 +852,7 @@ retry: scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); if (ret) { - err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */ goto out; } @@ -951,7 +951,7 @@ retry: goto unlock; if (should_orphan(inode)) { - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; } @@ -959,7 +959,7 @@ retry: ret = del_entry_items(sb, scoutfs_ino(dir), le64_to_cpu(dent.hash), le64_to_cpu(dent.pos), scoutfs_ino(inode), dir_lock, inode_lock); if (ret) { - ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(ret); /* should have been dirty */ goto out; } @@ -1669,7 +1669,8 @@ retry: ins_old = true; if (should_orphan(new_inode)) { - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock, + new_inode_lock); if (ret) goto out; } @@ -1831,7 +1832,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod return PTR_ERR(inode); si = SCOUTFS_I(inode); - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; /* XXX returning error but items created */ diff --git a/kmod/src/inode.c 
b/kmod/src/inode.c index 02456fde..6d775515 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -955,7 +955,8 @@ void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, static int update_index_items(struct super_block *sb, struct scoutfs_inode_info *si, u64 ino, u8 type, u64 major, u32 minor, - struct list_head *lock_list) + struct list_head *lock_list, + struct scoutfs_lock *primary) { struct scoutfs_lock *ins_lock; struct scoutfs_lock *del_lock; @@ -972,7 +973,7 @@ static int update_index_items(struct super_block *sb, scoutfs_inode_init_index_key(&ins, type, major, minor, ino); ins_lock = find_index_lock(lock_list, type, major, minor, ino); - ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock); + ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock, primary); if (ret || !will_del_index(si, type, major, minor)) return ret; @@ -984,7 +985,7 @@ static int update_index_items(struct super_block *sb, del_lock = find_index_lock(lock_list, type, get_item_major(si, type), get_item_minor(si, type), ino); - ret = scoutfs_item_delete_force(sb, &del, del_lock); + ret = scoutfs_item_delete_force(sb, &del, del_lock, primary); if (ret) { err = scoutfs_item_delete(sb, &ins, ins_lock); BUG_ON(err); @@ -996,7 +997,8 @@ static int update_index_items(struct super_block *sb, static int update_indices(struct super_block *sb, struct scoutfs_inode_info *si, u64 ino, umode_t mode, struct scoutfs_inode *sinode, - struct list_head *lock_list) + struct list_head *lock_list, + struct scoutfs_lock *primary) { struct index_update { u8 type; @@ -1016,7 +1018,7 @@ static int update_indices(struct super_block *sb, continue; ret = update_index_items(sb, si, ino, upd->type, upd->major, - upd->minor, lock_list); + upd->minor, lock_list, primary); if (ret) break; } @@ -1056,7 +1058,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock, /* only race with other inode field stores once */ store_inode(&sinode, inode); - ret = 
update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list); + ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list, lock); BUG_ON(ret); scoutfs_inode_init_key(&key, ino); @@ -1325,7 +1327,7 @@ void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list) /* this is called on final inode cleanup so enoent is fine */ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, - u32 minor, struct list_head *ind_locks) + u32 minor, struct list_head *ind_locks, struct scoutfs_lock *primary) { struct scoutfs_key key; struct scoutfs_lock *lock; @@ -1334,7 +1336,7 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, scoutfs_inode_init_index_key(&key, type, major, minor, ino); lock = find_index_lock(ind_locks, type, major, minor, ino); - ret = scoutfs_item_delete_force(sb, &key, lock); + ret = scoutfs_item_delete_force(sb, &key, lock, primary); if (ret == -ENOENT) ret = 0; return ret; @@ -1351,16 +1353,17 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, */ static int remove_index_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode, - struct list_head *ind_locks) + struct list_head *ind_locks, + struct scoutfs_lock *primary) { umode_t mode = le32_to_cpu(sinode->mode); int ret; ret = remove_index(sb, ino, SCOUTFS_INODE_INDEX_META_SEQ_TYPE, - le64_to_cpu(sinode->meta_seq), 0, ind_locks); + le64_to_cpu(sinode->meta_seq), 0, ind_locks, primary); if (ret == 0 && S_ISREG(mode)) ret = remove_index(sb, ino, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, - le64_to_cpu(sinode->data_seq), 0, ind_locks); + le64_to_cpu(sinode->data_seq), 0, ind_locks, primary); return ret; } @@ -1493,22 +1496,24 @@ static void init_orphan_key(struct scoutfs_key *key, u64 ino) * zone under a write only lock while the caller has the inode protected * by a write lock. 
*/ -int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_key key; init_orphan_key(&key, ino); - return scoutfs_item_create_force(sb, &key, NULL, 0, lock); + return scoutfs_item_create_force(sb, &key, NULL, 0, lock, primary); } -int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) +int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_key key; init_orphan_key(&key, ino); - return scoutfs_item_delete_force(sb, &key, lock); + return scoutfs_item_delete_force(sb, &key, lock, primary); } /* @@ -1561,7 +1566,7 @@ retry: release = true; - ret = remove_index_items(sb, ino, sinode, &ind_locks); + ret = remove_index_items(sb, ino, sinode, &ind_locks, lock); if (ret) goto out; @@ -1576,7 +1581,7 @@ retry: if (ret < 0) goto out; - ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock); + ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock, lock); if (ret < 0) goto out; diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 88058117..6feb3f9e 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -125,8 +125,10 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int scoutfs_setattr(struct dentry *dentry, struct iattr *attr); -int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); -int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary); +int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary); void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb); void 
scoutfs_inode_queue_writeback(struct inode *inode); diff --git a/kmod/src/item.c b/kmod/src/item.c index 05b04550..8c7900d5 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1676,6 +1676,14 @@ static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key, return 0; } +static int optional_lock_mode_match(struct scoutfs_lock *lock, int mode) +{ + if (WARN_ON_ONCE(lock && lock->mode != mode)) + return -EINVAL; + else + return 0; +} + /* * Copy the cached item's value into the caller's value. The number of * bytes copied is returned. A null val returns 0. @@ -1832,12 +1840,19 @@ out: * also increase the seqs. It lets us limit the inputs of item merging * to the last stable seq and ensure that all the items in open * transactions and granted locks will have greater seqs. + * + * This is a little awkward for WRITE_ONLY locks which can have much + * older versions than the version of locked primary data that they're + * operating on behalf of. Callers can optionally provide that primary + * lock to get the version from. This ensures that items created under + * WRITE_ONLY locks can not have versions less than their primary data. */ -static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock) +static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - return max(sbi->trans_seq, lock->write_seq); + return max3(sbi->trans_seq, lock->write_seq, primary ? 
primary->write_seq : 0); } /* @@ -1872,7 +1887,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, if (!item || item->deletion) { ret = -ENOENT; } else { - item->seq = item_seq(sb, lock); + item->seq = item_seq(sb, lock, NULL); mark_item_dirty(sb, cinf, pg, NULL, item); ret = 0; } @@ -1889,10 +1904,10 @@ out: */ static int item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock, - int mode, bool force) + struct scoutfs_lock *primary, int mode, bool force) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, primary); struct cached_item *found; struct cached_item *item; struct cached_page *pg; @@ -1902,7 +1917,8 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_create); - if ((ret = lock_safe(lock, key, mode))) + if ((ret = lock_safe(lock, key, mode)) || + (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -1943,15 +1959,15 @@ out: int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { - return item_create(sb, key, val, val_len, lock, + return item_create(sb, key, val, val_len, lock, NULL, SCOUTFS_LOCK_READ, false); } int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, - struct scoutfs_lock *lock) + struct scoutfs_lock *lock, struct scoutfs_lock *primary) { - return item_create(sb, key, val, val_len, lock, + return item_create(sb, key, val, val_len, lock, primary, SCOUTFS_LOCK_WRITE_ONLY, true); } @@ -1965,7 +1981,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, NULL); struct cached_item *item; 
struct cached_item *found; struct cached_page *pg; @@ -2025,12 +2041,16 @@ out: * current items so the caller always writes with write only locks. If * combining the current delta item and the caller's item results in a * null we can just drop it, we don't have to emit a deletion item. + * + * Delta items don't have to worry about creating items with old + * versions under write_only locks. The versions don't impact how we + * merge two items. */ int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, NULL); struct cached_item *item; struct cached_page *pg; struct rb_node **pnode; @@ -2099,10 +2119,11 @@ out: * deletion item if there isn't one already cached. */ static int item_delete(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock, int mode, bool force) + struct scoutfs_lock *lock, struct scoutfs_lock *primary, + int mode, bool force) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, primary); struct cached_item *item; struct cached_page *pg; struct rb_node **pnode; @@ -2111,7 +2132,8 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_delete); - if ((ret = lock_safe(lock, key, mode))) + if ((ret = lock_safe(lock, key, mode)) || + (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -2161,13 +2183,13 @@ out: int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock) { - return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE, false); + return item_delete(sb, key, lock, NULL, SCOUTFS_LOCK_WRITE, false); } int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock) + struct scoutfs_lock *lock, 
struct scoutfs_lock *primary) { - return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE_ONLY, true); + return item_delete(sb, key, lock, primary, SCOUTFS_LOCK_WRITE_ONLY, true); } u64 scoutfs_item_dirty_pages(struct super_block *sb) diff --git a/kmod/src/item.h b/kmod/src/item.h index 431866d5..d7fe6a2e 100644 --- a/kmod/src/item.h +++ b/kmod/src/item.h @@ -15,16 +15,15 @@ int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, - struct scoutfs_lock *lock); + struct scoutfs_lock *lock, struct scoutfs_lock *primary); int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock); -int scoutfs_item_delete_force(struct super_block *sb, - struct scoutfs_key *key, - struct scoutfs_lock *lock); +int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key, + struct scoutfs_lock *lock, struct scoutfs_lock *primary); u64 scoutfs_item_dirty_pages(struct super_block *sb); int scoutfs_item_write_dirty(struct super_block *sb); diff --git a/tests/golden/simple-inode-index b/tests/golden/simple-inode-index index 3b2036e8..57f77933 100644 --- a/tests/golden/simple-inode-index +++ b/tests/golden/simple-inode-index @@ -7,3 +7,4 @@ found second == changing metadata must increase meta seq == changing contents must increase data seq == make sure dirtying doesn't livelock walk +== concurrent update attempts maintain single entries diff --git a/tests/tests/simple-inode-index.sh b/tests/tests/simple-inode-index.sh index 514a4348..414c0db3 100644 --- a/tests/tests/simple-inode-index.sh +++ 
b/tests/tests/simple-inode-index.sh @@ -103,4 +103,34 @@ while [ "$nr" -lt 100 ]; do ((nr++)) done +# +# make sure rapid concurrent metadata updates don't create multiple +# meta_seq entries +# +# we had a bug where deletion items created under write_only locks +# could get versions older than the items they're deleting which were +# protected by read/write locks. +# +echo "== concurrent update attempts maintain single entries" +FILES=4 +nr=1 +while [ "$nr" -lt 10 ]; do + # touch a bunch of files in parallel from all mounts + for i in $(t_fs_nrs); do + eval path="\$T_D${i}" + seq -f "$path/file-%.0f" 1 $FILES | xargs touch & + done + wait || t_fail "concurrent file updates failed" + + # make sure no inodes have duplicate entries + sync + scoutfs walk-inodes -p "$T_D0" meta_seq -- 0 -1 | \ + grep -v "minor" | \ + awk '{print $4}' | \ + sort -n | uniq -c | \ + awk '($1 != 1)' | \ + sort -n + ((nr++)) +done + t_pass