From 71ed4512dc9fcc6563fbe1977820cbc398cd1eea Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 15 Nov 2022 13:01:17 -0800 Subject: [PATCH 1/2] Include primary lock write_seq for write_only vers FS items are deleted by logging a deletion item that has a greater item version than the item to delete. The versions are usually maintained by the write_seq of the exclusive write lock that protects the item. Any newer write hold will have a greater version than all previous write holds so any items created under the lock will have a greater vers than all previous items under the lock. All deletion items will be merged with the older item and both will be dropped. This doesn't work for concurrent write-only locks. The write-only locks match with each other so their write_seqs are assigned in the order that they are granted. That grant order can be mismatched with item creation order. We can get deletion items with lesser versions than the item to delete because of when each creation's write-only lock was granted. Write-only locks are used to maintain consistency between concurrent writers and readers, not between writers. Consistency between writers is done with another primary write lock. For example, if you're writing seq items to a write-only region you need to have the write lock on the inode for the specific seq item you're writing. The fix, then, is to pass these primary write locks down to the item cache so that it can choose an item version that is the greatest amongst the transaction, the write-only lock, and the primary lock. This now ensures that the primary lock's increasing write_seq makes it down to the item, bringing item version ordering in line with exclusive holds of the primary lock. All of this to fix concurrent inode updates sometimes leaving behind duplicate meta_seq items because old seq item deletions ended up with older versions than the seq item they tried to delete, nullifying the deletion. 
Signed-off-by: Zach Brown --- kmod/src/dir.c | 13 +++++------ kmod/src/inode.c | 39 ++++++++++++++++++--------------- kmod/src/inode.h | 6 ++++-- kmod/src/item.c | 56 +++++++++++++++++++++++++++++++++--------------- kmod/src/item.h | 7 +++--- 5 files changed, 75 insertions(+), 46 deletions(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 0d7df458..21d29b6b 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -840,7 +840,7 @@ retry: goto out; if (del_orphan) { - ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret) goto out; } @@ -852,7 +852,7 @@ retry: scoutfs_ino(inode), inode->i_mode, dir_lock, inode_lock); if (ret) { - err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */ goto out; } @@ -951,7 +951,7 @@ retry: goto unlock; if (should_orphan(inode)) { - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; } @@ -959,7 +959,7 @@ retry: ret = del_entry_items(sb, scoutfs_ino(dir), le64_to_cpu(dent.hash), le64_to_cpu(dent.pos), scoutfs_ino(inode), dir_lock, inode_lock); if (ret) { - ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock, inode_lock); WARN_ON_ONCE(ret); /* should have been dirty */ goto out; } @@ -1669,7 +1669,8 @@ retry: ins_old = true; if (should_orphan(new_inode)) { - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock, + new_inode_lock); if (ret) goto out; } @@ -1831,7 +1832,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod return 
PTR_ERR(inode); si = SCOUTFS_I(inode); - ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); + ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock, inode_lock); if (ret < 0) goto out; /* XXX returning error but items created */ diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 02456fde..6d775515 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -955,7 +955,8 @@ void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, static int update_index_items(struct super_block *sb, struct scoutfs_inode_info *si, u64 ino, u8 type, u64 major, u32 minor, - struct list_head *lock_list) + struct list_head *lock_list, + struct scoutfs_lock *primary) { struct scoutfs_lock *ins_lock; struct scoutfs_lock *del_lock; @@ -972,7 +973,7 @@ static int update_index_items(struct super_block *sb, scoutfs_inode_init_index_key(&ins, type, major, minor, ino); ins_lock = find_index_lock(lock_list, type, major, minor, ino); - ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock); + ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock, primary); if (ret || !will_del_index(si, type, major, minor)) return ret; @@ -984,7 +985,7 @@ static int update_index_items(struct super_block *sb, del_lock = find_index_lock(lock_list, type, get_item_major(si, type), get_item_minor(si, type), ino); - ret = scoutfs_item_delete_force(sb, &del, del_lock); + ret = scoutfs_item_delete_force(sb, &del, del_lock, primary); if (ret) { err = scoutfs_item_delete(sb, &ins, ins_lock); BUG_ON(err); @@ -996,7 +997,8 @@ static int update_index_items(struct super_block *sb, static int update_indices(struct super_block *sb, struct scoutfs_inode_info *si, u64 ino, umode_t mode, struct scoutfs_inode *sinode, - struct list_head *lock_list) + struct list_head *lock_list, + struct scoutfs_lock *primary) { struct index_update { u8 type; @@ -1016,7 +1018,7 @@ static int update_indices(struct super_block *sb, continue; ret = update_index_items(sb, si, ino, 
upd->type, upd->major, - upd->minor, lock_list); + upd->minor, lock_list, primary); if (ret) break; } @@ -1056,7 +1058,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock, /* only race with other inode field stores once */ store_inode(&sinode, inode); - ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list); + ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list, lock); BUG_ON(ret); scoutfs_inode_init_key(&key, ino); @@ -1325,7 +1327,7 @@ void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list) /* this is called on final inode cleanup so enoent is fine */ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, - u32 minor, struct list_head *ind_locks) + u32 minor, struct list_head *ind_locks, struct scoutfs_lock *primary) { struct scoutfs_key key; struct scoutfs_lock *lock; @@ -1334,7 +1336,7 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, scoutfs_inode_init_index_key(&key, type, major, minor, ino); lock = find_index_lock(ind_locks, type, major, minor, ino); - ret = scoutfs_item_delete_force(sb, &key, lock); + ret = scoutfs_item_delete_force(sb, &key, lock, primary); if (ret == -ENOENT) ret = 0; return ret; @@ -1351,16 +1353,17 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major, */ static int remove_index_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode, - struct list_head *ind_locks) + struct list_head *ind_locks, + struct scoutfs_lock *primary) { umode_t mode = le32_to_cpu(sinode->mode); int ret; ret = remove_index(sb, ino, SCOUTFS_INODE_INDEX_META_SEQ_TYPE, - le64_to_cpu(sinode->meta_seq), 0, ind_locks); + le64_to_cpu(sinode->meta_seq), 0, ind_locks, primary); if (ret == 0 && S_ISREG(mode)) ret = remove_index(sb, ino, SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE, - le64_to_cpu(sinode->data_seq), 0, ind_locks); + le64_to_cpu(sinode->data_seq), 0, ind_locks, primary); return ret; } @@ -1493,22 
+1496,24 @@ static void init_orphan_key(struct scoutfs_key *key, u64 ino) * zone under a write only lock while the caller has the inode protected * by a write lock. */ -int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_key key; init_orphan_key(&key, ino); - return scoutfs_item_create_force(sb, &key, NULL, 0, lock); + return scoutfs_item_create_force(sb, &key, NULL, 0, lock, primary); } -int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock) +int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_key key; init_orphan_key(&key, ino); - return scoutfs_item_delete_force(sb, &key, lock); + return scoutfs_item_delete_force(sb, &key, lock, primary); } /* @@ -1561,7 +1566,7 @@ retry: release = true; - ret = remove_index_items(sb, ino, sinode, &ind_locks); + ret = remove_index_items(sb, ino, sinode, &ind_locks, lock); if (ret) goto out; @@ -1576,7 +1581,7 @@ retry: if (ret < 0) goto out; - ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock); + ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock, lock); if (ret < 0) goto out; diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 88058117..6feb3f9e 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -125,8 +125,10 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int scoutfs_setattr(struct dentry *dentry, struct iattr *attr); -int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); -int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); +int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary); +int scoutfs_inode_orphan_delete(struct 
super_block *sb, u64 ino, struct scoutfs_lock *lock, + struct scoutfs_lock *primary); void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb); void scoutfs_inode_queue_writeback(struct inode *inode); diff --git a/kmod/src/item.c b/kmod/src/item.c index 05b04550..8c7900d5 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1676,6 +1676,14 @@ static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key, return 0; } +static int optional_lock_mode_match(struct scoutfs_lock *lock, int mode) +{ + if (WARN_ON_ONCE(lock && lock->mode != mode)) + return -EINVAL; + else + return 0; +} + /* * Copy the cached item's value into the caller's value. The number of * bytes copied is returned. A null val returns 0. @@ -1832,12 +1840,19 @@ out: * also increase the seqs. It lets us limit the inputs of item merging * to the last stable seq and ensure that all the items in open * transactions and granted locks will have greater seqs. + * + * This is a little awkward for WRITE_ONLY locks which can have much + * older versions than the version of locked primary data that they're + * operating on behalf of. Callers can optionally provide that primary + * lock to get the version from. This ensures that items created under + * WRITE_ONLY locks can not have versions less than their primary data. */ -static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock) +static u64 item_seq(struct super_block *sb, struct scoutfs_lock *lock, + struct scoutfs_lock *primary) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - return max(sbi->trans_seq, lock->write_seq); + return max3(sbi->trans_seq, lock->write_seq, primary ? 
primary->write_seq : 0); } /* @@ -1872,7 +1887,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, if (!item || item->deletion) { ret = -ENOENT; } else { - item->seq = item_seq(sb, lock); + item->seq = item_seq(sb, lock, NULL); mark_item_dirty(sb, cinf, pg, NULL, item); ret = 0; } @@ -1889,10 +1904,10 @@ out: */ static int item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock, - int mode, bool force) + struct scoutfs_lock *primary, int mode, bool force) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, primary); struct cached_item *found; struct cached_item *item; struct cached_page *pg; @@ -1902,7 +1917,8 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_create); - if ((ret = lock_safe(lock, key, mode))) + if ((ret = lock_safe(lock, key, mode)) || + (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -1943,15 +1959,15 @@ out: int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { - return item_create(sb, key, val, val_len, lock, + return item_create(sb, key, val, val_len, lock, NULL, SCOUTFS_LOCK_READ, false); } int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, - struct scoutfs_lock *lock) + struct scoutfs_lock *lock, struct scoutfs_lock *primary) { - return item_create(sb, key, val, val_len, lock, + return item_create(sb, key, val, val_len, lock, primary, SCOUTFS_LOCK_WRITE_ONLY, true); } @@ -1965,7 +1981,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, NULL); struct cached_item *item; 
struct cached_item *found; struct cached_page *pg; @@ -2025,12 +2041,16 @@ out: * current items so the caller always writes with write only locks. If * combining the current delta item and the caller's item results in a * null we can just drop it, we don't have to emit a deletion item. + * + * Delta items don't have to worry about creating items with old + * versions under write_only locks. The versions don't impact how we + * merge two items. */ int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, NULL); struct cached_item *item; struct cached_page *pg; struct rb_node **pnode; @@ -2099,10 +2119,11 @@ out: * deletion item if there isn't one already cached. */ static int item_delete(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock, int mode, bool force) + struct scoutfs_lock *lock, struct scoutfs_lock *primary, + int mode, bool force) { DECLARE_ITEM_CACHE_INFO(sb, cinf); - const u64 seq = item_seq(sb, lock); + const u64 seq = item_seq(sb, lock, primary); struct cached_item *item; struct cached_page *pg; struct rb_node **pnode; @@ -2111,7 +2132,8 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_delete); - if ((ret = lock_safe(lock, key, mode))) + if ((ret = lock_safe(lock, key, mode)) || + (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -2161,13 +2183,13 @@ out: int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock) { - return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE, false); + return item_delete(sb, key, lock, NULL, SCOUTFS_LOCK_WRITE, false); } int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key, - struct scoutfs_lock *lock) + struct scoutfs_lock *lock, 
struct scoutfs_lock *primary) { - return item_delete(sb, key, lock, SCOUTFS_LOCK_WRITE_ONLY, true); + return item_delete(sb, key, lock, primary, SCOUTFS_LOCK_WRITE_ONLY, true); } u64 scoutfs_item_dirty_pages(struct super_block *sb) diff --git a/kmod/src/item.h b/kmod/src/item.h index 431866d5..d7fe6a2e 100644 --- a/kmod/src/item.h +++ b/kmod/src/item.h @@ -15,16 +15,15 @@ int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, - struct scoutfs_lock *lock); + struct scoutfs_lock *lock, struct scoutfs_lock *primary); int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock); -int scoutfs_item_delete_force(struct super_block *sb, - struct scoutfs_key *key, - struct scoutfs_lock *lock); +int scoutfs_item_delete_force(struct super_block *sb, struct scoutfs_key *key, + struct scoutfs_lock *lock, struct scoutfs_lock *primary); u64 scoutfs_item_dirty_pages(struct super_block *sb); int scoutfs_item_write_dirty(struct super_block *sb); From 701f1a9538ca2f52b6a70de80cd382fd6cab640c Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 15 Nov 2022 12:34:52 -0800 Subject: [PATCH 2/2] Add test that checks duplicate meta_seq entries Add a quick test of the index items to make sure that rapid inode updates don't create duplicate meta_seq items. 
Signed-off-by: Zach Brown --- tests/golden/simple-inode-index | 1 + tests/tests/simple-inode-index.sh | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/golden/simple-inode-index b/tests/golden/simple-inode-index index 3b2036e8..57f77933 100644 --- a/tests/golden/simple-inode-index +++ b/tests/golden/simple-inode-index @@ -7,3 +7,4 @@ found second == changing metadata must increase meta seq == changing contents must increase data seq == make sure dirtying doesn't livelock walk +== concurrent update attempts maintain single entries diff --git a/tests/tests/simple-inode-index.sh b/tests/tests/simple-inode-index.sh index 514a4348..414c0db3 100644 --- a/tests/tests/simple-inode-index.sh +++ b/tests/tests/simple-inode-index.sh @@ -103,4 +103,34 @@ while [ "$nr" -lt 100 ]; do ((nr++)) done +# +# make sure rapid concurrent metadata updates don't create multiple +# meta_seq entries +# +# we had a bug where deletion items created under concurrent_write locks +# could get versions older than the items they're deleting which were +# protected by read/write locks. +# +echo "== concurrent update attempts maintain single entries" +FILES=4 +nr=1 +while [ "$nr" -lt 10 ]; do + # touch a bunch of files in parallel from all mounts + for i in $(t_fs_nrs); do + eval path="\$T_D${i}" + seq -f "$path/file-%.0f" 1 $FILES | xargs touch & + done + wait || t_fail "concurrent file updates failed" + + # make sure no inodes have duplicate entries + sync + scoutfs walk-inodes -p "$T_D0" meta_seq -- 0 -1 | \ + grep -v "minor" | \ + awk '{print $4}' | \ + sort -n | uniq -c | \ + awk '($1 != 1)' | \ + sort -n + ((nr++)) +done + t_pass