From 6db69b7a4f31b4e95f1cd26e6d607298e4b7b416 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 20 Jun 2024 09:54:04 -0700 Subject: [PATCH 01/10] Set root inode crtime in mkfs When we added the crtime creation timestamp to the inode we forgot to update mkfs to set the crtime of the root inode. Signed-off-by: Zach Brown --- utils/src/mkfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/src/mkfs.c b/utils/src/mkfs.c index f27a5674..211302aa 100644 --- a/utils/src/mkfs.c +++ b/utils/src/mkfs.c @@ -274,6 +274,8 @@ static int do_mkfs(struct mkfs_args *args) inode.ctime.nsec = inode.atime.nsec; inode.mtime.sec = inode.atime.sec; inode.mtime.nsec = inode.atime.nsec; + inode.crtime.sec = inode.atime.sec; + inode.crtime.nsec = inode.atime.nsec; btree_append_item(bt, &key, &inode, sizeof(inode)); ret = write_block(meta_fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, 1, blkno, From 2af6f47c8bd07c938f6164c38b93441f72145fd2 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 18 Jun 2024 12:50:55 -0700 Subject: [PATCH 02/10] Fix bad error exit path in unlink Unlink looks up the entry items for the name it is removing because we no longer store the extra key material in dentries. If this lookup fails it will use an error path which release a transaction which wasn't held. Thankfully this error path is unlikely (corruption or systemic errors like eio or enomem) so we haven't hit this in practice. Signed-off-by: Zach Brown --- kmod/src/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 357d33c6..4227457f 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -931,7 +931,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry) ret = lookup_dirent(sb, scoutfs_ino(dir), dentry->d_name.name, dentry->d_name.len, hash, &dent, dir_lock); if (ret < 0) - goto out; + goto unlock; if (should_orphan(inode)) { ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode), From 1fa0d7727c0469f8e0f46cabec06cc2e3e9abd78 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 21 Mar 2024 10:48:58 -0700 Subject: [PATCH 03/10] scoutfs_item_create checks wrong lock mode scoutfs_item_create() was checking that its lock had a read mode, when it should have been checking for a write mode. This worked out because callers with write mode locks are also protecting reads. Signed-off-by: Zach Brown --- kmod/src/item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/src/item.c b/kmod/src/item.c index 673797ef..c07b97da 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -1963,7 +1963,7 @@ int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock) { return item_create(sb, key, val, val_len, lock, NULL, - SCOUTFS_LOCK_READ, false); + SCOUTFS_LOCK_WRITE, false); } int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, From 3052feac2968dd73dbef2c018d072d88630cf9ba Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 21 Mar 2024 10:39:10 -0700 Subject: [PATCH 04/10] Have item cache show unprotected lock The item cache has a bit of safety checks that make sure that an operation is performed while holding a lock that covers the item. It dumped a stack trace via WARN when that wasn't true, but it didn't include any details about the keys or lock modes involved. This adds a message that's printed once which includes the keys and modes when an operation is attempted that isn't protected. Signed-off-by: Zach Brown --- kmod/src/item.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/kmod/src/item.c b/kmod/src/item.c index c07b97da..a99202cf 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -24,6 +24,7 @@ #include "item.h" #include "forest.h" #include "block.h" +#include "msg.h" #include "trans.h" #include "counters.h" #include "scoutfs_trace.h" @@ -1670,13 +1671,24 @@ out: return ret; } -static int lock_safe(struct scoutfs_lock *lock, struct scoutfs_key *key, +static int lock_safe(struct super_block *sb, struct scoutfs_lock *lock, struct scoutfs_key *key, int mode) { - if (WARN_ON_ONCE(!scoutfs_lock_protected(lock, key, mode))) + bool prot = scoutfs_lock_protected(lock, key, mode); + + if (!prot) { + static bool once = false; + if (!once) { + scoutfs_err(sb, "lock (start "SK_FMT" end "SK_FMT" mode 0x%x) does not protect operation (key "SK_FMT" mode 0x%x)", + SK_ARG(&lock->start), SK_ARG(&lock->end), lock->mode, + SK_ARG(key), mode); + dump_stack(); + once = true; + } return -EINVAL; - else - return 0; + } + + return 0; } static int optional_lock_mode_match(struct scoutfs_lock *lock, int mode) @@ -1718,7 +1730,7 @@ int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_lookup); - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ))) + if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_READ))) goto out; ret = get_cached_page(sb, cinf, lock, key, false, false, 0, &pg); @@ -1793,7 +1805,7 @@ int scoutfs_item_next(struct super_block *sb, struct scoutfs_key *key, goto out; } - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_READ))) + if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_READ))) goto out; pos = *key; @@ -1874,7 +1886,7 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_dirty); - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE))) + if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -1920,7 +1932,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_create); - if ((ret = lock_safe(lock, key, mode)) || + if ((ret = lock_safe(sb, lock, key, mode)) || (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; @@ -1994,7 +2006,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_update); - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE))) + if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -2062,7 +2074,7 @@ int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_delta); - if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE_ONLY))) + if ((ret = lock_safe(sb, lock, key, SCOUTFS_LOCK_WRITE_ONLY))) goto out; ret = scoutfs_forest_set_bloom_bits(sb, lock); @@ -2135,7 +2147,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key, scoutfs_inc_counter(sb, item_delete); - if ((ret = lock_safe(lock, key, mode)) || + if ((ret = lock_safe(sb, lock, key, mode)) || (ret = optional_lock_mode_match(primary, SCOUTFS_LOCK_WRITE))) goto out; From c296bc1959cacb5ab8faedd14759a0f842fdd6ca Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 1 Feb 2024 10:06:42 -0800 Subject: [PATCH 05/10] Remove scoutfs_data_wait_check_iter scoutfs_data_wait_check_iter() was checking the contiguous region of the file starting at its pos and extending for iter_iov_count() bytes. The caller can do that with the previous _data_wait_check() method by providing the same count that _check_iter() was using. Signed-off-by: Zach Brown --- kmod/src/data.c | 31 ------------------------------- kmod/src/data.h | 3 --- kmod/src/file.c | 6 ++---- 3 files changed, 2 insertions(+), 38 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 78dd74ad..54a1857b 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1807,37 +1807,6 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov, return ret; } -int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter, - u8 sef, u8 op, struct scoutfs_data_wait *dw, - struct scoutfs_lock *lock) -{ - size_t count = iov_iter_count(iter); - size_t off = iter->iov_offset; - const struct iovec *iov; - size_t len; - int ret = 0; - - for (iov = iter->iov; count > 0; iov++) { - len = iov->iov_len - off; - if (len == 0) - continue; - - /* aren't we waiting on too much data here ? */ - ret = scoutfs_data_wait_check(inode, pos, len, - sef, op, dw, lock); - - if (ret != 0) - break; - - - pos += len; - count -= len; - off = 0; - } - - return ret; -} - int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw) { DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt); diff --git a/kmod/src/data.h b/kmod/src/data.h index 1fbfce9b..a34854eb 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -65,9 +65,6 @@ int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov, unsigned long nr_segs, loff_t pos, u8 sef, u8 op, struct scoutfs_data_wait *ow, struct scoutfs_lock *lock); -int scoutfs_data_wait_check_iter(struct inode *inode, loff_t pos, struct iov_iter *iter, - u8 sef, u8 op, struct scoutfs_data_wait *ow, - struct scoutfs_lock *lock); bool scoutfs_data_wait_found(struct scoutfs_data_wait *ow); int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *ow); diff --git a/kmod/src/file.c b/kmod/src/file.c index fd31a9d0..f572a55d 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -171,10 +171,8 @@ retry: goto out; if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) { - ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, to, - SEF_OFFLINE, - SCOUTFS_IOC_DWO_READ, - &dw, scoutfs_inode_lock); + ret = scoutfs_data_wait_check(inode, iocb->ki_pos, iov_iter_count(to), SEF_OFFLINE, + SCOUTFS_IOC_DWO_READ, &dw, scoutfs_inode_lock); if (ret != 0) goto out; } else { From c385eea9a1d44e845b0b4189fbcab0ee1aa74dde Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 1 Feb 2024 09:59:47 -0800 Subject: [PATCH 06/10] Check for all offline in scoutfs_file_write_iter When we write to file contents we change the data_version. To stage old contents into an offline region the data_version of the file must match the archived copy. When writing we have to make sure that there is no offline data so that we don't increase the data_version which will prevent staging of any other file regions because the data_versions no longer match. scoutfs_file_write_iter was only checking for offline data in its write region, not the entire file. Fix it to match the _aio_write method and check the whole file. Signed-off-by: Zach Brown --- kmod/src/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kmod/src/file.c b/kmod/src/file.c index f572a55d..cac72245 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -223,10 +223,8 @@ retry: if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, scoutfs_inode_lock)) { /* data_version is per inode, whole file must be online */ - ret = scoutfs_data_wait_check_iter(inode, iocb->ki_pos, from, - SEF_OFFLINE, - SCOUTFS_IOC_DWO_WRITE, - &dw, scoutfs_inode_lock); + ret = scoutfs_data_wait_check(inode, 0, i_size_read(inode), SEF_OFFLINE, + SCOUTFS_IOC_DWO_WRITE, &dw, scoutfs_inode_lock); if (ret != 0) goto out; } From 3f773a8594ac6020d586bb4596c0c2ea393e1068 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 31 Jan 2024 17:13:39 -0800 Subject: [PATCH 07/10] Fix uninit written in scoutfs_file_write_iter scoutfs_file_write_iter tried to track written bytes and return those unless there was an error. But written was uninitialized if we got errors in any of the calls leading up to performing the write. The bytes written were also not being passed to the generic_write_sync helper. This fixes up all those inconsistencies and makes it look like the write_iter path in other filesystems. Signed-off-by: Zach Brown --- kmod/src/file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kmod/src/file.c b/kmod/src/file.c index cac72245..55c8b230 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -203,8 +203,7 @@ ssize_t scoutfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct scoutfs_lock *scoutfs_inode_lock = NULL; SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); DECLARE_DATA_WAIT(dw); - int ret; - int written; + ssize_t ret; retry: inode_lock(inode); @@ -231,7 +230,7 @@ retry: /* XXX: remove SUID bit */ - written = __generic_file_write_iter(iocb, from); + ret = __generic_file_write_iter(iocb, from); out: scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); @@ -244,10 +243,10 @@ out: goto retry; } - if (ret > 0 || ret == -EIOCBQUEUED) - ret = generic_write_sync(iocb, written); + if (ret > 0) + ret = generic_write_sync(iocb, ret); - return written ? written : ret; + return ret; } #endif From 4b87045447e2b9f691532d366bdcbaa71f2993ac Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 16 Aug 2023 12:25:48 -0700 Subject: [PATCH 08/10] Pre-declare scoutfs_lock in forest.h Definitions in forest.h use lock pointers. Pre-declare the struct so it doesn't break inclusion without lock.h, following current practice in the header. Signed-off-by: Zach Brown --- kmod/src/forest.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 30564a11..24753e89 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -4,6 +4,7 @@ struct scoutfs_alloc; struct scoutfs_block_writer; struct scoutfs_block; +struct scoutfs_lock; #include "btree.h" From d6642da44d61e2f2173ddd03fe58747f4a67f018 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 11 Jun 2024 09:58:54 -0700 Subject: [PATCH 09/10] Prevent downgrade of format version Don't let change-format-version decrease the format version. It doesn't have the machinery to go back and migrate newer structures to older structures that would be compatible with code expecting the older version. Signed-off-by: Bryant G. Duffy-Ly [zab@versity.com: split from initial patch with other changes] Signed-off-by: Zach Brown --- utils/src/change_format_version.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/utils/src/change_format_version.c b/utils/src/change_format_version.c index 071e6303..1e5e1507 100644 --- a/utils/src/change_format_version.c +++ b/utils/src/change_format_version.c @@ -119,6 +119,16 @@ static int do_change_fmt_vers(struct change_fmt_vers_args *args) goto out; } + if (le64_to_cpu(meta_super->fmt_vers) > args->fmt_vers || + le64_to_cpu(data_super->fmt_vers) > args->fmt_vers) { + ret = -EPERM; + printf("Downgrade of Meta Format Version: %llu and Data Format Version: %llu to Format Version: %llu is not allowed\n", + le64_to_cpu(meta_super->fmt_vers), + le64_to_cpu(data_super->fmt_vers), + args->fmt_vers); + goto out; + } + if (le64_to_cpu(meta_super->fmt_vers) != args->fmt_vers) { meta_super->fmt_vers = cpu_to_le64(args->fmt_vers); From 8e37be279ca6f576b348935f28733e1b8f6d64ca Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 14 Jun 2024 15:41:13 -0700 Subject: [PATCH 10/10] Use seqlock to protect inode fields We were using a seqcount to protect high frequency reads and writes to some of our private inode fields. The writers were serialized by the caller but that's a bit too easy to get wrong. We're already storing the write seqcount update so the additional internal spinlock stores in seqlocks isn't a significant additional overhead. The seqlocks also handle preemption for us. Signed-off-by: Zach Brown --- kmod/src/inode.c | 46 +++++++++++++++++++--------------------------- kmod/src/inode.h | 2 +- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 540004cc..db76f798 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -91,7 +91,7 @@ static void scoutfs_inode_ctor(void *obj) init_rwsem(&si->extent_sem); mutex_init(&si->item_mutex); - seqcount_init(&si->seqcount); + seqlock_init(&si->seqlock); si->staging = false; scoutfs_per_task_init(&si->pt_data_lock); atomic64_set(&si->data_waitq.changed, 0); @@ -566,11 +566,9 @@ static void set_trans_seq(struct inode *inode, u64 *seq) struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); if (*seq != sbi->trans_seq) { - preempt_disable(); - write_seqcount_begin(&si->seqcount); + write_seqlock(&si->seqlock); *seq = sbi->trans_seq; - write_seqcount_end(&si->seqcount); - preempt_enable(); + write_sequnlock(&si->seqlock); } } @@ -592,22 +590,18 @@ void scoutfs_inode_inc_data_version(struct inode *inode) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - preempt_disable(); - write_seqcount_begin(&si->seqcount); + write_seqlock(&si->seqlock); si->data_version++; - write_seqcount_end(&si->seqcount); - preempt_enable(); + write_sequnlock(&si->seqlock); } void scoutfs_inode_set_data_version(struct inode *inode, u64 data_version) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - preempt_disable(); - write_seqcount_begin(&si->seqcount); + write_seqlock(&si->seqlock); si->data_version = data_version; - write_seqcount_end(&si->seqcount); - preempt_enable(); + write_sequnlock(&si->seqlock); } void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off) @@ -616,8 +610,7 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off) if (inode && (on || off)) { si = SCOUTFS_I(inode); - preempt_disable(); - write_seqcount_begin(&si->seqcount); + write_seqlock(&si->seqlock); /* inode and extents out of sync, bad callers */ if (((s64)si->online_blocks + on < 0) || @@ -638,8 +631,7 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off) si->online_blocks, si->offline_blocks); - write_seqcount_end(&si->seqcount); - preempt_enable(); + write_sequnlock(&si->seqlock); } /* any time offline extents decreased we try and wake waiters */ @@ -647,16 +639,16 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off) scoutfs_data_wait_changed(inode); } -static u64 read_seqcount_u64(struct inode *inode, u64 *val) +static u64 read_seqlock_u64(struct inode *inode, u64 *val) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - unsigned int seq; + unsigned seq; u64 v; do { - seq = read_seqcount_begin(&si->seqcount); + seq = read_seqbegin(&si->seqlock); v = *val; - } while (read_seqcount_retry(&si->seqcount, seq)); + } while (read_seqretry(&si->seqlock, seq)); return v; } @@ -665,33 +657,33 @@ u64 scoutfs_inode_meta_seq(struct inode *inode) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - return read_seqcount_u64(inode, &si->meta_seq); + return read_seqlock_u64(inode, &si->meta_seq); } u64 scoutfs_inode_data_seq(struct inode *inode) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - return read_seqcount_u64(inode, &si->data_seq); + return read_seqlock_u64(inode, &si->data_seq); } u64 scoutfs_inode_data_version(struct inode *inode) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - return read_seqcount_u64(inode, &si->data_version); + return read_seqlock_u64(inode, &si->data_version); } void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); - unsigned int seq; + unsigned seq; do { - seq = read_seqcount_begin(&si->seqcount); + seq = read_seqbegin(&si->seqlock); *on = SCOUTFS_I(inode)->online_blocks; *off = SCOUTFS_I(inode)->offline_blocks; - } while (read_seqcount_retry(&si->seqcount, seq)); + } while (read_seqretry(&si->seqlock, seq)); } static int scoutfs_iget_test(struct inode *inode, void *arg) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 607ac9f8..dd8e33f3 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -47,7 +47,7 @@ struct scoutfs_inode_info { atomic64_t last_refreshed; /* initialized once for slab object */ - seqcount_t seqcount; + seqlock_t seqlock; bool staging; /* holder of i_mutex is staging */ struct scoutfs_per_task pt_data_lock; struct scoutfs_data_waitq data_waitq;