From e3c7e21c40a2bccbb5ae0d62e08540cef27bdbdf Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 19 Jan 2022 09:17:38 -0800 Subject: [PATCH 01/13] Use write memory barrier in set_shutting_down The server's little set_shutting_down() helper accidentally used a read barrier instead of a write barrier. Signed-off-by: Zach Brown --- kmod/src/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index e84d1198..c825bc78 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -172,7 +172,7 @@ static bool test_shutting_down(struct server_info *server) static void set_shutting_down(struct server_info *server, bool val) { server->shutting_down = val; - smp_rmb(); + smp_wmb(); } static void stop_server(struct server_info *server) From 89ca903c4122fa32c7887256bb6139636adec32e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 19 Jan 2022 09:21:02 -0800 Subject: [PATCH 02/13] Print log trees get/commit seqs Back when we added the get/commit transaction sequence numbers to the log_trees we forgot to add them to the scoutfs print output. Signed-off-by: Zach Brown --- utils/src/print.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/src/print.c b/utils/src/print.c index f9792c5e..7328b6f8 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -278,6 +278,8 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void " data_freed: "ALCROOT_F"\n" " srch_file: "SRF_FMT"\n" " inode_count_delta: %lld\n" + " get_trans_seq: %lld\n" + " commit_trans_seq: %lld\n" " max_item_seq: %llu\n" " finalize_seq: %llu\n" " rid: %016llx\n" @@ -296,6 +298,8 @@ static int print_log_trees_item(struct scoutfs_key *key, u64 seq, u8 flags, void ALCROOT_A(<->data_freed), SRF_A(<->srch_file), le64_to_cpu(lt->inode_count_delta), + le64_to_cpu(lt->get_trans_seq), + le64_to_cpu(lt->commit_trans_seq), le64_to_cpu(lt->max_item_seq), le64_to_cpu(lt->finalize_seq), le64_to_cpu(lt->rid), From e2ce5ab6da80cfe845a6d9d1a9bbcb821fcee41a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 19 Jan 2022 09:22:48 -0800 Subject: [PATCH 03/13] Free pending recovery state on shutdown scoutfs_recov_shutdown() tried to move the recovery tracking structs off the shared list and into a private list so they could be freed. But then it went and walked the now empty shared list to free entries. It should walk the private list. This would leak a small amount of memory in the rare cases where the server was shutdown while recovery was still pending. Signed-off-by: Zach Brown --- kmod/src/recov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/src/recov.c b/kmod/src/recov.c index 3829760b..78123e27 100644 --- a/kmod/src/recov.c +++ b/kmod/src/recov.c @@ -262,7 +262,7 @@ void scoutfs_recov_shutdown(struct super_block *sb) recinf->timeout_fn = NULL; spin_unlock(&recinf->lock); - list_for_each_entry_safe(pend, tmp, &recinf->pending, head) { + list_for_each_entry_safe(pend, tmp, &list, head) { list_del(&pend->head); kfree(pend); } From 813ce24d798adb107251f23eb5a2636501ee8d8b Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 19 Jan 2022 09:30:40 -0800 Subject: [PATCH 04/13] Move local-force-unmount test script into tests/ The local-force-unmount fenced fencing script only works when all the mounts are on the local host and it uses force unmount. It is only used in our specific local testing scripts. Packaging it as an example lead people to believe that it could be used to cobble together a multi-host testing network, however temporary. Move it from being in utils and packged to being private to our tests so that it doesn't present an attractive nuisance. Signed-off-by: Zach Brown --- .../fenced-local-force-unmount.sh | 0 tests/run-tests.sh | 7 ++++--- utils/fenced/scoutfs-fenced.conf.example | 5 ++++- utils/scoutfs-utils.spec.in | 1 - 4 files changed, 8 insertions(+), 5 deletions(-) rename utils/fenced/local-force-unmount => tests/fenced-local-force-unmount.sh (100%) diff --git a/utils/fenced/local-force-unmount b/tests/fenced-local-force-unmount.sh similarity index 100% rename from utils/fenced/local-force-unmount rename to tests/fenced-local-force-unmount.sh diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 76e1eee2..5f826474 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -227,8 +227,9 @@ test "$T_QUORUM" -le "$T_NR_MOUNTS" || \ die "-q quorum mmembers must not be greater than -n mounts" # top level paths -T_KMOD=$(realpath "$(dirname $0)/../kmod") -T_UTILS=$(realpath "$T_KMOD/../utils") +T_TESTS=$(realpath "$(dirname $0)") +T_KMOD=$(realpath "$T_TESTS/../kmod") +T_UTILS=$(realpath "$T_TESTS/../utils") test -d "$T_KMOD" || die "kmod/ repo dir $T_KMOD not directory" test -d "$T_UTILS" || die "utils/ repo dir $T_UTILS not directory" @@ -382,7 +383,7 @@ cmd grep . /sys/kernel/debug/tracing/options/trace_printk \ conf="$T_RESULTS/scoutfs-fencd.conf" cat > $conf << EOF SCOUTFS_FENCED_DELAY=1 -SCOUTFS_FENCED_RUN=$T_UTILS/fenced/local-force-unmount +SCOUTFS_FENCED_RUN=$T_TESTS/fenced-local-force-unmount.sh SCOUTFS_FENCED_RUN_ARGS="" EOF export SCOUTFS_FENCED_CONFIG_FILE="$conf" diff --git a/utils/fenced/scoutfs-fenced.conf.example b/utils/fenced/scoutfs-fenced.conf.example index 3a2ec326..3868ded0 100644 --- a/utils/fenced/scoutfs-fenced.conf.example +++ b/utils/fenced/scoutfs-fenced.conf.example @@ -1,3 +1,6 @@ +# delay, in seconds, between each check for pending fence requests. SCOUTFS_FENCED_DELAY=1 -SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount +# path to executable to run to service fence request +#SCOUTFS_FENCED_RUN= +# arguments to pass to binary SCOUTFS_FENCED_RUN_ARGS="" diff --git a/utils/scoutfs-utils.spec.in b/utils/scoutfs-utils.spec.in index 2c2a551e..4568b73f 100644 --- a/utils/scoutfs-utils.spec.in +++ b/utils/scoutfs-utils.spec.in @@ -55,7 +55,6 @@ install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced -install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example From e14912974d3ff1f9cb6e933a65af3adba3e6f705 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 09:46:35 -0800 Subject: [PATCH 05/13] Wait for lock recovery before sending farewell We recently found that the server can send a farewell response and try to tear down a client's lock state while it was still in lock recovery with the client. The lock recovery response could add a lock for the client after farell's reclaim_rid() had thought the client was gone forever and tore down its locks. This left a lock in the lock server that wasn't associated with any clients and so could never be invalidated. Attempts to acquire conflicting locks with it would hang forever, which we saw as hangs in testing with lots of unmounting. We tried to fix it by serializing incoming request handling and forcefully clobbering the client's lock state as we first got the farewell request. That went very badly. This takes another approach of trying to explicitly wait for lock recovery to finish before sending farewell responses. It's more in line with the overall pattern of having the client be up and functional until farewell tears it down. With this in place we can revert the other attempted fix that was causing so many problems. Signed-off-by: Zach Brown --- kmod/src/server.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kmod/src/server.c b/kmod/src/server.c index c825bc78..6c9c3d38 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -3462,6 +3462,18 @@ static void farewell_worker(struct work_struct *work) } } + /* + * Responses that are ready to send can be further delayed by + * moving them back to the reqs list. + */ + list_for_each_entry_safe(fw, tmp, &send, entry) { + /* finish lock recovery before destroying locks, fenced if too long */ + if (scoutfs_recov_is_pending(sb, fw->rid, SCOUTFS_RECOV_LOCKS)) { + list_move_tail(&fw->entry, &reqs); + quo_reqs++; + } + } + /* clean up resources for mounts before sending responses */ list_for_each_entry_safe(fw, tmp, &send, entry) { ret = reclaim_rid(sb, fw->rid); @@ -3656,8 +3668,14 @@ static void finished_recovery(struct super_block *sb) void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which) { + DECLARE_SERVER_INFO(sb, server); + if (scoutfs_recov_finish(sb, rid, which) > 0) finished_recovery(sb); + + /* rid's farewell response might be sent after it finishes lock recov */ + if (which & SCOUTFS_RECOV_LOCKS) + queue_farewell_work(server); } /* From 5f2259c48f5d09858f786527a45b19e6279baf7a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 09:52:36 -0800 Subject: [PATCH 06/13] Revert "Fix client/server race btwn lock recov and farewell" This reverts commit 61ad84489198b009b91c3e2a08b802ec1beae567. This fix was trying to ensure that lock recovery response handling can't run after farewell calls reclaim_rid() by jumping through a bunch of hoops to tear down locking state as the first farewell request arrived. It introduced very slippery use after free during shutdown. It appears that it was from drain_workqueue() previously being able to stop chaining work. That's no longer possible when you're trying to drain two workqueues that can queue work in each other. We found a much clearer way to solve the problem so we can toss this. Signed-off-by: Zach Brown --- kmod/src/net.c | 37 +++---------------------------------- kmod/src/net.h | 2 -- kmod/src/server.c | 29 ----------------------------- 3 files changed, 3 insertions(+), 65 deletions(-) diff --git a/kmod/src/net.c b/kmod/src/net.c index e3b6a4a6..e028ce22 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -675,28 +675,18 @@ static void scoutfs_net_recv_worker(struct work_struct *work) scoutfs_tseq_add(&ninf->msg_tseq_tree, &mrecv->tseq_entry); - /* - * We want to drain the proc_workq in order to ensure that - * that the inflight lock recovery work is fully flushed out - * so that we can prevent the client/server racing trying to - * do lock recovery and processing farewell at the same time. - */ - if (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) - drain_workqueue(conn->proc_workq); - /* - * Initial received greetings and farewell are processed + * Initial received greetings are processed * synchronously before any other incoming messages. * * Incoming requests or responses to the lock client are * called synchronously to avoid reordering. */ if (nh.cmd == SCOUTFS_NET_CMD_GREETING || - (nh.cmd == SCOUTFS_NET_CMD_FAREWELL && conn->listening_conn) || (nh.cmd == SCOUTFS_NET_CMD_LOCK && !conn->listening_conn)) scoutfs_net_proc_worker(&mrecv->proc_work); else - queue_work(conn->proc_workq, &mrecv->proc_work); + queue_work(conn->workq, &mrecv->proc_work); } if (ret) @@ -861,7 +851,6 @@ static void scoutfs_net_destroy_worker(struct work_struct *work) } destroy_workqueue(conn->workq); - destroy_workqueue(conn->proc_workq); scoutfs_tseq_del(&ninf->conn_tseq_tree, &conn->tseq_entry); kfree(conn->info); trace_scoutfs_conn_destroy_free(conn); @@ -1161,8 +1150,6 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work) /* wait for socket and proc work to finish, includes chained work */ drain_workqueue(conn->workq); - drain_workqueue(conn->proc_workq); - /* tear down the sock now that all work is done */ if (conn->sock) { sock_release(conn->sock); @@ -1360,7 +1347,7 @@ scoutfs_net_alloc_conn(struct super_block *sb, return NULL; } - conn->workq = alloc_workqueue("scoutfs_net_workq_%s", + conn->workq = alloc_workqueue("scoutfs_net_%s", WQ_UNBOUND | WQ_NON_REENTRANT, 0, name_suffix); if (!conn->workq) { @@ -1369,16 +1356,6 @@ scoutfs_net_alloc_conn(struct super_block *sb, return NULL; } - conn->proc_workq = alloc_workqueue("scoutfs_net_proc_workq_%s", - WQ_UNBOUND | WQ_NON_REENTRANT, 0, - name_suffix); - if (!conn->proc_workq) { - destroy_workqueue(conn->workq); - kfree(conn->info); - kfree(conn); - return NULL; - } - conn->sb = sb; conn->notify_up = notify_up; conn->notify_down = notify_down; @@ -1411,14 +1388,6 @@ scoutfs_net_alloc_conn(struct super_block *sb, return conn; } -/* Give the caller the client info of the connection. This is used by - * server processing the server_farewell, and lock response/recovery. - */ -void *scoutfs_net_client_info(struct scoutfs_net_connection *conn) -{ - return conn->info; -} - /* * Give the caller the client rid of the connection. This used by rare * server processing callers who want to send async responses after diff --git a/kmod/src/net.h b/kmod/src/net.h index dd8408a7..847e9204 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -63,7 +63,6 @@ struct scoutfs_net_connection { atomic64_t recv_seq; struct workqueue_struct *workq; - struct workqueue_struct *proc_workq; struct work_struct listen_work; struct work_struct connect_work; struct work_struct send_work; @@ -116,7 +115,6 @@ scoutfs_net_alloc_conn(struct super_block *sb, scoutfs_net_notify_t notify_up, scoutfs_net_notify_t notify_down, size_t info_size, scoutfs_net_request_t *req_funcs, char *name_suffix); -void *scoutfs_net_client_info(struct scoutfs_net_connection *conn); u64 scoutfs_net_client_rid(struct scoutfs_net_connection *conn); int scoutfs_net_connect(struct super_block *sb, struct scoutfs_net_connection *conn, diff --git a/kmod/src/server.c b/kmod/src/server.c index 6c9c3d38..24401a53 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -122,7 +122,6 @@ struct server_info { struct server_client_info { u64 rid; struct list_head head; - bool received_farewell; }; static __le64 *first_valopt(struct scoutfs_volume_options *valopt) @@ -1506,15 +1505,11 @@ static int server_lock(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { - struct server_client_info *sci = scoutfs_net_client_info(conn); u64 rid = scoutfs_net_client_rid(conn); if (arg_len != sizeof(struct scoutfs_net_lock)) return -EINVAL; - if (sci->received_farewell) - return scoutfs_net_response(sb, conn, cmd, id, -EINVAL, NULL, 0); - return scoutfs_lock_server_request(sb, rid, id, arg); } @@ -1523,15 +1518,11 @@ static int lock_response(struct super_block *sb, void *resp, unsigned int resp_len, int error, void *data) { - struct server_client_info *sci = scoutfs_net_client_info(conn); u64 rid = scoutfs_net_client_rid(conn); if (resp_len != sizeof(struct scoutfs_net_lock)) return -EINVAL; - if (sci->received_farewell) - return 0; - return scoutfs_lock_server_response(sb, rid, resp); } @@ -1569,15 +1560,11 @@ static int lock_recover_response(struct super_block *sb, void *resp, unsigned int resp_len, int error, void *data) { - struct server_client_info *sci = scoutfs_net_client_info(conn); u64 rid = scoutfs_net_client_rid(conn); if (invalid_recover(resp, resp_len)) return -EINVAL; - if (sci->received_farewell) - return 0; - return scoutfs_lock_server_recover_response(sb, rid, resp); } @@ -3541,11 +3528,9 @@ static int server_farewell(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, u64 id, void *arg, u16 arg_len) { - struct server_client_info *sci = scoutfs_net_client_info(conn); struct server_info *server = SCOUTFS_SB(sb)->server_info; u64 rid = scoutfs_net_client_rid(conn); struct farewell_request *fw; - int ret; if (arg_len != 0) return -EINVAL; @@ -3563,20 +3548,6 @@ static int server_farewell(struct super_block *sb, list_add_tail(&fw->entry, &server->farewell_requests); spin_unlock(&server->farewell_lock); - /* - * Tear down client lock server state and set that we recieved farewell - * to ensure that we do not race between client and server trying to process - * lock recovery at the same time (race). We also want to mark that the recovery - * finished so that if client's try to send stuff later; the server doesnt care. - */ - sci->received_farewell = true; - ret = scoutfs_lock_server_farewell(sb, rid); - if (ret < 0) { - kfree(fw); - return ret; - } - scoutfs_server_recov_finish(sb, rid, SCOUTFS_RECOV_LOCKS); - queue_farewell_work(server); /* response will be sent later */ From e9b3cc873a4db8874f9814f6dae888422c0e3dfc Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 15:56:54 -0800 Subject: [PATCH 07/13] Export scoutfs_inode_init_key We're adding an ioctl that wants to build inode item keys so let's export the private inode key initializer. Signed-off-by: Zach Brown --- kmod/src/inode.c | 12 ++++++------ kmod/src/inode.h | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index bbc48daf..fb45645c 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -276,7 +276,7 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) set_item_info(si, cinode); } -static void init_inode_key(struct scoutfs_key *key, u64 ino) +void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino) { *key = (struct scoutfs_key) { .sk_zone = SCOUTFS_FS_ZONE, @@ -317,7 +317,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock, if (atomic64_read(&si->last_refreshed) == refresh_gen) return 0; - init_inode_key(&key, scoutfs_ino(inode)); + scoutfs_inode_init_key(&key, scoutfs_ino(inode)); mutex_lock(&si->item_mutex); if (atomic64_read(&si->last_refreshed) < refresh_gen) { @@ -803,7 +803,7 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock) store_inode(&sinode, inode); - init_inode_key(&key, scoutfs_ino(inode)); + scoutfs_inode_init_key(&key, scoutfs_ino(inode)); ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock); if (!ret) @@ -1022,7 +1022,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock, ret = update_indices(sb, si, ino, inode->i_mode, &sinode, lock_list); BUG_ON(ret); - init_inode_key(&key, ino); + scoutfs_inode_init_key(&key, ino); err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock); if (err) { @@ -1421,7 +1421,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, set_inode_ops(inode); store_inode(&sinode, inode); - init_inode_key(&key, scoutfs_ino(inode)); + scoutfs_inode_init_key(&key, scoutfs_ino(inode)); ret = scoutfs_omap_inc(sb, ino); if (ret < 0) @@ -1546,7 +1546,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo goto out; } - init_inode_key(&key, ino); + scoutfs_inode_init_key(&key, ino); ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 9b98b966..f02f34cc 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -83,6 +83,7 @@ void scoutfs_inode_queue_iput(struct inode *inode); struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf); struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); +void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino); void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, u32 minor, u64 ino); int scoutfs_inode_index_start(struct super_block *sb, u64 *seq); From 7a96e03148d8dd382e56d993455eab9300949e52 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 16:00:06 -0800 Subject: [PATCH 08/13] Add get_allocated_inos ioctl Add an ioctl that can give some indication of inodes that have inode items. We're exposing this for tests that verify the handling of open unlinked inodes. Signed-off-by: Zach Brown --- kmod/src/ioctl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ kmod/src/ioctl.h | 39 +++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 014527ec..d30a5021 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -1320,6 +1320,84 @@ out: return ret ?: count; } +static long scoutfs_ioc_get_allocated_inos(struct file *file, unsigned long arg) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct scoutfs_ioctl_get_allocated_inos __user *ugai = (void __user *)arg; + struct scoutfs_ioctl_get_allocated_inos gai; + struct scoutfs_lock *lock = NULL; + struct scoutfs_key key; + struct scoutfs_key end; + u64 __user *uinos; + u64 bytes; + u64 ino; + int nr; + int ret; + + if (!(file->f_mode & FMODE_READ)) { + ret = -EBADF; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + + if (copy_from_user(&gai, ugai, sizeof(gai))) { + ret = -EFAULT; + goto out; + } + + if ((gai.inos_ptr & (sizeof(__u64) - 1)) || (gai.inos_bytes < sizeof(__u64))) { + ret = -EINVAL; + goto out; + } + + scoutfs_inode_init_key(&key, gai.start_ino); + scoutfs_inode_init_key(&end, gai.start_ino | SCOUTFS_LOCK_INODE_GROUP_MASK); + uinos = (void __user *)gai.inos_ptr; + bytes = gai.inos_bytes; + nr = 0; + + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, gai.start_ino, &lock); + if (ret < 0) + goto out; + + while (bytes >= sizeof(*uinos)) { + + ret = scoutfs_item_next(sb, &key, &end, NULL, 0, lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + if (key.sk_zone != SCOUTFS_FS_ZONE) { + ret = 0; + break; + } + + /* all fs items are owned by allocated inodes, and _first is always ino */ + ino = le64_to_cpu(key._sk_first); + if (put_user(ino, uinos)) { + ret = -EFAULT; + break; + } + + uinos++; + bytes -= sizeof(*uinos); + if (++nr == INT_MAX) + break; + + scoutfs_inode_init_key(&key, ino + 1); + } + + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); +out: + return ret ?: nr; +} + long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1353,6 +1431,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_resize_devices(file, arg); case SCOUTFS_IOC_READ_XATTR_TOTALS: return scoutfs_ioc_read_xattr_totals(file, arg); + case SCOUTFS_IOC_GET_ALLOCATED_INOS: + return scoutfs_ioc_get_allocated_inos(file, arg); } return -ENOTTY; diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 24cd9a46..459ee7fb 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -520,4 +520,43 @@ struct scoutfs_ioctl_xattr_total { #define SCOUTFS_IOC_READ_XATTR_TOTALS \ _IOW(SCOUTFS_IOCTL_MAGIC, 15, struct scoutfs_ioctl_read_xattr_totals) +/* + * This fills the caller's inos array with inode numbers that are in use + * after the start ino, within an internal inode group. + * + * This only makes a promise about the state of the inode numbers within + * the first and last numbers returned by one call. At one time, all of + * those inodes were still allocated. They could have changed before + * the call returned. And any numbers outside of the first and last + * (or single) are undefined. + * + * This doesn't iterate over all allocated inodes, it only probes a + * single group that the start inode is within. This interface was + * first introduced to support tests that needed to find out about a + * specific inode, while having some other similarly niche uses. It is + * unsuitable for a consistent iteration over all the inode numbers in + * use. + * + * This test of inode items doesn't serialize with the inode lifetime + * mechanism. It only tells you the numbers of inodes that were once + * active in the system and haven't yet been fully deleted. The inode + * numbers returned could have been in the process of being deleted and + * were already unreachable even before the call started. + * + * @start_ino: the first inode number that could be returned + * @inos_ptr: pointer to an aligned array of 64bit inode numbers + * @inos_bytes: the number of bytes available in the inos_ptr array + * + * Returns errors or the count of inode numbers returned, quite possibly + * including 0. + */ +struct scoutfs_ioctl_get_allocated_inos { + __u64 start_ino; + __u64 inos_ptr; + __u64 inos_bytes; +}; + +#define SCOUTFS_IOC_GET_ALLOCATED_INOS \ + _IOW(SCOUTFS_IOCTL_MAGIC, 16, struct scoutfs_ioctl_get_allocated_inos) + #endif From e06796171471f1b6ccc555377f5ea6454517ade4 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 16:18:35 -0800 Subject: [PATCH 09/13] Add get-allocated-inos scoutfs command Add the get-allocated-inos scoutfs command which wraps the GET_ALLOCATED_INOS ioctl. It'll be used by tests to find items associated with an inode instead of trying to open the inode by a constructed handle after it was unlinked. Signed-off-by: Zach Brown --- utils/man/scoutfs.8 | 27 +++++++ utils/src/get_allocated_inos.c | 137 +++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 utils/src/get_allocated_inos.c diff --git a/utils/man/scoutfs.8 b/utils/man/scoutfs.8 index 53662e97..6bece55c 100644 --- a/utils/man/scoutfs.8 +++ b/utils/man/scoutfs.8 @@ -617,6 +617,33 @@ command is used first. .RE .PD +.TP +.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]" +.sp +This debugging command prints allocated inode numbers. It only prints +inodes +found in the group that contains the starting inode. The printed inode +numbers aren't necessarily reachable. They could be anywhere in the +process from being unlinked to finally deleted when their items +were found. +.RS 1.0i +.PD 0 +.TP +.sp +.B "-i, --ino INO" +The first 64bit inode number which could be printed. +.TP +.B "-s, --single" +Only print the single starting inode when it is allocated, all other allocated +inode numbers will be ignored. +.TP +.B "-p, --path PATH" +A path within a ScoutFS filesystem. +.RE +.PD + +.TP + .SH SEE ALSO .BR scoutfs (5), .BR xattr (7), diff --git a/utils/src/get_allocated_inos.c b/utils/src/get_allocated_inos.c new file mode 100644 index 00000000..cc9039b2 --- /dev/null +++ b/utils/src/get_allocated_inos.c @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "ioctl.h" +#include "cmd.h" + +struct get_allocated_inos_args { + char *path; + u64 ino; + bool have_ino; + bool single; +}; + +static int do_get_allocated_inos(struct get_allocated_inos_args *args) +{ + struct scoutfs_ioctl_get_allocated_inos gai; + u64 *inos = NULL; + int fd = -1; + u64 bytes; + int ret; + int i; + + if (args->single) + bytes = sizeof(*inos); + else + bytes = SCOUTFS_LOCK_INODE_GROUP_NR * sizeof(*inos); + + inos = malloc(bytes); + if (!inos) { + fprintf(stderr, "inode number array allocation failed\n"); + ret = -ENOMEM; + goto out; + } + + fd = get_path(args->path, O_RDONLY); + if (fd < 0) + return fd; + + memset(&gai, 0, sizeof(gai)); + gai.start_ino = args->ino; + gai.inos_ptr = (unsigned long)inos; + gai.inos_bytes = bytes; + + ret = ioctl(fd, SCOUTFS_IOC_GET_ALLOCATED_INOS, &gai); + if (ret < 0) { + ret = -errno; + fprintf(stderr, "get_allocated_inos ioctl failed: " + "%s (%d)\n", strerror(errno), errno); + goto out; + } + + if (args->single && ret > 0 && inos[0] != args->ino) + ret = 0; + + for (i = 0; i < ret; i++) + printf("%llu\n", inos[i]); + + ret = 0; +out: + if (fd >= 0) + close(fd); + free(inos); + + return ret; +}; + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct get_allocated_inos_args *args = state->input; + int ret; + + switch (key) { + case 'i': + ret = parse_u64(arg, &args->ino); + if (ret) + return ret; + args->have_ino = true; + case 'p': + args->path = strdup_or_error(state, arg); + break; + case 's': + args->single = true; + break; + case ARGP_KEY_FINI: + if (!args->have_ino) + argp_error(state, "must provide --ino starting inode number option"); + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "ino", 'i', "NUMBER", 0, "Start from 64bit inode number (required)"}, + { "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"}, + { "single", 's', NULL, 0, "Only print single specific inode number argument"}, + { NULL } +}; + +static struct argp argp = { + options, + parse_opt, + NULL, + "Print allocated inode numbers from starting inode number" +}; + +static int get_allocated_inos_cmd(int argc, char **argv) +{ + + struct get_allocated_inos_args get_allocated_inos_args = {NULL}; + int ret; + + ret = argp_parse(&argp, argc, argv, 0, NULL, &get_allocated_inos_args); + if (ret) + return ret; + + return do_get_allocated_inos(&get_allocated_inos_args); +} + +static void __attribute__((constructor)) get_allocated_inos_ctor(void) +{ + cmd_register_argp("get-allocated-inos", &argp, GROUP_DEBUG, get_allocated_inos_cmd); +} From 9fa2c6af89cc86e33154458db395825ca9482de4 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 16:26:35 -0800 Subject: [PATCH 10/13] Use get-allocated-inos in orphan-inodes test The orphan inodes test needs to test if inode items exist as it manipulates inodes. It used to open the inode by a handle but we're fixing that to not allow opening unlinked files. The get-allocated-inos ioctl tests for the presence of items owned by the inode regardless of any other vfs state so we can use it to verify what scoutfs is doing as we work with the vfs inodes. Signed-off-by: Zach Brown --- tests/tests/orphan-inodes.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index bc55d35b..ffd9f498 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -26,7 +26,8 @@ inode_exists() { local ino="$1" - handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1 + scoutfs get-allocated-inos -i "$ino" -s -p "$T_M0" > $T_TMP.inos.log 2>&1 + test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino" } echo "== test our inode existance function" From cff17a4cae0b0b8a3bd0cd56c5e3adf76084fc09 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 17:17:50 -0800 Subject: [PATCH 11/13] Remove unused flags scoutfs_inode_refresh arg The flags argument to scoutfs_inode_refresh wasn't being used. Signed-off-by: Zach Brown --- kmod/src/inode.c | 5 ++--- kmod/src/inode.h | 3 +-- kmod/src/lock.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index fb45645c..0c91c96a 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -296,8 +296,7 @@ void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino) * fields because they should have already had a locked refreshed inode * to be dereferencing its contents. */ -int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock, - int flags) +int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; @@ -721,7 +720,7 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf) atomic64_set(&si->last_refreshed, 0); inode->i_version = 0; - ret = scoutfs_inode_refresh(inode, lock, 0); + ret = scoutfs_inode_refresh(inode, lock); if (ret == 0) ret = scoutfs_omap_inc(sb, ino); if (ret) { diff --git a/kmod/src/inode.h b/kmod/src/inode.h index f02f34cc..90a3b2da 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -118,8 +118,7 @@ u64 scoutfs_inode_data_version(struct inode *inode); void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off); int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock); -int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock, - int flags); +int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock); int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int scoutfs_setattr(struct dentry *dentry, struct iattr *attr); diff --git a/kmod/src/lock.c b/kmod/src/lock.c index ca674c8f..f0d0238d 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -1050,7 +1050,7 @@ int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int goto out; if (flags & SCOUTFS_LKF_REFRESH_INODE) { - ret = scoutfs_inode_refresh(inode, *lock, flags); + ret = scoutfs_inode_refresh(inode, *lock); if (ret < 0) { scoutfs_unlock(sb, *lock, mode); *lock = NULL; From 15d7eec1f9c6040094aac3056b955adade36ba21 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 17:19:03 -0800 Subject: [PATCH 12/13] Disallow openening unlinked files by handle Our open by handle functions didn't care that the inode wasn't referenced and let tasks open unlinked inodes by number. This interacted badly with the inode deletion mechanisms which required that inodes couldn't be cached on other nodes after the transaction which removed their final reference. If a task did accidentally open a file by inode while it was being deleted it could see the inode items in an inconsistent state and return very confusing errors that look like corruption. The fix is to give the handle iget callers a flag to tell iget to only get the inode if it has a positive nlink. If iget sees that the inode has been unlinked it returns enoent. Signed-off-by: Zach Brown --- kmod/src/dir.c | 2 +- kmod/src/export.c | 6 +++--- kmod/src/inode.c | 44 ++++++++++++++++++++++++++++---------------- kmod/src/inode.h | 3 ++- kmod/src/super.c | 2 +- 5 files changed, 35 insertions(+), 22 deletions(-) diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 7470244c..a4b6dc09 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -511,7 +511,7 @@ out: else if (ino == 0) inode = NULL; else - inode = scoutfs_iget(sb, ino, 0); + inode = scoutfs_iget(sb, ino, 0, 0); /* * We can't splice dir aliases into the dcache. dir entries diff --git a/kmod/src/export.c b/kmod/src/export.c index 5321f9cd..29b89bc3 100644 --- a/kmod/src/export.c +++ b/kmod/src/export.c @@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb, trace_scoutfs_fh_to_dentry(sb, fh_type, sfid); if (scoutfs_valid_fileid(fh_type)) - inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0); + inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0, SCOUTFS_IGF_LINKED); return d_obtain_alias(inode); } @@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb, if (scoutfs_valid_fileid(fh_type) && fh_type == FILEID_SCOUTFS_WITH_PARENT) - inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0); + inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0, SCOUTFS_IGF_LINKED); return d_obtain_alias(inode); } @@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child) scoutfs_dir_free_backref_path(sb, &list); trace_scoutfs_get_parent(sb, inode, ino); - inode = scoutfs_iget(sb, ino, 0); + inode = scoutfs_iget(sb, ino, 0, SCOUTFS_IGF_LINKED); return d_obtain_alias(inode); } diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 0c91c96a..ac711ac8 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -696,21 +696,20 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino) return ilookup5(sb, ino, scoutfs_iget_test, &ino); } -struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf) +struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf) { struct scoutfs_lock *lock = NULL; struct scoutfs_inode_info *si; - struct inode *inode; + struct inode *inode = NULL; int ret; ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock); - if (ret) - return ERR_PTR(ret); + if (ret < 0) + goto out; - inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, - &ino); + inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino); if (!inode) { - inode = ERR_PTR(-ENOMEM); + ret = -ENOMEM; goto out; } @@ -721,19 +720,32 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf) inode->i_version = 0; ret = scoutfs_inode_refresh(inode, lock); - if (ret == 0) - ret = scoutfs_omap_inc(sb, ino); - if (ret) { - iget_failed(inode); - inode = ERR_PTR(ret); - } else { - set_inode_ops(inode); - unlock_new_inode(inode); + if (ret < 0) + goto out; + + if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) { + ret = -ENOENT; + goto out; } + + ret = scoutfs_omap_inc(sb, ino); + if (ret < 0) + goto out; + + set_inode_ops(inode); + unlock_new_inode(inode); } + ret = 0; out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + + if (ret < 0) { + if (inode) + iget_failed(inode); + inode = ERR_PTR(ret); + } + return inode; } @@ -1854,7 +1866,7 @@ static void inode_orphan_scan_worker(struct work_struct *work) } /* try to cached and evict unused inode to delete, can be racing */ - inode = scoutfs_iget(sb, ino, 0); + inode = scoutfs_iget(sb, ino, 0, 0); if (IS_ERR(inode)) { ret = PTR_ERR(inode); if (ret == -ENOENT) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 90a3b2da..eab303ee 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -80,7 +80,8 @@ int scoutfs_drop_inode(struct inode *inode); void scoutfs_evict_inode(struct inode *inode); void scoutfs_inode_queue_iput(struct inode *inode); -struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf); +#define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */ +struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf); struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino); diff --git a/kmod/src/super.c b/kmod/src/super.c index 792b8f1a..3302ad4f 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -601,7 +601,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) goto out; /* this interruptible iget lets hung mount be aborted with ctl-c */ - inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE); + inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE, 0); if (IS_ERR(inode)) { ret = PTR_ERR(inode); if (ret == -ERESTARTSYS) From 329ac0347d86ad66f5ad82542c5ab79b37ae2aa5 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 21 Jan 2022 17:25:00 -0800 Subject: [PATCH 13/13] Remove unused scoutfs_net_cancel_request() The net _cancel_request call hasn't been used or tested in approximately a bazillion years. Best to get rid of it and have to add and test it if we think we need it again. Signed-off-by: Zach Brown --- kmod/src/net.c | 17 ----------------- kmod/src/net.h | 3 --- 2 files changed, 20 deletions(-) diff --git a/kmod/src/net.c b/kmod/src/net.c index e028ce22..7d8316b5 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -1772,23 +1772,6 @@ int scoutfs_net_response_node(struct super_block *sb, NULL, NULL, NULL); } -/* - * The response function that was submitted with the request is not - * called if the request is canceled here. - */ -void scoutfs_net_cancel_request(struct super_block *sb, - struct scoutfs_net_connection *conn, - u8 cmd, u64 id) -{ - struct message_send *msend; - - spin_lock(&conn->lock); - msend = find_request(conn, cmd, id); - if (msend) - complete_send(conn, msend); - spin_unlock(&conn->lock); -} - struct sync_request_completion { struct completion comp; void *resp; diff --git a/kmod/src/net.h b/kmod/src/net.h index 847e9204..fe2c29e4 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -134,9 +134,6 @@ int scoutfs_net_submit_request_node(struct super_block *sb, u64 rid, u8 cmd, void *arg, u16 arg_len, scoutfs_net_response_t resp_func, void *resp_data, u64 *id_ret); -void scoutfs_net_cancel_request(struct super_block *sb, - struct scoutfs_net_connection *conn, - u8 cmd, u64 id); int scoutfs_net_sync_request(struct super_block *sb, struct scoutfs_net_connection *conn, u8 cmd, void *arg, unsigned arg_len,