From b477604339ec9c8d2e3d5d2ab542872eca97047a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 6 Oct 2021 13:42:50 -0700 Subject: [PATCH] Don't clobber srch compact errors The srch compaction worker will wait a bit before attempting another compaction as it finishes a compaction that failed. Unfortunately, it clobbered the errors it got during compaction with the result of sending the commit to the server with the error flag. If the commit is successful then it thinks there were no errors and immediately re-queues itself to try the next compaction. If the error is persistent, as it was with a bug in how we merged log files with a single page's worth of entries, then we can spin indefinitely getting an error, clobbering the error with the commit result, and immediately queueing our work to do it all over again. This fix preserves existing errors when getting the result of the commit and will correctly back off. If we get persistent merge errors at least they won't consume significant resources. We add a counter for the compaction errors so we can get some visibility if this happens. 
Signed-off-by: Zach Brown --- kmod/src/counters.h | 1 + kmod/src/srch.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 234c489d..b3e68bd4 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -178,6 +178,7 @@ EXPAND_COUNTER(srch_add_entry) \ EXPAND_COUNTER(srch_compact_dirty_block) \ EXPAND_COUNTER(srch_compact_entry) \ + EXPAND_COUNTER(srch_compact_error) \ EXPAND_COUNTER(srch_compact_flush) \ EXPAND_COUNTER(srch_compact_log_page) \ EXPAND_COUNTER(srch_compact_removed_entry) \ diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 54b0711d..b23fc6c2 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -28,6 +28,7 @@ #include "btree.h" #include "spbm.h" #include "client.h" +#include "counters.h" #include "scoutfs_trace.h" /* @@ -2128,6 +2129,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work) struct scoutfs_alloc alloc; unsigned long delay; int ret; + int err; sc = kmalloc(sizeof(struct scoutfs_srch_compact), GFP_NOFS); if (sc == NULL) { @@ -2166,10 +2168,14 @@ commit: sc->meta_freed = alloc.freed; sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0; - ret = scoutfs_client_srch_commit_compact(sb, sc); + err = scoutfs_client_srch_commit_compact(sb, sc); + if (err < 0 && ret == 0) + ret = err; out: /* our allocators and files should be stable */ WARN_ON_ONCE(ret == -ESTALE); + if (ret < 0) + scoutfs_inc_counter(sb, srch_compact_error); scoutfs_block_writer_forget_all(sb, &wri); if (!atomic_read(&srinf->shutdown)) {