service: Improve error handling for split

Retry wasn't really happening: on error, the loop was exited with
`break`, so the sleep part was skipped. Also, we were treating an abort
of split during shutdown as if it were an actual error, and that confused
longevity tests that parse the logs for error-level messages. The fix
demotes the log level when we know the exception comes from shutdown.

Fixes #20890.
This commit is contained in:
Raphael S. Carvalho
2024-10-02 10:08:11 -03:00
parent 7a7a1e3558
commit bcd358595f

View File

@@ -5386,9 +5386,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
};
exponential_backoff_retry split_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
bool sleep = false;
while (!_async_gate.is_closed() && !_group0_as.abort_requested()) {
bool sleep = false;
try {
// Ensures that latest changes to tablet metadata, in group0, are visible
auto guard = co_await _group0->client().start_operation(_group0_as);
@@ -5406,11 +5406,16 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
release_guard(std::move(guard));
co_await split_all_compaction_groups();
}
} catch (const seastar::abort_requested_exception& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (raft::request_aborted& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (...) {
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
table, std::current_exception(), split_retry.sleep_time());
sleep = true;
break;
}
if (sleep) {
co_await split_retry.retry(_group0_as);