From 7007dabdf97239fe3adf839c93d72c477cc4e636 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Wed, 19 Feb 2025 10:38:40 -0300 Subject: [PATCH] storage_service: Don't retry split when table is dropped The split monitor wasn't handling the scenario where the table being split is dropped. The monitor would be unable to find the tablet map of such a table, and the error would be treated as a retryable one, causing the monitor to fall into an endless retry loop, with sleeps in between. And that would block further splits, since the monitor would be busy with the retries. The fix is to detect that the table was dropped and skip to the next candidate, if any. Fixes #21859. Signed-off-by: Raphael S. Carvalho Closes scylladb/scylladb#22933 (cherry picked from commit 4d8a333a7fc14b095c006f0d00773aeb47106cf0) Closes scylladb/scylladb#23480 --- locator/tablets.cc | 11 ++++++++--- locator/tablets.hh | 5 +++++ service/storage_service.cc | 6 ++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/locator/tablets.cc b/locator/tablets.cc index 87b4a72c0f..748d3e975d 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -235,18 +235,23 @@ tablet_transition_info migration_to_transition_info(const tablet_info& ti, const }; } +no_such_tablet_map::no_such_tablet_map(const table_id& id) + : runtime_error{fmt::format("Tablet map not found for table {}", id)} +{ +} + const tablet_map& tablet_metadata::get_tablet_map(table_id id) const { try { return *_tablets.at(id); } catch (const std::out_of_range&) { - throw_with_backtrace<std::runtime_error>(format("Tablet map not found for table {}", id)); + throw_with_backtrace<no_such_tablet_map>(id); } } void tablet_metadata::mutate_tablet_map(table_id id, noncopyable_function<void(tablet_map&)> func) { auto it = _tablets.find(id); if (it == _tablets.end()) { - throw std::runtime_error(format("Tablet map not found for table {}", id)); + throw no_such_tablet_map(id); } auto tablet_map_copy = make_lw_shared<tablet_map>(*it->second); func(*tablet_map_copy); @@ -256,7 +261,7 @@ void
tablet_metadata::mutate_tablet_map(table_id id, noncopyable_function<void(t future<> tablet_metadata::mutate_tablet_map_async(table_id id, noncopyable_function<future<>(tablet_map&)> func) { auto it = _tablets.find(id); if (it == _tablets.end()) { - throw std::runtime_error(format("Tablet map not found for table {}", id)); + throw no_such_tablet_map(id); } auto tablet_map_copy = make_lw_shared<tablet_map>(*it->second); co_await func(*tablet_map_copy); diff --git a/locator/tablets.hh b/locator/tablets.hh index 82cfaa6130..01c6189550 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -393,6 +393,11 @@ struct tablet_desc { const tablet_transition_info* transition; // null if there's no transition. }; +class no_such_tablet_map : public std::runtime_error { +public: + no_such_tablet_map(const table_id& id); +}; + /// Stores information about tablets of a single table. /// /// The map contains a constant number of tablets, tablet_count(). diff --git a/service/storage_service.cc b/service/storage_service.cc index 09bb258345..ffb690503d 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -5454,6 +5454,12 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep release_guard(std::move(guard)); co_await split_all_compaction_groups(); } + } catch (const locator::no_such_tablet_map& ex) { + slogger.warn("Failed to complete splitting of table {} due to {}", table, ex); + break; + } catch (const replica::no_such_column_family& ex) { + slogger.warn("Failed to complete splitting of table {} due to {}", table, ex); + break; } catch (const seastar::abort_requested_exception& ex) { slogger.warn("Failed to complete splitting of table {} due to {}", table, ex); break;