From 5202bb9d3c837358aa6779a10d8fca150fa0a58c Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Tue, 6 Feb 2024 13:53:42 +0100 Subject: [PATCH] repair: add methods to skip dropped table Schema propagation is async so one node can see the table while on the other node it is already dropped. So, if the nodes stream the table data, the latter node throws no_such_column_family. The exception is propagated to the other node, but its type is lost, so the operation fails on the other node. Add method which waits until all raft changes are applied and then checks whether given table exists. Add the function which uses the above to determine, whether the function failed because of dropped table (eg. on the remote node so the exact exception type is unknown). If so, the exception isn't rethrown. --- configure.py | 1 + repair/table_check.cc | 60 ++++++++++++++++++++++++++++++++++++ repair/table_check.hh | 42 +++++++++++++++++++++++++ service/migration_manager.hh | 5 +++ 4 files changed, 108 insertions(+) create mode 100644 repair/table_check.cc create mode 100644 repair/table_check.hh diff --git a/configure.py b/configure.py index b02502876d..8fb5d33935 100755 --- a/configure.py +++ b/configure.py @@ -1126,6 +1126,7 @@ scylla_core = (['message/messaging_service.cc', 'utils/lister.cc', 'repair/repair.cc', 'repair/row_level.cc', + 'repair/table_check.cc', 'exceptions/exceptions.cc', 'auth/allow_all_authenticator.cc', 'auth/allow_all_authorizer.cc', diff --git a/repair/table_check.cc b/repair/table_check.cc new file mode 100644 index 0000000000..7095749b89 --- /dev/null +++ b/repair/table_check.cc @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2024-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include "replica/database.hh" +#include "repair/table_check.hh" +#include "service/migration_manager.hh" + +namespace repair { + +future table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid) { + if (mm.use_raft()) { + abort_on_expiry aoe(lowres_clock::now() + std::chrono::seconds{10}); + auto& as = aoe.abort_source(); + auto sub = mm.get_abort_source().subscribe([&as] () noexcept { + if (!as.abort_requested()) { + as.request_abort(); + } + }); + + // Trigger read barrier to synchronize schema. + co_await mm.get_group0_barrier().trigger(as); + } + + co_return !db.column_family_exists(uuid); +} + +future with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid, + std::function(const table_id&)> f) { + std::exception_ptr ex = nullptr; + try { + co_await f(uuid); + co_return table_dropped::no; + } catch (replica::no_such_column_family&) { + // No need to synchronize while we know the table was dropped. + } catch (...) { + // This node may still see a table while it is dropped on the remote node + // and so the remote node returns an error. In that case we want to skip + // that table and continue with the operation. + // + // But since RPC does not enable returning the exception type, the cause + // of the failure cannot be determined. Synchronize schema to see the latest + // changes and determine whether the table was dropped. + ex = std::current_exception(); + } + + if (ex) { + auto dropped = co_await table_sync_and_check(db, mm, uuid); + if (!dropped) { + co_await coroutine::return_exception_ptr(std::move(ex)); + } + } + co_return table_dropped::yes; +} + +} diff --git a/repair/table_check.hh b/repair/table_check.hh new file mode 100644 index 0000000000..587ba87b5a --- /dev/null +++ b/repair/table_check.hh @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2024-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include + +#include "schema/schema_fwd.hh" + +using table_dropped = bool_class; + +namespace raft { +class server; +} + +namespace replica { +class database; +} + +namespace service { +class migration_manager; +} + +namespace repair { + +class database; + +future table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid); + +// Runs function f on given table. If f throws and the table is dropped, the exception is swallowed. +// Function is aimed to handle no_such_column_family on remote node or different shard, as it synchronizes +// schema before checking the table. Prefer standard error handling whenever possible. +future with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid, + std::function(const table_id&)> f); + +} diff --git a/service/migration_manager.hh b/service/migration_manager.hh index 71e9d645fc..f90d5756af 100644 --- a/service/migration_manager.hh +++ b/service/migration_manager.hh @@ -94,6 +94,11 @@ public: const migration_notifier& get_notifier() const { return _notifier; } service::storage_proxy& get_storage_proxy() { return _storage_proxy; } const service::storage_proxy& get_storage_proxy() const { return _storage_proxy; } + abort_source& get_abort_source() noexcept { return _as; } + const abort_source& get_abort_source() const noexcept { return _as; } + serialized_action& get_group0_barrier() noexcept { return _group0_barrier; } + const serialized_action& get_group0_barrier() const noexcept { return _group0_barrier; } + bool use_raft() const noexcept { return !_enable_schema_pulls; } // Disable schema pulls when Raft group 0 is fully responsible for managing schema. future<> disable_schema_pulls();