repair: add methods to skip dropped table
Schema propagation is async so one node can see the table while on the other node it is already dropped. So, if the nodes stream the table data, the latter node throws no_such_column_family. The exception is propagated to the other node, but its type is lost, so the operation fails on the other node. Add method which waits until all raft changes are applied and then checks whether given table exists. Add the function which uses the above to determine, whether the function failed because of dropped table (eg. on the remote node so the exact exception type is unknown). If so, the exception isn't rethrown.
This commit is contained in:
@@ -1126,6 +1126,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'utils/lister.cc',
|
||||
'repair/repair.cc',
|
||||
'repair/row_level.cc',
|
||||
'repair/table_check.cc',
|
||||
'exceptions/exceptions.cc',
|
||||
'auth/allow_all_authenticator.cc',
|
||||
'auth/allow_all_authorizer.cc',
|
||||
|
||||
60
repair/table_check.cc
Normal file
60
repair/table_check.cc
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (C) 2024-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
#include "replica/database.hh"
|
||||
#include "repair/table_check.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
|
||||
namespace repair {
|
||||
|
||||
future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid) {
|
||||
if (mm.use_raft()) {
|
||||
abort_on_expiry aoe(lowres_clock::now() + std::chrono::seconds{10});
|
||||
auto& as = aoe.abort_source();
|
||||
auto sub = mm.get_abort_source().subscribe([&as] () noexcept {
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
});
|
||||
|
||||
// Trigger read barrier to synchronize schema.
|
||||
co_await mm.get_group0_barrier().trigger(as);
|
||||
}
|
||||
|
||||
co_return !db.column_family_exists(uuid);
|
||||
}
|
||||
|
||||
future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
|
||||
std::function<future<>(const table_id&)> f) {
|
||||
std::exception_ptr ex = nullptr;
|
||||
try {
|
||||
co_await f(uuid);
|
||||
co_return table_dropped::no;
|
||||
} catch (replica::no_such_column_family&) {
|
||||
// No need to synchronize while we know the table was dropped.
|
||||
} catch (...) {
|
||||
// This node may still see a table while it is dropped on the remote node
|
||||
// and so the remote node returns an error. In that case we want to skip
|
||||
// that table and continue with the operation.
|
||||
//
|
||||
// But since RPC does not enable returning the exception type, the cause
|
||||
// of the failure cannot be determined. Synchronize schema to see the latest
|
||||
// changes and determine whether the table was dropped.
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
auto dropped = co_await table_sync_and_check(db, mm, uuid);
|
||||
if (!dropped) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
}
|
||||
co_return table_dropped::yes;
|
||||
}
|
||||
|
||||
}
|
||||
42
repair/table_check.hh
Normal file
42
repair/table_check.hh
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Copyright (C) 2024-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/util/bool_class.hh>
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
|
||||
using table_dropped = bool_class<class table_dropped_tag>;
|
||||
|
||||
namespace raft {
|
||||
class server;
|
||||
}
|
||||
|
||||
namespace replica {
|
||||
class database;
|
||||
}
|
||||
|
||||
namespace service {
|
||||
class migration_manager;
|
||||
}
|
||||
|
||||
namespace repair {
|
||||
|
||||
class database;
|
||||
|
||||
future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid);
|
||||
|
||||
// Runs function f on given table. If f throws and the table is dropped, the exception is swallowed.
|
||||
// Function is aimed to handle no_such_column_family on remote node or different shard, as it synchronizes
|
||||
// schema before checking the table. Prefer standard error handling whenever possible.
|
||||
future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
|
||||
std::function<future<>(const table_id&)> f);
|
||||
|
||||
}
|
||||
@@ -94,6 +94,11 @@ public:
|
||||
const migration_notifier& get_notifier() const { return _notifier; }
|
||||
service::storage_proxy& get_storage_proxy() { return _storage_proxy; }
|
||||
const service::storage_proxy& get_storage_proxy() const { return _storage_proxy; }
|
||||
abort_source& get_abort_source() noexcept { return _as; }
|
||||
const abort_source& get_abort_source() const noexcept { return _as; }
|
||||
serialized_action& get_group0_barrier() noexcept { return _group0_barrier; }
|
||||
const serialized_action& get_group0_barrier() const noexcept { return _group0_barrier; }
|
||||
bool use_raft() const noexcept { return !_enable_schema_pulls; }
|
||||
|
||||
// Disable schema pulls when Raft group 0 is fully responsible for managing schema.
|
||||
future<> disable_schema_pulls();
|
||||
|
||||
Reference in New Issue
Block a user