mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-22 15:52:13 +00:00
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:
1. raft_topology_cmd_handler caught exceptions and returned a bare
fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
"failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
"streaming failed" message.
This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").
Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
[[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
catch blocks.
- Including error_message in the exception thrown by
exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
the hardcoded "streaming failed".
A follow-up test is in https://github.com/scylladb/scylladb/pull/29363
Fixes: SCYLLADB-1404
Closes scylladb/scylladb#29362
394 lines
16 KiB
C++
394 lines
16 KiB
C++
/*
|
|
* Copyright (C) 2022-present ScyllaDB
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <functional>
|
|
#include <set>
|
|
#include <unordered_set>
|
|
#include <unordered_map>
|
|
#include <seastar/core/condition-variable.hh>
|
|
#include <seastar/core/sstring.hh>
|
|
#include "cdc/generation_id.hh"
|
|
#include "dht/token.hh"
|
|
#include "raft/raft.hh"
|
|
#include "utils/UUID.hh"
|
|
#include "service/session.hh"
|
|
#include "mutation/canonical_mutation.hh"
|
|
#include "replica/database_fwd.hh"
|
|
#include "locator/host_id.hh"
|
|
#include "gms/feature_service.hh"
|
|
|
|
// Forward declaration only: this header refers to db::system_keyspace
// by reference (see topology_state_machine::wait_for_request_completion).
namespace db {
class system_keyspace;
}
|
|
|
|
namespace service {
|
|
|
|
// Forward declarations; both types are only used by reference in this header.
class raft_group0;
class group0_guard;
|
|
|
|
// State of a single node; stored per node in replica_state::state.
// Enumerator order defines the underlying uint16_t values, so new
// states must not be inserted in the middle.
enum class node_state: uint16_t {
    none,                // the new node joined group0 but has not bootstrapped yet (has no tokens and data to serve)
    bootstrapping,       // the node is currently in the process of streaming its part of the ring
    decommissioning,     // the node is being decommissioned and streams its data to nodes that took over
    removing,            // the node is being removed and its data is streamed to nodes that took over from still alive owners
    replacing,           // the node replaces another dead node in the cluster and its data is being streamed to it
    rebuilding,          // the node is being rebuilt and is streaming data from other replicas
    normal,              // the node does not do any streaming and serves the slice of the ring that belongs to it
    left,                // the node left the cluster and group0
};
|
|
|
|
// The order of the requests is a priority
// order in which requests are executed in case
// there are multiple requests in the queue.
// The order tries to minimize the amount of cleanups.
enum class topology_request: uint16_t {
    replace,
    join,
    remove,
    leave,
    rebuild
};
|
|
|
|
// Cleanup state of a node; stored per node in replica_state::cleanup.
enum class cleanup_status : uint16_t {
    clean,
    needed,
    running,
};
|
|
|
|
struct join_param {
|
|
uint32_t num_tokens;
|
|
sstring tokens_string;
|
|
};
|
|
|
|
struct rebuild_param {
|
|
sstring source_dc;
|
|
};
|
|
|
|
struct replace_param {
|
|
raft::server_id replaced_id;
|
|
};
|
|
|
|
// Parameter payload of a topology request: exactly one of the join,
// rebuild or replace parameter structs. Stored per node in topology::req_param.
using request_param = std::variant<join_param, rebuild_param, replace_param>;
|
|
|
|
// Kinds of global topology requests, i.e. requests that are not tied to a
// specific node (see topology::global_request).
enum class global_topology_request: uint16_t {
    new_cdc_generation,
    cleanup,
    keyspace_rf_change,
    truncate_table,

    // High priority no-operation request.
    // Used to synchronize API calls with topology coordinator.
    // Ensures that all later requests and tablet scheduler will see prior updates to group0.
    noop_request,
    snapshot_tables,
    finalize_migration,
};
|
|
|
|
struct ring_slice {
|
|
std::unordered_set<dht::token> tokens;
|
|
};
|
|
|
|
// The intended storage mode for a node during vnodes-to-tablets migration.
//
// When migrating a table from vnodes to tablets, each node needs to reshard
// its local SSTables on vnode boundaries. Conversely, SSTables need to be
// resharded in the opposite direction (i.e., with the static sharder) when the
// operation is rolled back. This property is an indicator that a node needs to
// perform resharding on restart, and it declares the resharding direction.
enum class intended_storage_mode : uint16_t {
    vnodes,
    tablets,
};
|
|
|
|
struct replica_state {
|
|
node_state state;
|
|
seastar::sstring datacenter;
|
|
seastar::sstring rack;
|
|
seastar::sstring release_version;
|
|
std::optional<ring_slice> ring; // if engaged contain the set of tokens the node owns together with their state
|
|
size_t shard_count;
|
|
uint8_t ignore_msb;
|
|
std::set<sstring> supported_features;
|
|
cleanup_status cleanup;
|
|
utils::UUID request_id; // id of the current request for the node or the last one if no current one exists
|
|
std::optional<intended_storage_mode> storage_mode;
|
|
};
|
|
|
|
struct topology_features {
|
|
// Supported features, for normal nodes
|
|
std::unordered_map<raft::server_id, std::set<sstring>> normal_supported_features;
|
|
|
|
// Features that are considered enabled by the cluster
|
|
std::set<sstring> enabled_features;
|
|
|
|
// Calculates a set of features that are supported by all normal nodes but not yet enabled.
|
|
std::set<sstring> calculate_not_yet_enabled_features() const;
|
|
};
|
|
|
|
struct topology {
|
|
enum class transition_state: uint16_t {
|
|
join_group0,
|
|
commit_cdc_generation,
|
|
tablet_draining, // deprecated, not set after feature_service::parallel_tablet_draining is enabled.
|
|
write_both_read_old,
|
|
write_both_read_new,
|
|
tablet_migration,
|
|
tablet_resize_finalization,
|
|
tablet_split_finalization, // deprecated in favor of tablet_resize_finalization, kept for backward compatibility.
|
|
left_token_ring,
|
|
rollback_to_normal,
|
|
truncate_table,
|
|
lock,
|
|
snapshot_tables,
|
|
};
|
|
|
|
std::optional<transition_state> tstate;
|
|
|
|
using version_t = int64_t;
|
|
static constexpr version_t initial_version = 1;
|
|
version_t version = initial_version;
|
|
version_t fence_version = initial_version;
|
|
|
|
// Nodes that are normal members of the ring
|
|
std::unordered_map<raft::server_id, replica_state> normal_nodes;
|
|
// Nodes that are left
|
|
std::unordered_set<raft::server_id> left_nodes;
|
|
// Left nodes for which we need topology information.
|
|
std::unordered_map<raft::server_id, replica_state> left_nodes_rs;
|
|
// Nodes that are waiting to be joined by the topology coordinator
|
|
std::unordered_map<raft::server_id, replica_state> new_nodes;
|
|
// Nodes that are in the process to be added to the ring
|
|
// Currently at most one node at a time will be here, but the code shouldn't assume it
|
|
// because we might support parallel operations in the future.
|
|
std::unordered_map<raft::server_id, replica_state> transition_nodes;
|
|
|
|
// Pending topology requests
|
|
std::unordered_map<raft::server_id, topology_request> requests;
|
|
|
|
// Paused topology requests.
|
|
// Those are pending requests which are ignored by the scheduler
|
|
// because they are waiting for the node to be drained of tablet replicas first.
|
|
std::unordered_map<raft::server_id, topology_request> paused_requests;
|
|
|
|
// Holds parameters for a request per node and valid during entire
|
|
// operation until the node becomes normal
|
|
std::unordered_map<raft::server_id, request_param> req_param;
|
|
|
|
// Pending global topology request (i.e. not related to any specific node).
|
|
std::optional<global_topology_request> global_request;
|
|
|
|
// Pending global topology request's id, which is a new group0's state id
|
|
std::optional<utils::UUID> global_request_id;
|
|
|
|
// A queue of pending global topology request's ids. Replaces the single one above
|
|
std::vector<utils::UUID> global_requests_queue;
|
|
|
|
// The IDs of the committed CDC generations sorted by timestamps.
|
|
// The obsolete generations may not be in this list as they are continually deleted.
|
|
std::vector<cdc::generation_id> committed_cdc_generations;
|
|
|
|
// This is the time UUID used to access the data of a new CDC generation introduced
|
|
// e.g. when a new node bootstraps, needed in `commit_cdc_generation` transition state.
|
|
// It's used as the first column of the clustering key in CDC_GENERATIONS_V3 table.
|
|
std::optional<utils::UUID> new_cdc_generation_data_uuid;
|
|
|
|
// The name of the KS that is being the target of the scheduled ALTER KS statement
|
|
std::optional<sstring> new_keyspace_rf_change_ks_name;
|
|
// The KS options to be used when executing the scheduled ALTER KS statement
|
|
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
|
|
|
|
// The ids of RF change requests that are paused because they require tablet co-location.
|
|
// It may happen during altering from numerical RF to rack list.
|
|
std::unordered_set<utils::UUID> paused_rf_change_requests;
|
|
|
|
// The ids of ongoing RF change requests.
|
|
// Here we keep the ids only for rf-changes using rack_lists.
|
|
std::unordered_set<utils::UUID> ongoing_rf_changes;
|
|
|
|
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
|
|
std::vector<cdc::generation_id> unpublished_cdc_generations;
|
|
|
|
// Set of features that are considered to be enabled by the cluster.
|
|
std::set<sstring> enabled_features;
|
|
|
|
// Session used to create topology_guard for operations like streaming.
|
|
session_id session;
|
|
|
|
// When false, tablet load balancer will not try to rebalance tablets.
|
|
bool tablet_balancing_enabled = true;
|
|
|
|
// The set of nodes that should be considered dead during topology operations
|
|
std::unordered_set<raft::server_id> ignored_nodes;
|
|
|
|
// The set of nodes currently excluded from synchronization in the tablets management code.
|
|
// The barrier should not wait for these nodes.
|
|
// This set is effectively equal to: ignored_nodes + keys(left_nodes_rs).
|
|
// Tablet replicas may temporarily include left nodes (e.g. when a node is replaced),
|
|
// hence the need for this field.
|
|
std::unordered_set<raft::server_id> excluded_tablet_nodes;
|
|
|
|
// Find only nodes in non 'left' state
|
|
const std::pair<const raft::server_id, replica_state>* find(raft::server_id id) const;
|
|
// Return true if node exists in any state including 'left' one
|
|
bool contains(raft::server_id id);
|
|
// Number of nodes that are not in the 'left' state
|
|
size_t size() const;
|
|
// Are there any non-left nodes?
|
|
bool is_empty() const;
|
|
|
|
// Returns false iff we can safely start a new topology change.
|
|
bool is_busy() const;
|
|
|
|
std::optional<topology_request> get_request(raft::server_id) const;
|
|
std::optional<request_param> get_request_param(raft::server_id) const;
|
|
static raft::server_id parse_replaced_node(const std::optional<request_param>&);
|
|
|
|
// Calculates a set of features that are supported by all normal nodes but not yet enabled.
|
|
std::set<sstring> calculate_not_yet_enabled_features() const;
|
|
|
|
// Returns the set of zero-token normal nodes.
|
|
std::unordered_set<raft::server_id> get_normal_zero_token_nodes() const;
|
|
};
|
|
|
|
struct raft_snapshot {
|
|
// FIXME: handle this with rpc streaming instead as we can't guarantee size bounds.
|
|
utils::chunked_vector<canonical_mutation> mutations;
|
|
};
|
|
|
|
struct raft_snapshot_pull_params {
|
|
std::vector<table_id> tables;
|
|
};
|
|
|
|
// State machine that is responsible for topology change
|
|
struct topology_state_machine {
|
|
using topology_type = topology;
|
|
topology_type _topology;
|
|
condition_variable event;
|
|
size_t reload_count = 0;
|
|
|
|
// Called by the tablet split monitor when all local storage groups
|
|
// for a table are split-ready, to trigger an early load stats
|
|
// refresh so the coordinator can finalize the resize promptly.
|
|
std::function<void()> on_tablet_split_ready;
|
|
|
|
future<> await_not_busy();
|
|
future<sstring> wait_for_request_completion(db::system_keyspace& sys_ks, utils::UUID id, bool require_entry);
|
|
|
|
// Generates mutations that cancel a topology request which is active on the given node.
|
|
// If no request is found, or it cannot be canceled at this stage, no mutations are generated.
|
|
// In case it's topology_request::join/replace, you must also call respond_to_joining_node().
|
|
void generate_cancel_request_update(utils::chunked_vector<canonical_mutation>& muts,
|
|
gms::feature_service& features,
|
|
const group0_guard& guard,
|
|
raft::server_id node,
|
|
sstring reason);
|
|
|
|
// Initiates abort of a topology request with a given ID.
|
|
// Returns a failed future if request is not abortable.
|
|
// Doesn't wait until request is done. Use wait_for_request_completion() for that.
|
|
future<> abort_request(raft_group0&, abort_source&, gms::feature_service&, utils::UUID request_id);
|
|
};
|
|
|
|
// Raft leader uses this command to drive bootstrap process on other nodes
struct raft_topology_cmd {
    enum class command: uint16_t {
        barrier,              // request to wait for the latest topology
        barrier_and_drain,    // same + drain requests which use previous versions
        stream_ranges,        // request to stream data, return when streaming is
                              // done
        wait_for_ip           // wait for a joining node IP to appear in gossiper
    };
    command cmd;

    // Intentionally left non-explicit: a `command` value converts implicitly
    // to a raft_topology_cmd, and existing callers may rely on that.
    raft_topology_cmd(command c) : cmd(c) {}
};
|
|
|
|
// returned as a result of raft_bootstrap_cmd
|
|
struct raft_topology_cmd_result {
|
|
enum class command_status: uint16_t {
|
|
fail,
|
|
success
|
|
};
|
|
command_status status = command_status::fail;
|
|
// Carries the error description back to the topology coordinator
|
|
// when the command fails.
|
|
sstring error_message;
|
|
};
|
|
|
|
// This class is used in RPC's signatures to hold the topology_version of the caller.
|
|
// The reason why we wrap the topology_version in this class is that we anticipate
|
|
// other versions to occur in the future, such as the schema version.
|
|
struct fencing_token {
|
|
topology::version_t topology_version{0};
|
|
// topology_version == 0 means the caller is not aware about
|
|
// the fencing or doesn't use it for some reason.
|
|
explicit operator bool() const {
|
|
return topology_version != 0;
|
|
}
|
|
};
|
|
|
|
struct topology_request_state {
|
|
bool done;
|
|
sstring error;
|
|
};
|
|
|
|
struct node_validation_success {};
|
|
struct node_validation_failure {
|
|
sstring reason;
|
|
};
|
|
using node_validation_result = std::variant<node_validation_success, node_validation_failure>;
|
|
|
|
node_validation_result validate_removing_node(replica::database&, locator::host_id);
|
|
topology::transition_state transition_state_from_string(const sstring& s);
|
|
node_state node_state_from_string(const sstring& s);
|
|
std::optional<topology_request> try_topology_request_from_string(const sstring& s);
|
|
topology_request topology_request_from_string(const sstring& s);
|
|
global_topology_request global_topology_request_from_string(const sstring&);
|
|
cleanup_status cleanup_status_from_string(const sstring& s);
|
|
intended_storage_mode intended_storage_mode_from_string(const sstring& s);
|
|
}
|
|
|
|
// fmt support for service::cleanup_status. parse() consumes nothing from the
// format spec; format() is defined out of line.
template <> struct fmt::formatter<service::cleanup_status> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    auto format(service::cleanup_status status, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::intended_storage_mode. parse() consumes nothing
// from the format spec; format() is defined out of line.
template <> struct fmt::formatter<service::intended_storage_mode> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    auto format(service::intended_storage_mode mode, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::fencing_token: prints the topology version wrapped
// in literal braces, i.e. `{<topology_version>}`.
template <> struct fmt::formatter<service::fencing_token> : fmt::formatter<string_view> {
    auto format(const service::fencing_token& fencing_token, fmt::format_context& ctx) const {
        // "{{" and "}}" are escaped braces; the inner "{}" is the placeholder.
        return fmt::format_to(ctx.out(), "{{{}}}", fencing_token.topology_version);
    }
};
|
|
|
|
// fmt support for service::topology::transition_state; format() is defined out of line.
template <> struct fmt::formatter<service::topology::transition_state> : fmt::formatter<string_view> {
    auto format(service::topology::transition_state, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::node_state; format() is defined out of line.
template <> struct fmt::formatter<service::node_state> : fmt::formatter<string_view> {
    auto format(service::node_state, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::topology_request; format() is defined out of line.
template <> struct fmt::formatter<service::topology_request> : fmt::formatter<string_view> {
    auto format(service::topology_request, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::global_topology_request; format() is defined out of line.
template <> struct fmt::formatter<service::global_topology_request> : fmt::formatter<string_view> {
    auto format(service::global_topology_request, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|
|
|
|
// fmt support for service::raft_topology_cmd::command; format() is defined out of line.
template <> struct fmt::formatter<service::raft_topology_cmd::command> : fmt::formatter<string_view> {
    auto format(service::raft_topology_cmd::command, fmt::format_context& ctx) const -> decltype(ctx.out());
};
|