Files
scylladb/service/topology_state_machine.hh
Yaniv Michael Kaul 3cba27d25f topology: propagate error messages through raft_topology_cmd_result
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:

1. raft_topology_cmd_handler caught exceptions and returned a bare
   fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
   "failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
   "streaming failed" message.

This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").

Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
  [[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
  catch blocks.
- Including error_message in the exception thrown by
  exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
  the hardcoded "streaming failed".

A follow-up test is in https://github.com/scylladb/scylladb/pull/29363

Fixes: SCYLLADB-1404

Closes scylladb/scylladb#29362
2026-05-11 17:01:15 +03:00

394 lines
16 KiB
C++

/*
* Copyright (C) 2022-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
*/
#pragma once
#include <functional>
#include <set>
#include <unordered_set>
#include <unordered_map>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/sstring.hh>
#include "cdc/generation_id.hh"
#include "dht/token.hh"
#include "raft/raft.hh"
#include "utils/UUID.hh"
#include "service/session.hh"
#include "mutation/canonical_mutation.hh"
#include "replica/database_fwd.hh"
#include "locator/host_id.hh"
#include "gms/feature_service.hh"
// Forward declaration only: this header refers to db::system_keyspace by
// reference (see topology_state_machine::wait_for_request_completion()).
namespace db {
class system_keyspace;
}
namespace service {
// Forward declarations: both types are only used by reference in this header
// (see topology_state_machine::generate_cancel_request_update() and
// topology_state_machine::abort_request()).
class raft_group0;
class group0_guard;
// State of a single node as recorded in the topology (see replica_state::state).
enum class node_state: uint16_t {
    // The new node joined group0 but has not bootstrapped yet
    // (it has no tokens and no data to serve).
    none,
    // The node is currently in the process of streaming its part of the ring.
    bootstrapping,
    // The node is being decommissioned and streams its data to the nodes that took over.
    decommissioning,
    // The node is being removed and its data is streamed to the nodes that took over
    // from the owners that are still alive.
    removing,
    // The node replaces another, dead node in the cluster and its data is being streamed to it.
    replacing,
    // The node is being rebuilt and is streaming data from other replicas.
    rebuilding,
    // The node does not do any streaming and serves the slice of the ring that belongs to it.
    normal,
    // The node left the cluster and group0.
    left,
};
// Per-node topology requests.
//
// The enumerators are listed in priority order: when there are multiple
// requests in the queue, they are executed in this order.
// The order tries to minimize the amount of cleanups.
enum class topology_request: uint16_t {
    replace,
    join,
    remove,
    leave,
    rebuild,
};
// Cleanup status of a node (see replica_state::cleanup).
// NOTE(review): the per-enumerator notes below are inferred from the names only - confirm.
enum class cleanup_status : uint16_t {
    // Presumably: no cleanup is required.
    clean,
    // Presumably: cleanup is required but is not running yet.
    needed,
    // Presumably: cleanup is in progress.
    running,
};
// Parameters attached to a join request.
// NOTE(review): field meanings are inferred from their names - confirm against the code that fills them in.
struct join_param {
// Presumably the number of tokens for the joining node.
uint32_t num_tokens;
// Presumably an explicit list of tokens in textual form.
sstring tokens_string;
};
// Parameters attached to a rebuild request.
struct rebuild_param {
// Presumably the name of the datacenter to use as the rebuild source - confirm.
sstring source_dc;
};
// Parameters attached to a replace request.
struct replace_param {
// Id of the node being replaced (cf. topology::parse_replaced_node()).
raft::server_id replaced_id;
};
// Parameters of a per-node request; topology::req_param maps a node id to one of these alternatives.
using request_param = std::variant<join_param, rebuild_param, replace_param>;
// Global topology requests, i.e. requests that are not related to any
// specific node (see topology::global_request).
enum class global_topology_request: uint16_t {
    new_cdc_generation,
    cleanup,
    keyspace_rf_change,
    truncate_table,

    // High priority no-operation request.
    // Used to synchronize API calls with topology coordinator.
    // Ensures that all later requests and tablet scheduler will see prior updates to group0.
    noop_request,

    snapshot_tables,
    finalize_migration,
};
// The set of tokens owned by a node (stored in replica_state::ring).
struct ring_slice {
std::unordered_set<dht::token> tokens;
};
// The intended storage mode for a node during vnodes-to-tablets migration.
//
// Migrating a table from vnodes to tablets requires every node to reshard
// its local SSTables on vnode boundaries. When the operation is rolled back,
// SSTables have to be resharded in the opposite direction (i.e., with the
// static sharder). This property indicates that a node needs to perform
// resharding on restart, and it declares the resharding direction.
enum class intended_storage_mode : uint16_t {
    vnodes,
    tablets,
};
// Per-node state kept in the topology. Used as the mapped type of
// topology::normal_nodes, left_nodes_rs, new_nodes and transition_nodes.
struct replica_state {
// Current state of the node (see node_state).
node_state state;
seastar::sstring datacenter;
seastar::sstring rack;
seastar::sstring release_version;
std::optional<ring_slice> ring; // if engaged, contains the set of tokens the node owns together with their state
// NOTE(review): presumably the number of shards on the node - confirm.
size_t shard_count;
// NOTE(review): presumably the node's sharding "ignore MSB bits" setting - confirm.
uint8_t ignore_msb;
// Names of the features supported by the node (cf. topology_features::normal_supported_features).
std::set<sstring> supported_features;
// Cleanup status of the node (see cleanup_status).
cleanup_status cleanup;
utils::UUID request_id; // id of the current request for the node or the last one if no current one exists
// Intended storage mode of the node, if any (see intended_storage_mode).
std::optional<intended_storage_mode> storage_mode;
};
// Feature sets known to the topology: what each normal node supports and
// what the cluster considers enabled.
struct topology_features {
// Supported features, for normal nodes
std::unordered_map<raft::server_id, std::set<sstring>> normal_supported_features;
// Features that are considered enabled by the cluster
std::set<sstring> enabled_features;
// Calculates a set of features that are supported by all normal nodes but not yet enabled.
// Defined out of line.
std::set<sstring> calculate_not_yet_enabled_features() const;
};
// The cluster topology as seen by the topology state machine: nodes grouped by
// the stage they are in, pending per-node and global requests, CDC generation
// bookkeeping, enabled features and related settings.
// All member functions are only declared here; they are defined out of line.
struct topology {
// State of the topology transition stored in `tstate`.
// NOTE: enumerators marked as deprecated are kept in the list; do not remove or reorder them.
enum class transition_state: uint16_t {
join_group0,
commit_cdc_generation,
tablet_draining, // deprecated, not set after feature_service::parallel_tablet_draining is enabled.
write_both_read_old,
write_both_read_new,
tablet_migration,
tablet_resize_finalization,
tablet_split_finalization, // deprecated in favor of tablet_resize_finalization, kept for backward compatibility.
left_token_ring,
rollback_to_normal,
truncate_table,
lock,
snapshot_tables,
};
// Current transition state.
// NOTE(review): presumably disengaged when no transition is in progress - confirm.
std::optional<transition_state> tstate;
using version_t = int64_t;
static constexpr version_t initial_version = 1;
// Topology version; starts at initial_version.
version_t version = initial_version;
// NOTE(review): presumably the version used for fencing (cf. fencing_token) - confirm.
version_t fence_version = initial_version;
// Nodes that are normal members of the ring
std::unordered_map<raft::server_id, replica_state> normal_nodes;
// Nodes that are left
std::unordered_set<raft::server_id> left_nodes;
// Left nodes for which we need topology information.
std::unordered_map<raft::server_id, replica_state> left_nodes_rs;
// Nodes that are waiting to be joined by the topology coordinator
std::unordered_map<raft::server_id, replica_state> new_nodes;
// Nodes that are in the process to be added to the ring
// Currently at most one node at a time will be here, but the code shouldn't assume it
// because we might support parallel operations in the future.
std::unordered_map<raft::server_id, replica_state> transition_nodes;
// Pending topology requests
std::unordered_map<raft::server_id, topology_request> requests;
// Paused topology requests.
// Those are pending requests which are ignored by the scheduler
// because they are waiting for the node to be drained of tablet replicas first.
std::unordered_map<raft::server_id, topology_request> paused_requests;
// Holds parameters for a request per node and valid during entire
// operation until the node becomes normal
std::unordered_map<raft::server_id, request_param> req_param;
// Pending global topology request (i.e. not related to any specific node).
std::optional<global_topology_request> global_request;
// Pending global topology request's id, which is a new group0's state id
std::optional<utils::UUID> global_request_id;
// A queue of pending global topology request's ids. Replaces the single one above
std::vector<utils::UUID> global_requests_queue;
// The IDs of the committed CDC generations sorted by timestamps.
// The obsolete generations may not be in this list as they are continually deleted.
std::vector<cdc::generation_id> committed_cdc_generations;
// This is the time UUID used to access the data of a new CDC generation introduced
// e.g. when a new node bootstraps, needed in `commit_cdc_generation` transition state.
// It's used as the first column of the clustering key in CDC_GENERATIONS_V3 table.
std::optional<utils::UUID> new_cdc_generation_data_uuid;
// The name of the KS that is being the target of the scheduled ALTER KS statement
std::optional<sstring> new_keyspace_rf_change_ks_name;
// The KS options to be used when executing the scheduled ALTER KS statement
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
// The ids of RF change requests that are paused because they require tablet co-location.
// It may happen during altering from numerical RF to rack list.
std::unordered_set<utils::UUID> paused_rf_change_requests;
// The ids of ongoing RF change requests.
// Here we keep the ids only for rf-changes using rack_lists.
std::unordered_set<utils::UUID> ongoing_rf_changes;
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
std::vector<cdc::generation_id> unpublished_cdc_generations;
// Set of features that are considered to be enabled by the cluster.
std::set<sstring> enabled_features;
// Session used to create topology_guard for operations like streaming.
session_id session;
// When false, tablet load balancer will not try to rebalance tablets.
bool tablet_balancing_enabled = true;
// The set of nodes that should be considered dead during topology operations
std::unordered_set<raft::server_id> ignored_nodes;
// The set of nodes currently excluded from synchronization in the tablets management code.
// The barrier should not wait for these nodes.
// This set is effectively equal to: ignored_nodes + keys(left_nodes_rs).
// Tablet replicas may temporarily include left nodes (e.g. when a node is replaced),
// hence the need for this field.
std::unordered_set<raft::server_id> excluded_tablet_nodes;
// Find only nodes in non 'left' state
// The result points at an (id, replica_state) entry.
// NOTE(review): presumably nullptr when the node is not found - confirm.
const std::pair<const raft::server_id, replica_state>* find(raft::server_id id) const;
// Return true if node exists in any state including 'left' one
// NOTE(review): unlike the other queries declared here this one is not const-qualified,
// so it cannot be called through a const topology; consider adding `const`
// (the out-of-line definition has to change together with this declaration).
bool contains(raft::server_id id);
// Number of nodes that are not in the 'left' state
size_t size() const;
// Are there any non-left nodes?
bool is_empty() const;
// Returns false iff we can safely start a new topology change.
bool is_busy() const;
// Pending request for the given node, if any.
// NOTE(review): presumably looked up in `requests` - confirm whether `paused_requests` is consulted too.
std::optional<topology_request> get_request(raft::server_id) const;
// Parameters of the request for the given node, if any (cf. `req_param`).
std::optional<request_param> get_request_param(raft::server_id) const;
// NOTE(review): presumably extracts replace_param::replaced_id from the given parameters;
// behavior for a disengaged optional or another alternative is not visible here - confirm.
static raft::server_id parse_replaced_node(const std::optional<request_param>&);
// Calculates a set of features that are supported by all normal nodes but not yet enabled.
std::set<sstring> calculate_not_yet_enabled_features() const;
// Returns the set of zero-token normal nodes.
std::unordered_set<raft::server_id> get_normal_zero_token_nodes() const;
};
// A snapshot represented as a list of canonical mutations.
struct raft_snapshot {
// FIXME: handle this with rpc streaming instead as we can't guarantee size bounds.
utils::chunked_vector<canonical_mutation> mutations;
};
// Parameters of a snapshot pull.
struct raft_snapshot_pull_params {
// NOTE(review): presumably the tables whose data the puller asks for - confirm.
std::vector<table_id> tables;
};
// State machine that is responsible for topology change
// Holds the current topology and declares helpers for waiting on, cancelling
// and aborting topology requests. All member functions are defined out of line.
struct topology_state_machine {
using topology_type = topology;
// The current topology.
topology_type _topology;
// NOTE(review): presumably signalled when the topology state changes - confirm.
condition_variable event;
// NOTE(review): presumably counts how many times the topology was reloaded - confirm.
size_t reload_count = 0;
// Called by the tablet split monitor when all local storage groups
// for a table are split-ready, to trigger an early load stats
// refresh so the coordinator can finalize the resize promptly.
std::function<void()> on_tablet_split_ready;
// NOTE(review): presumably resolves once _topology.is_busy() is false - confirm.
future<> await_not_busy();
// Waits for the topology request with the given id to complete.
// NOTE(review): the returned string is presumably the request's error message
// (empty on success), and `require_entry` presumably controls whether a missing
// entry for `id` is treated as an error - confirm both.
future<sstring> wait_for_request_completion(db::system_keyspace& sys_ks, utils::UUID id, bool require_entry);
// Generates mutations that cancel a topology request which is active on the given node.
// If no request is found, or it cannot be canceled at this stage, no mutations are generated.
// In case it's topology_request::join/replace, you must also call respond_to_joining_node().
// Generated mutations are appended to `muts`.
void generate_cancel_request_update(utils::chunked_vector<canonical_mutation>& muts,
gms::feature_service& features,
const group0_guard& guard,
raft::server_id node,
sstring reason);
// Initiates abort of a topology request with a given ID.
// Returns a failed future if request is not abortable.
// Doesn't wait until request is done. Use wait_for_request_completion() for that.
future<> abort_request(raft_group0&, abort_source&, gms::feature_service&, utils::UUID request_id);
};
// Command that the raft leader uses to drive the bootstrap process on other nodes.
struct raft_topology_cmd {
    enum class command: uint16_t {
        // Request to wait for the latest topology.
        barrier,
        // Same as `barrier` + drain requests which use previous versions.
        barrier_and_drain,
        // Request to stream data, return when streaming is done.
        stream_ranges,
        // Wait for a joining node IP to appear in gossiper.
        wait_for_ip,
    };

    // The command to execute.
    command cmd;

    // Not `explicit`, so a bare `command` value converts implicitly to raft_topology_cmd.
    raft_topology_cmd(command requested)
        : cmd{requested}
    {}
};
// Returned as the result of executing a raft_topology_cmd.
struct raft_topology_cmd_result {
enum class command_status: uint16_t {
fail,
success
};
// Defaults to `fail`, so a default-constructed result reports failure
// unless the status is set to `success` explicitly.
command_status status = command_status::fail;
// Carries the error description back to the topology coordinator
// when the command fails.
sstring error_message;
};
// Holds the topology_version of the caller in RPC signatures.
// The topology_version is wrapped in a dedicated class because other
// versions, such as the schema version, are anticipated to be added later.
struct fencing_token {
    // topology_version == 0 means the caller is not aware of
    // the fencing or doesn't use it for some reason.
    topology::version_t topology_version{0};

    // True iff the token carries a non-zero topology_version.
    explicit operator bool() const {
        const bool has_version = (topology_version != 0);
        return has_version;
    }
};
// Completion state of a topology request.
struct topology_request_state {
    // Whether the request has completed. Initialized explicitly so that a
    // default-constructed object never holds an indeterminate bool; aggregate
    // initialization (`{done, error}`) keeps working unchanged.
    bool done = false;
    // Error description of the request.
    // NOTE(review): presumably empty when the request succeeded - confirm.
    sstring error;
};
// Tag type: the node passed validation.
struct node_validation_success {};
// The node failed validation; `reason` carries the description of the failure.
struct node_validation_failure {
sstring reason;
};
// Outcome of a node validation: either success or a failure with a reason.
using node_validation_result = std::variant<node_validation_success, node_validation_failure>;
// Validates the removal of the node with the given host id; the result holds
// either node_validation_success or node_validation_failure with a reason.
node_validation_result validate_removing_node(replica::database&, locator::host_id);
// String-to-enum conversions, defined out of line.
// NOTE(review): behavior on an unrecognized string is not visible in this header;
// presumably the non-`try_` variants throw - confirm.
topology::transition_state transition_state_from_string(const sstring& s);
node_state node_state_from_string(const sstring& s);
// NOTE(review): presumably returns a disengaged optional for an unrecognized string - confirm.
std::optional<topology_request> try_topology_request_from_string(const sstring& s);
topology_request topology_request_from_string(const sstring& s);
global_topology_request global_topology_request_from_string(const sstring&);
cleanup_status cleanup_status_from_string(const sstring& s);
intended_storage_mode intended_storage_mode_from_string(const sstring& s);
}
// fmt formatter for service::cleanup_status.
// parse() does not interpret any format specification: it returns the start
// of the parse range as is. format() is defined out of line.
template <>
struct fmt::formatter<service::cleanup_status> {
    constexpr auto parse(format_parse_context& parse_ctx) {
        return parse_ctx.begin();
    }

    auto format(service::cleanup_status status, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::intended_storage_mode.
// parse() does not interpret any format specification: it returns the start
// of the parse range as is. format() is defined out of line.
template <>
struct fmt::formatter<service::intended_storage_mode> {
    constexpr auto parse(format_parse_context& parse_ctx) {
        return parse_ctx.begin();
    }

    auto format(service::intended_storage_mode mode, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::fencing_token.
// Prints the token's topology_version wrapped in curly braces, e.g. "{5}".
template <>
struct fmt::formatter<service::fencing_token> : fmt::formatter<string_view> {
    auto format(const service::fencing_token& token, fmt::format_context& ctx) const {
        auto out = ctx.out();
        // "{{" and "}}" are escaped literal braces; the inner "{}" is the version.
        return fmt::format_to(out, "{{{}}}", token.topology_version);
    }
};
// fmt formatter for service::topology::transition_state.
// Format-spec parsing is inherited from fmt::formatter<string_view>; format() is defined out of line.
template <> struct fmt::formatter<service::topology::transition_state> : fmt::formatter<string_view> {
auto format(service::topology::transition_state, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::node_state.
// Format-spec parsing is inherited from fmt::formatter<string_view>; format() is defined out of line.
template <> struct fmt::formatter<service::node_state> : fmt::formatter<string_view> {
auto format(service::node_state, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::topology_request.
// Format-spec parsing is inherited from fmt::formatter<string_view>; format() is defined out of line.
template <> struct fmt::formatter<service::topology_request> : fmt::formatter<string_view> {
auto format(service::topology_request, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::global_topology_request.
// Format-spec parsing is inherited from fmt::formatter<string_view>; format() is defined out of line.
template <> struct fmt::formatter<service::global_topology_request> : fmt::formatter<string_view> {
auto format(service::global_topology_request, fmt::format_context& ctx) const -> decltype(ctx.out());
};
// fmt formatter for service::raft_topology_cmd::command.
// Format-spec parsing is inherited from fmt::formatter<string_view>; format() is defined out of line.
template <> struct fmt::formatter<service::raft_topology_cmd::command> : fmt::formatter<string_view> {
auto format(service::raft_topology_cmd::command, fmt::format_context& ctx) const -> decltype(ctx.out());
};