Files
scylladb/idl/storage_service.idl.hh
Yaniv Michael Kaul 3cba27d25f topology: propagate error messages through raft_topology_cmd_result
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:

1. raft_topology_cmd_handler caught exceptions and returned a bare
   fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
   "failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
   "streaming failed" message.

This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").

Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
  [[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
  catch blocks.
- Including error_message in the exception thrown by
  exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
  the hardcoded "streaming failed".

A follow-up test is in https://github.com/scylladb/scylladb/pull/29363

Fixes: SCYLLADB-1404

Closes scylladb/scylladb#29362
2026-05-11 17:01:15 +03:00

101 lines
3.0 KiB
C++

/*
* Copyright 2022-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#include "service/tablet_operation.hh"
namespace locator {
// Tablet identifier carrying a single 64-bit unsigned value.
// Used as the `tablet` member of locator::global_tablet_id.
struct tablet_id final {
// NOTE(review): the trailing `()` suggests the IDL reads this through a
// value() accessor rather than a plain data member - confirm against the
// IDL compiler documentation.
uint64_t value();
};
// Pairs a table id with a tablet id.
// Passed as an argument to the tablet_stream_data, tablet_cleanup and
// tablet_repair verbs declared in the service namespace of this file.
struct global_tablet_id final {
// Table component of the identifier.
::table_id table;
// Tablet component of the identifier.
// NOTE(review): presumably scoped to `table` - confirm.
locator::tablet_id tablet;
};
// Load statistics for one table.
// Used as the mapped value of the `tables` map in both
// locator::load_stats_v1 and locator::load_stats.
struct table_load_stats final {
// Size in bytes.
// NOTE(review): presumably the table's on-disk size as seen by the
// reporting node - confirm what exactly is counted.
uint64_t size_in_bytes;
// NOTE(review): presumably a sequence number related to tablet split
// readiness - confirm the exact meaning with the producer of this value.
int64_t split_ready_seq_number;
};
// Identifies a tablet by table id plus token range, as opposed to
// locator::global_tablet_id, which uses a table id plus a tablet id.
struct range_based_tablet_id final {
// Table component of the identifier.
::table_id table;
// Token range component of the identifier.
dht::token_range range;
};
// Reply type of the table_load_stats_v1 verb.
// Carries only the per-table map; locator::load_stats has the same `tables`
// field followed by additional fields.
struct load_stats_v1 final {
// Load statistics keyed by table id.
std::unordered_map<::table_id, locator::table_load_stats> tables;
};
// Tablet-level load statistics.
// Used as the mapped value of locator::load_stats::tablet_stats, which is
// keyed by host id.
struct tablet_load_stats final {
// Sum of all tablet sizes on a node and available disk space.
uint64_t effective_capacity;
// Contains tablet sizes per table. The token ranges must be in the form
// (a, b] and only such ranges are allowed.
// Outer key: table id; inner key: token range; inner value: tablet size.
std::unordered_map<::table_id, std::unordered_map<dht::token_range, uint64_t>> tablet_sizes;
};
// Reply type of the table_load_stats verb.
// Unlike load_stats_v1 this struct is not declared `final`, and two of its
// fields carry [[version ...]] attributes.
// NOTE(review): presumably the version attribute names the release that
// introduced the field, so that peers on older versions can still exchange
// this struct - confirm against the IDL compiler documentation.
struct load_stats {
// Load statistics keyed by table id.
std::unordered_map<::table_id, locator::table_load_stats> tables;
// Capacity value keyed by host id.
// NOTE(review): units are not stated here; presumably bytes - confirm.
std::unordered_map<locator::host_id, uint64_t> capacity;
// Per-host boolean flag for critical disk utilization.
std::unordered_map<locator::host_id, bool> critical_disk_utilization [[version 2025.3]];
// Tablet-level statistics keyed by host id.
std::unordered_map<locator::host_id, locator::tablet_load_stats> tablet_stats [[version 2026.1]];
};
}
namespace service {
// Token wrapping a single topology version value.
// NOTE(review): presumably used to reject requests issued under a stale
// topology version - confirm with the code that consumes it.
struct fencing_token {
// Topology version carried by the token.
service::topology::version_t topology_version;
};
// Request payload of the raft_topology_cmd verb; it carries only the kind
// of command to run.
struct raft_topology_cmd {
// Kinds of topology command, stored in a uint8_t.
// NOTE(review): enumerators are presumably serialized by numeric value, so
// new ones should be appended rather than inserted or reordered - confirm
// against the IDL compiler documentation.
enum class command: uint8_t {
barrier,
barrier_and_drain,
stream_ranges,
wait_for_ip
};
// The command being requested.
service::raft_topology_cmd::command cmd;
};
// Reply payload of the raft_topology_cmd verb: a success/fail status plus an
// optional human-readable error description.
struct raft_topology_cmd_result {
// Outcome of the command, stored in a uint8_t.
// NOTE(review): enumerators are presumably serialized by numeric value
// (fail first, then success), so they must not be reordered - confirm.
enum class command_status: uint8_t {
fail,
success
};
// Whether the command succeeded or failed.
service::raft_topology_cmd_result::command_status status;
// Exception text describing why the command failed, so that the caller can
// report the real cause instead of a generic failure message.
// Tagged with [[version 2026.2]] for wire compatibility.
// NOTE(review): presumably empty when status is success or when the reply
// comes from a node that predates this field - confirm.
sstring error_message [[version 2026.2]];
};
// Reply payload of the raft_pull_snapshot verb.
struct raft_snapshot {
// Snapshot contents expressed as canonical mutations.
utils::chunked_vector<canonical_mutation> mutations;
};
// Request payload of the raft_pull_snapshot verb.
struct raft_snapshot_pull_params {
// Ids of the tables named in the request.
// NOTE(review): presumably the tables whose state should be included in
// the returned raft_snapshot - confirm with the verb handler.
std::vector<table_id> tables;
};
// Reply payload of the tablet_repair verb.
struct tablet_operation_repair_result {
// Time point, on gc_clock, associated with the repair.
// NOTE(review): presumably the time the repair is considered to have taken
// place - confirm with the verb handler.
gc_clock::time_point repair_time;
};
// RPC verb declarations.
// NOTE(review): [[cancellable]] presumably lets the caller abort an in-flight
// call, and the empty [[]] attribute list presumably means "no attributes" -
// confirm both against the IDL compiler documentation.

// Sends a topology command to dst_id together with a raft term and a command
// index; the reply carries a status and, on failure, an error message.
// This is the only verb in this list declared without any attribute list.
verb raft_topology_cmd (raft::server_id dst_id, raft::term_t term, uint64_t cmd_index, service::raft_topology_cmd) -> service::raft_topology_cmd_result;
// Requests a snapshot described by raft_snapshot_pull_params; the reply is a
// raft_snapshot.
verb [[cancellable]] raft_pull_snapshot (raft::server_id dst_id, service::raft_snapshot_pull_params) -> service::raft_snapshot;
// Takes a global tablet id; no reply payload is declared.
verb [[cancellable]] tablet_stream_data (raft::server_id dst_id, locator::global_tablet_id);
// Takes a global tablet id; no reply payload is declared.
verb [[cancellable]] tablet_cleanup (raft::server_id dst_id, locator::global_tablet_id);
// Returns load statistics in the load_stats_v1 layout (per-table map only).
verb [[cancellable]] table_load_stats_v1 (raft::server_id dst_id) -> locator::load_stats_v1;
// Returns load statistics in the load_stats layout.
verb [[cancellable]] table_load_stats (raft::server_id dst_id) -> locator::load_stats;
// Takes a global tablet id and a session id; the session parameter is tagged
// [[version 2025.4]]. The reply carries the repair time.
verb [[cancellable]] tablet_repair(raft::server_id dst_id, locator::global_tablet_id, service::session_id session [[version 2025.4]]) -> service::tablet_operation_repair_result;
// Takes a table id and returns a 64-bit unsigned value.
// Unlike the verbs above, this one takes no raft::server_id dst_id parameter.
verb [[]] estimate_sstable_volume(table_id table) -> uint64_t;
// Takes a table id, a chunk size and a chunk count, and returns a vector of
// character buffers. Also takes no raft::server_id dst_id parameter.
verb [[]] sample_sstables(table_id table, uint64_t chunk_size, uint64_t n_chunks) -> utils::chunked_vector<temporary_buffer<char>>;
}