mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-25 01:02:20 +00:00
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:
1. raft_topology_cmd_handler caught exceptions and returned a bare
fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
"failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
"streaming failed" message.
This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").
Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
[[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
catch blocks.
- Including error_message in the exception thrown by
exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
the hardcoded "streaming failed".
A follow-up test is in https://github.com/scylladb/scylladb/pull/29363
Fixes: SCYLLADB-1404
Closes scylladb/scylladb#29362
101 lines
3.0 KiB
C++
101 lines
3.0 KiB
C++
/*
|
|
* Copyright 2022-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#include "service/tablet_operation.hh"
|
|
|
|
namespace locator {
|
|
|
|
// IDL declaration of locator::tablet_id.
// Its only member is declared with call parentheses (`value()`), unlike the
// plain data members of the other structs in this file.
// NOTE(review): presumably the 64-bit value is read through an accessor of
// that name — confirm against the IDL compiler documentation.
struct tablet_id final {
|
|
uint64_t value();
|
|
};
|
|
|
|
// Pairs a ::table_id with a locator::tablet_id.
// Passed as an argument to the tablet_stream_data, tablet_cleanup and
// tablet_repair verbs declared later in this file.
struct global_tablet_id final {
|
|
::table_id table;
|
|
locator::tablet_id tablet;
|
|
};
|
|
|
|
// Per-table statistics record. Used as the mapped value of the `tables`
// maps in load_stats_v1 and load_stats.
struct table_load_stats final {
|
|
// Unsigned 64-bit size, in bytes per the field name.
uint64_t size_in_bytes;
|
|
// Signed 64-bit sequence number.
// NOTE(review): exact semantics are not visible in this file — see the C++ definition.
int64_t split_ready_seq_number;
|
|
};
|
|
|
|
// Pairs a ::table_id with a dht::token_range.
// No verb or struct in this file refers to this type.
struct range_based_tablet_id final {
|
|
::table_id table;
|
|
dht::token_range range;
|
|
};
|
|
|
|
// Return type of the table_load_stats_v1 verb.
// Holds only the per-table map; load_stats declares the same `tables`
// member followed by additional per-host maps.
struct load_stats_v1 final {
|
|
// Keyed by table id; one table_load_stats record per table.
std::unordered_map<::table_id, locator::table_load_stats> tables;
|
|
};
|
|
|
|
// Tablet-level statistics record. Used as the mapped value (keyed by
// locator::host_id) of load_stats::tablet_stats.
struct tablet_load_stats final {
|
|
// Sum of all tablet sizes on a node and available disk space.
|
|
uint64_t effective_capacity;
|
|
|
|
// Contains tablet sizes per table. The token ranges must be in the form
|
|
// (a, b] and only such ranges are allowed
|
|
// Outer key: table id. Inner key: token range. Inner value: unsigned 64-bit size.
std::unordered_map<::table_id, std::unordered_map<dht::token_range, uint64_t>> tablet_sizes;
|
|
};
|
|
|
|
// Return type of the table_load_stats verb.
// Unlike the other locator structs in this file it is not declared `final`,
// and its last two members carry [[version ...]] attributes.
// NOTE(review): presumably non-final plus [[version]] is how the IDL allows
// appending fields compatibly — confirm against the IDL compiler documentation.
struct load_stats {
|
|
// Keyed by table id; same member as load_stats_v1::tables.
std::unordered_map<::table_id, locator::table_load_stats> tables;
|
|
// Keyed by host id; unsigned 64-bit capacity value per host.
std::unordered_map<locator::host_id, uint64_t> capacity;
|
|
// Keyed by host id; boolean flag per host. Tagged [[version 2025.3]].
std::unordered_map<locator::host_id, bool> critical_disk_utilization [[version 2025.3]];
|
|
// Keyed by host id; one tablet_load_stats record per host. Tagged [[version 2026.1]].
std::unordered_map<locator::host_id, locator::tablet_load_stats> tablet_stats [[version 2026.1]];
|
|
};
|
|
|
|
}
|
|
|
|
namespace service {
|
|
// Wraps a single service::topology::version_t value.
// No verb in this file takes or returns this type.
struct fencing_token {
|
|
service::topology::version_t topology_version;
|
|
};
|
|
|
|
// Request payload of the raft_topology_cmd verb: carries only the kind of
// command to execute.
struct raft_topology_cmd {
|
|
// Command kinds, stored in a uint8_t. Enumerators have no explicit values,
// so they are numbered in declaration order.
// NOTE(review): presumably the numeric values go on the wire, so new
// enumerators should only be appended — confirm before reordering.
enum class command: uint8_t {
|
|
barrier,
|
|
barrier_and_drain,
|
|
stream_ranges,
|
|
wait_for_ip
|
|
};
|
|
// The command selected for this request.
service::raft_topology_cmd::command cmd;
|
|
};
|
|
|
|
// Return type of the raft_topology_cmd verb: an outcome status plus an
// optional-by-version error text.
struct raft_topology_cmd_result {
|
|
// Two-valued outcome stored in a uint8_t; `fail` is declared first,
// `success` second.
enum class command_status: uint8_t {
|
|
fail,
|
|
success
|
|
};
|
|
// Outcome of the command.
service::raft_topology_cmd_result::command_status status;
|
|
// Error details accompanying the status. Per the change description that
// introduced it, the handler fills this with the exception text in its
// catch blocks, and the field is tagged [[version 2026.2]] for wire
// compatibility.
sstring error_message [[version 2026.2]];
|
|
};
|
|
|
|
// Return type of the raft_pull_snapshot verb: a chunked vector of
// canonical_mutation objects.
struct raft_snapshot {
|
|
utils::chunked_vector<canonical_mutation> mutations;
|
|
};
|
|
|
|
// Argument of the raft_pull_snapshot verb: a list of table ids.
// NOTE(review): presumably these are the tables whose mutations the caller
// wants in the returned raft_snapshot — confirm against the handler.
struct raft_snapshot_pull_params {
|
|
std::vector<table_id> tables;
|
|
};
|
|
|
|
// Return type of the tablet_repair verb: a single gc_clock time point.
struct tablet_operation_repair_result {
|
|
gc_clock::time_point repair_time;
|
|
};
|
|
|
|
// RPC verb declarations for this IDL file.
//
// Sends a raft_topology_cmd to `dst_id` together with a raft term and a
// command index; returns a raft_topology_cmd_result. Unlike the verbs that
// follow, it carries no [[cancellable]] attribute.
verb raft_topology_cmd (raft::server_id dst_id, raft::term_t term, uint64_t cmd_index, service::raft_topology_cmd) -> service::raft_topology_cmd_result;
|
|
// Takes raft_snapshot_pull_params and returns a raft_snapshot.
verb [[cancellable]] raft_pull_snapshot (raft::server_id dst_id, service::raft_snapshot_pull_params) -> service::raft_snapshot;
|
|
// Takes a global_tablet_id; declares no return type.
verb [[cancellable]] tablet_stream_data (raft::server_id dst_id, locator::global_tablet_id);
|
|
// Takes a global_tablet_id; declares no return type.
verb [[cancellable]] tablet_cleanup (raft::server_id dst_id, locator::global_tablet_id);
|
|
// Returns load_stats_v1, which holds only the per-table map.
verb [[cancellable]] table_load_stats_v1 (raft::server_id dst_id) -> locator::load_stats_v1;
|
|
// Returns load_stats, which adds per-host maps to the per-table map.
verb [[cancellable]] table_load_stats (raft::server_id dst_id) -> locator::load_stats;
|
|
// Takes a global_tablet_id and a session id; the `session` parameter is
// tagged [[version 2025.4]]. Returns tablet_operation_repair_result.
verb [[cancellable]] tablet_repair(raft::server_id dst_id, locator::global_tablet_id, service::session_id session [[version 2025.4]]) -> service::tablet_operation_repair_result;
|
|
// The two verbs below use an empty attribute list (`[[]]`) and, unlike the
// verbs above, take no raft::server_id dst_id parameter.
// Takes a table id and returns an unsigned 64-bit value.
verb [[]] estimate_sstable_volume(table_id table) -> uint64_t;
|
|
// Takes a table id, a chunk size and a chunk count; returns a chunked vector
// of character buffers.
verb [[]] sample_sstables(table_id table, uint64_t chunk_size, uint64_t n_chunks) -> utils::chunked_vector<temporary_buffer<char>>;
|
|
|
|
}
|