Compare commits

..

1 Commits

Author SHA1 Message Date
Yaniv Michael Kaul
9b65d6d0fc topology: propagate error messages through raft_topology_cmd_result
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:

1. raft_topology_cmd_handler caught exceptions and returned a bare
   fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
   "failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
   "streaming failed" message.

This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").

Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
  [[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
  catch blocks.
- Including error_message in the exception thrown by
  exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
  the hardcoded "streaming failed".

A follow-up test is in https://github.com/scylladb/scylladb/pull/29363

Fixes: SCYLLADB-1404
2026-04-24 11:25:31 +03:00
4 changed files with 19 additions and 2 deletions

View File

@@ -72,6 +72,7 @@ struct raft_topology_cmd_result {
success
};
service::raft_topology_cmd_result::command_status status;
sstring error_message [[version 2026.2]];
};
struct raft_snapshot {

View File

@@ -4792,8 +4792,13 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
} catch (const raft::request_aborted& e) {
rtlogger.warn("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
result.error_message = e.what();
} catch (const std::exception& e) {
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
result.error_message = e.what();
} catch (...) {
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, std::current_exception());
result.error_message = "unknown error";
}
rtlogger.info("topology cmd rpc {} completed with status={} index={}",

View File

@@ -443,8 +443,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
&_messaging, to_host_id(id), id, _term, cmd_index, cmd);
if (result.status == raft_topology_cmd_result::command_status::fail) {
auto msg = result.error_message.empty()
? ::format("failed status returned from {}", id)
: ::format("failed status returned from {}: {}", id, result.error_message);
co_await coroutine::exception(std::make_exception_ptr(
std::runtime_error(::format("failed status returned from {}", id))));
std::runtime_error(std::move(msg))));
}
};
@@ -3909,10 +3912,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (const std::exception& e) {
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
" (node state is rebuilding): {}", e);
rtbuilder.done(e.what());
retake = true;
} catch (...) {
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
" (node state is rebuilding): {}", std::current_exception());
rtbuilder.done("streaming failed");
rtbuilder.done("unknown error");
retake = true;
}
if (retake) {

View File

@@ -318,6 +318,9 @@ struct raft_topology_cmd_result {
success
};
command_status status = command_status::fail;
// Carries the error description back to the topology coordinator
// when the command fails.
sstring error_message;
};
// This class is used in RPC's signatures to hold the topology_version of the caller.