service: tasks: scan all tablets in tablet_virtual_task::wait

Currently, for repair tasks tablet_virtual_task::wait gathers the
ids of tablets that are to be repaired. The gathered set is later
used to check if the repair is still ongoing.

However, if the tablets are resized (split or merged), the gathered
set becomes stale. Thus, we may end up with an invalid tablet id
error being thrown.

Instead, scan all tablets in the table and wait until none of them is still associated with the repair task.
This commit is contained in:
Aleksandra Martyniuk
2026-01-23 15:42:10 +01:00
parent d78ea3d498
commit e5928497ce

View File

@@ -150,6 +150,7 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished"); tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s)); co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
while (true) {
co_await _ss._topology_state_machine.event.wait([&] { co_await _ss._topology_state_machine.event.wait([&] {
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) { if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
return true; return true;
@@ -160,12 +161,29 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
} else if (tablet_id_opt.has_value()) { // Migration task. } else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid(); return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task. } else { // Repair task.
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) { return true;
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
});
} }
}); });
if (!is_repair_task(task_type)) {
break;
}
auto tmptr = _ss.get_token_metadata_ptr();
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
break;
}
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_still_running = false;
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
return make_ready_future();
});
if (!repair_still_running) {
break;
}
}
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried. res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) { if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
res->status.end_time = db_clock::now(); res->status.end_time = db_clock::now();