From cded6bd207757f799bae2d8ec2b49b1610efd7b8 Mon Sep 17 00:00:00 2001
From: Lyndon-Li
Date: Thu, 19 Jun 2025 16:36:49 +0800
Subject: [PATCH] cancel pvb/pvr on velero server restarts

Signed-off-by: Lyndon-Li
---
 changelogs/unreleased/9031-Lyndon-Li        |  1 +
 pkg/cmd/server/server.go                    | 90 +++++++++++++++++++
 pkg/controller/data_download_controller.go  |  2 +-
 pkg/controller/data_upload_controller.go    |  2 +-
 .../pod_volume_backup_controller.go         |  2 +-
 .../pod_volume_restore_controller.go        |  8 +-
 .../pod_volume_restore_controller_legacy.go |  6 +-
 7 files changed, 101 insertions(+), 10 deletions(-)
 create mode 100644 changelogs/unreleased/9031-Lyndon-Li

diff --git a/changelogs/unreleased/9031-Lyndon-Li b/changelogs/unreleased/9031-Lyndon-Li
new file mode 100644
index 000000000..1d612dc0f
--- /dev/null
+++ b/changelogs/unreleased/9031-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #8961, cancel PVB/PVR on Velero server restart
\ No newline at end of file
diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go
index 50b91d71b..34e071430 100644
--- a/pkg/cmd/server/server.go
+++ b/pkg/cmd/server/server.go
@@ -961,6 +961,7 @@ func markInProgressBackupsFailed(ctx context.Context, client ctrlclient.Client,
         }
         log.WithField("backup", backup.GetName()).Warn(updated.Status.FailureReason)
         markDataUploadsCancel(ctx, client, backup, log)
+        markPodVolumeBackupsCancel(ctx, client, backup, log)
     }
 }
 
@@ -983,8 +984,10 @@ func markInProgressRestoresFailed(ctx context.Context, client ctrlclient.Client,
             log.WithError(errors.WithStack(err)).Errorf("failed to patch restore %q", restore.GetName())
             continue
         }
+
         log.WithField("restore", restore.GetName()).Warn(updated.Status.FailureReason)
         markDataDownloadsCancel(ctx, client, restore, log)
+        markPodVolumeRestoresCancel(ctx, client, restore, log)
     }
 }
 
@@ -1069,3 +1072,90 @@ func markDataDownloadsCancel(ctx context.Context, client ctrlclient.Client, rest
         }
     }
 }
+
+func markPodVolumeBackupsCancel(ctx context.Context, client ctrlclient.Client, backup velerov1api.Backup, log logrus.FieldLogger) {
+    pvbs := &velerov1api.PodVolumeBackupList{}
+
+    if err := client.List(ctx, pvbs, &ctrlclient.ListOptions{
+        Namespace: backup.GetNamespace(),
+        LabelSelector: labels.Set(map[string]string{
+            velerov1api.BackupUIDLabel: string(backup.GetUID()),
+        }).AsSelector(),
+    }); err != nil {
+        log.WithError(errors.WithStack(err)).Error("failed to list PVBs")
+        return
+    }
+
+    for i := range pvbs.Items {
+        pvb := pvbs.Items[i]
+        if pvb.Status.Phase == velerov1api.PodVolumeBackupPhaseAccepted ||
+            pvb.Status.Phase == velerov1api.PodVolumeBackupPhasePrepared ||
+            pvb.Status.Phase == velerov1api.PodVolumeBackupPhaseInProgress ||
+            pvb.Status.Phase == velerov1api.PodVolumeBackupPhaseNew ||
+            pvb.Status.Phase == "" {
+            err := controller.UpdatePVBWithRetry(ctx, client, types.NamespacedName{Namespace: pvb.Namespace, Name: pvb.Name}, log.WithField("PVB", pvb.Name),
+                func(pvb *velerov1api.PodVolumeBackup) bool {
+                    if pvb.Spec.Cancel {
+                        return false
+                    }
+
+                    pvb.Spec.Cancel = true
+                    pvb.Status.Message = fmt.Sprintf("PVB is in status %q during velero server restart, marking it as cancel", pvb.Status.Phase)
+
+                    return true
+                })
+
+            if err != nil {
+                log.WithError(errors.WithStack(err)).Errorf("failed to mark PVB %q cancel", pvb.GetName())
+                continue
+            }
+            log.WithField("PVB is marked for cancel due to server restart", pvb.GetName()).Warn(pvb.Status.Message)
+        }
+    }
+}
+
+func markPodVolumeRestoresCancel(ctx context.Context, client ctrlclient.Client, restore velerov1api.Restore, log logrus.FieldLogger) {
+    pvrs := &velerov1api.PodVolumeRestoreList{}
+
+    if err := client.List(ctx, pvrs, &ctrlclient.ListOptions{
+        Namespace: restore.GetNamespace(),
+        LabelSelector: labels.Set(map[string]string{
+            velerov1api.RestoreUIDLabel: string(restore.GetUID()),
+        }).AsSelector(),
+    }); err != nil {
+        log.WithError(errors.WithStack(err)).Error("failed to list PVRs")
+        return
+    }
+
+    for i := range pvrs.Items {
+        pvr := pvrs.Items[i]
+        if controller.IsLegacyPVR(&pvr) {
+            log.WithField("PVR", pvr.GetName()).Warn("Found a legacy PVR during velero server restart, cannot stop it")
+            continue
+        }
+
+        if pvr.Status.Phase == velerov1api.PodVolumeRestorePhaseAccepted ||
+            pvr.Status.Phase == velerov1api.PodVolumeRestorePhasePrepared ||
+            pvr.Status.Phase == velerov1api.PodVolumeRestorePhaseInProgress ||
+            pvr.Status.Phase == velerov1api.PodVolumeRestorePhaseNew ||
+            pvr.Status.Phase == "" {
+            err := controller.UpdatePVRWithRetry(ctx, client, types.NamespacedName{Namespace: pvr.Namespace, Name: pvr.Name}, log.WithField("PVR", pvr.Name),
+                func(pvr *velerov1api.PodVolumeRestore) bool {
+                    if pvr.Spec.Cancel {
+                        return false
+                    }
+
+                    pvr.Spec.Cancel = true
+                    pvr.Status.Message = fmt.Sprintf("PVR is in status %q during velero server restart, marking it as cancel", pvr.Status.Phase)
+
+                    return true
+                })
+
+            if err != nil {
+                log.WithError(errors.WithStack(err)).Errorf("failed to mark PVR %q cancel", pvr.GetName())
+                continue
+            }
+            log.WithField("PVR is marked for cancel due to server restart", pvr.GetName()).Warn(pvr.Status.Message)
+        }
+    }
+}
diff --git a/pkg/controller/data_download_controller.go b/pkg/controller/data_download_controller.go
index 53d83d98e..fe9f66ec0 100644
--- a/pkg/controller/data_download_controller.go
+++ b/pkg/controller/data_download_controller.go
@@ -885,7 +885,7 @@ func UpdateDataDownloadWithRetry(ctx context.Context, client client.Client, name
         err := client.Update(ctx, dd)
         if err != nil {
             if apierrors.IsConflict(err) {
-                log.Warnf("failed to update datadownload for %s/%s and will retry it", dd.Namespace, dd.Name)
+                log.Debugf("failed to update datadownload for %s/%s and will retry it", dd.Namespace, dd.Name)
                 return false, nil
             } else {
                 return false, errors.Wrapf(err, "error updating datadownload %s/%s", dd.Namespace, dd.Name)
diff --git a/pkg/controller/data_upload_controller.go b/pkg/controller/data_upload_controller.go
index 6ab65f172..0bee6a924 100644
--- a/pkg/controller/data_upload_controller.go
+++ b/pkg/controller/data_upload_controller.go
@@ -990,7 +990,7 @@ func UpdateDataUploadWithRetry(ctx context.Context, client client.Client, namesp
         err := client.Update(ctx, du)
         if err != nil {
             if apierrors.IsConflict(err) {
-                log.Warnf("failed to update dataupload for %s/%s and will retry it", du.Namespace, du.Name)
+                log.Debugf("failed to update dataupload for %s/%s and will retry it", du.Namespace, du.Name)
                 return false, nil
             } else {
                 return false, errors.Wrapf(err, "error updating dataupload with error %s/%s", du.Namespace, du.Name)
diff --git a/pkg/controller/pod_volume_backup_controller.go b/pkg/controller/pod_volume_backup_controller.go
index 6fde05784..bc66a9634 100644
--- a/pkg/controller/pod_volume_backup_controller.go
+++ b/pkg/controller/pod_volume_backup_controller.go
@@ -833,7 +833,7 @@ func UpdatePVBWithRetry(ctx context.Context, client client.Client, namespacedNam
         err := client.Update(ctx, pvb)
         if err != nil {
             if apierrors.IsConflict(err) {
-                log.Warnf("failed to update PVB for %s/%s and will retry it", pvb.Namespace, pvb.Name)
+                log.Debugf("failed to update PVB for %s/%s and will retry it", pvb.Namespace, pvb.Name)
                 return false, nil
             } else {
                 return false, errors.Wrapf(err, "error updating PVB with error %s/%s", pvb.Namespace, pvb.Name)
diff --git a/pkg/controller/pod_volume_restore_controller.go b/pkg/controller/pod_volume_restore_controller.go
index 41362871a..5d0b789bd 100644
--- a/pkg/controller/pod_volume_restore_controller.go
+++ b/pkg/controller/pod_volume_restore_controller.go
@@ -545,7 +545,7 @@ func (r *PodVolumeRestoreReconciler) closeDataPath(ctx context.Context, pvrName
 func (r *PodVolumeRestoreReconciler) SetupWithManager(mgr ctrl.Manager) error {
     gp := kube.NewGenericEventPredicate(func(object client.Object) bool {
         pvr := object.(*velerov1api.PodVolumeRestore)
-        if isLegacyPVR(pvr) {
+        if IsLegacyPVR(pvr) {
             return false
         }
 
@@ -570,7 +570,7 @@ func (r *PodVolumeRestoreReconciler) SetupWithManager(mgr ctrl.Manager) error {
 
     pred := kube.NewAllEventPredicate(func(obj client.Object) bool {
         pvr := obj.(*velerov1api.PodVolumeRestore)
-        return !isLegacyPVR(pvr)
+        return !IsLegacyPVR(pvr)
     })
 
     return ctrl.NewControllerManagedBy(mgr).
@@ -620,7 +620,7 @@ func (r *PodVolumeRestoreReconciler) findPVRForTargetPod(ctx context.Context, po
 
     requests := []reconcile.Request{}
     for _, item := range list.Items {
-        if isLegacyPVR(&item) {
+        if IsLegacyPVR(&item) {
             continue
         }
 
@@ -897,7 +897,7 @@ func UpdatePVRWithRetry(ctx context.Context, client client.Client, namespacedNam
         err := client.Update(ctx, pvr)
         if err != nil {
             if apierrors.IsConflict(err) {
-                log.Warnf("failed to update PVR for %s/%s and will retry it", pvr.Namespace, pvr.Name)
+                log.Debugf("failed to update PVR for %s/%s and will retry it", pvr.Namespace, pvr.Name)
                 return false, nil
             } else {
                 return false, errors.Wrapf(err, "error updating PVR %s/%s", pvr.Namespace, pvr.Name)
diff --git a/pkg/controller/pod_volume_restore_controller_legacy.go b/pkg/controller/pod_volume_restore_controller_legacy.go
index b03e27b0e..731b70db9 100644
--- a/pkg/controller/pod_volume_restore_controller_legacy.go
+++ b/pkg/controller/pod_volume_restore_controller_legacy.go
@@ -205,7 +205,7 @@ func (c *PodVolumeRestoreReconcilerLegacy) SetupWithManager(mgr ctrl.Manager) er
     // By watching the pods, we can trigger the PVR reconciliation again once the pod is finally scheduled on the node.
     pred := kube.NewAllEventPredicate(func(obj client.Object) bool {
         pvr := obj.(*velerov1api.PodVolumeRestore)
-        return isLegacyPVR(pvr)
+        return IsLegacyPVR(pvr)
     })
 
     return ctrl.NewControllerManagedBy(mgr).Named("podvolumerestorelegacy").
@@ -229,7 +229,7 @@ func (c *PodVolumeRestoreReconcilerLegacy) findVolumeRestoresForPod(ctx context.
 
     requests := []reconcile.Request{}
     for _, item := range list.Items {
-        if !isLegacyPVR(&item) {
+        if !IsLegacyPVR(&item) {
             continue
         }
 
@@ -359,6 +359,6 @@ func (c *PodVolumeRestoreReconcilerLegacy) closeDataPath(ctx context.Context, pv
     c.dataPathMgr.RemoveAsyncBR(pvbName)
 }
 
-func isLegacyPVR(pvr *velerov1api.PodVolumeRestore) bool {
+func IsLegacyPVR(pvr *velerov1api.PodVolumeRestore) bool {
     return pvr.Spec.UploaderType == uploader.ResticType
 }