Mark in-progress CRs as failed when starting the server

Mark in-progress CRs as failed when starting the server

Fixes #4953

Signed-off-by: Wenkai Yin(尹文开) <yinw@vmware.com>
This commit is contained in:
Wenkai Yin(尹文开)
2022-06-06 22:56:29 +08:00
parent 9af031b84a
commit 6fa4d7d606
10 changed files with 204 additions and 186 deletions

View File

@@ -74,6 +74,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"github.com/vmware-tanzu/velero/internal/storage"
@@ -378,6 +379,8 @@ func (s *server) run() error {
return err
}
markInProgressCRsFailed(s.ctx, s.mgr.GetConfig(), s.mgr.GetScheme(), s.namespace, s.logger)
if err := s.runControllers(s.config.defaultVolumeSnapshotLocations); err != nil {
return err
}
@@ -925,3 +928,65 @@ func (w *CSIInformerFactoryWrapper) WaitForCacheSync(stopCh <-chan struct{}) map
}
return nil
}
// if there is a restarting during the reconciling of backups/restores/etc, these CRs may be stuck in progress status
// markInProgressCRsFailed tries to mark the in progress CRs as failed when starting the server to avoid the issue
func markInProgressCRsFailed(ctx context.Context, cfg *rest.Config, scheme *runtime.Scheme, namespace string, log logrus.FieldLogger) {
// the function is called before starting the controller manager, the embedded client isn't ready to use, so create a new one here
client, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme})
if err != nil {
log.WithError(errors.WithStack(err)).Error("failed to create client")
return
}
markInProgressBackupsFailed(ctx, client, namespace, log)
markInProgressRestoresFailed(ctx, client, namespace, log)
}
func markInProgressBackupsFailed(ctx context.Context, client ctrlclient.Client, namespace string, log logrus.FieldLogger) {
backups := &velerov1api.BackupList{}
if err := client.List(ctx, backups, &ctrlclient.MatchingFields{"metadata.namespace": namespace}); err != nil {
log.WithError(errors.WithStack(err)).Error("failed to list backups")
return
}
for _, backup := range backups.Items {
if backup.Status.Phase != velerov1api.BackupPhaseInProgress {
log.Debugf("the status of backup %q is %q, skip", backup.GetName(), backup.Status.Phase)
continue
}
updated := backup.DeepCopy()
updated.Status.Phase = velerov1api.BackupPhaseFailed
updated.Status.FailureReason = fmt.Sprintf("get a backup with status %q during the server starting, mark it as %q", velerov1api.BackupPhaseInProgress, updated.Status.Phase)
updated.Status.CompletionTimestamp = &metav1.Time{Time: time.Now()}
if err := client.Patch(ctx, updated, ctrlclient.MergeFrom(&backup)); err != nil {
log.WithError(errors.WithStack(err)).Errorf("failed to patch backup %q", backup.GetName())
continue
}
log.WithField("backup", backup.GetName()).Warn(updated.Status.FailureReason)
}
}
func markInProgressRestoresFailed(ctx context.Context, client ctrlclient.Client, namespace string, log logrus.FieldLogger) {
restores := &velerov1api.RestoreList{}
if err := client.List(ctx, restores, &ctrlclient.MatchingFields{"metadata.namespace": namespace}); err != nil {
log.WithError(errors.WithStack(err)).Error("failed to list restores")
return
}
for _, restore := range restores.Items {
if restore.Status.Phase != velerov1api.RestorePhaseInProgress {
log.Debugf("the status of restore %q is %q, skip", restore.GetName(), restore.Status.Phase)
continue
}
updated := restore.DeepCopy()
updated.Status.Phase = velerov1api.RestorePhaseFailed
updated.Status.FailureReason = fmt.Sprintf("get a restore with status %q during the server starting, mark it as %q", velerov1api.RestorePhaseInProgress, updated.Status.Phase)
updated.Status.CompletionTimestamp = &metav1.Time{Time: time.Now()}
if err := client.Patch(ctx, updated, ctrlclient.MergeFrom(&restore)); err != nil {
log.WithError(errors.WithStack(err)).Errorf("failed to patch restore %q", restore.GetName())
continue
}
log.WithField("restore", restore.GetName()).Warn(updated.Status.FailureReason)
}
}