mirror of
https://github.com/vmware-tanzu/velero.git
synced 2026-01-07 05:46:37 +00:00
Merge pull request #6616 from qiuming-best/add-accept-label
Fix data mover controller bugs
This commit is contained in:
@@ -27,11 +27,13 @@ import (
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/utils/clock"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/builder"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
|
||||
"sigs.k8s.io/controller-runtime/pkg/event"
|
||||
"sigs.k8s.io/controller-runtime/pkg/predicate"
|
||||
"sigs.k8s.io/controller-runtime/pkg/reconcile"
|
||||
@@ -118,6 +120,33 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
|
||||
return r.errorOut(ctx, dd, errors.New("uninitialized generic exposer"), "uninitialized exposer", log)
|
||||
}
|
||||
|
||||
// Add finalizer
|
||||
// Logic for clear resources when datadownload been deleted
|
||||
if dd.DeletionTimestamp.IsZero() { // add finalizer for all cr at beginning
|
||||
if !isDataDownloadInFinalState(dd) && !controllerutil.ContainsFinalizer(dd, dataUploadDownloadFinalizer) {
|
||||
succeeded, err := r.exclusiveUpdateDataDownload(ctx, dd, func(dd *velerov2alpha1api.DataDownload) {
|
||||
controllerutil.AddFinalizer(dd, dataUploadDownloadFinalizer)
|
||||
})
|
||||
if err != nil {
|
||||
log.Errorf("failed to add finalizer with error %s for %s/%s", err.Error(), dd.Namespace, dd.Name)
|
||||
return ctrl.Result{}, err
|
||||
} else if !succeeded {
|
||||
log.Warnf("failed to add finilizer for %s/%s and will requeue later", dd.Namespace, dd.Name)
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
}
|
||||
} else if controllerutil.ContainsFinalizer(dd, dataUploadDownloadFinalizer) && !dd.Spec.Cancel && !isDataDownloadInFinalState(dd) {
|
||||
// when delete cr we need to clear up internal resources created by Velero, here we use the cancel mechanism
|
||||
// to help clear up resources instead of clear them directly in case of some conflict with Expose action
|
||||
if err := UpdateDataDownloadWithRetry(ctx, r.client, req.NamespacedName, log, func(dataDownload *velerov2alpha1api.DataDownload) {
|
||||
dataDownload.Spec.Cancel = true
|
||||
dataDownload.Status.Message = fmt.Sprintf("found a dataupload %s/%s is being deleted, mark it as cancel", dd.Namespace, dd.Name)
|
||||
}); err != nil {
|
||||
log.Errorf("failed to set cancel flag with error %s for %s/%s", err.Error(), dd.Namespace, dd.Name)
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
if dd.Status.Phase == "" || dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseNew {
|
||||
log.Info("Data download starting")
|
||||
|
||||
@@ -150,15 +179,44 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
|
||||
// And then only the controller who is in the same node could do the rest work.
|
||||
err = r.restoreExposer.Expose(ctx, getDataDownloadOwnerObject(dd), dd.Spec.TargetVolume.PVC, dd.Spec.TargetVolume.Namespace, hostingPodLabels, dd.Spec.OperationTimeout.Duration)
|
||||
if err != nil {
|
||||
return r.errorOut(ctx, dd, err, "error to start restore expose", log)
|
||||
if err := r.client.Get(ctx, req.NamespacedName, dd); err != nil {
|
||||
if !apierrors.IsNotFound(err) {
|
||||
return ctrl.Result{}, errors.Wrap(err, "getting DataUpload")
|
||||
}
|
||||
}
|
||||
if isDataDownloadInFinalState(dd) {
|
||||
log.Warnf("expose snapshot with err %v but it may caused by clean up resources in cancel action", err)
|
||||
r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))
|
||||
return ctrl.Result{}, nil
|
||||
} else {
|
||||
return r.errorOut(ctx, dd, err, "error to expose snapshot", log)
|
||||
}
|
||||
}
|
||||
|
||||
log.Info("Restore is exposed")
|
||||
|
||||
// we need to get CR again for it may canceled by datadownload controller on other
|
||||
// nodes when doing expose action, if a cancel action is detected we need to clear up the internal
|
||||
// resources created by velero during backup.
|
||||
if err := r.client.Get(ctx, req.NamespacedName, dd); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
log.Debug("Unable to find datadownload")
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
return ctrl.Result{}, errors.Wrap(err, "getting datadownload")
|
||||
}
|
||||
|
||||
// we need to clean up resources as resources created in Expose it may later than cancel action or prepare time
|
||||
// and need to clean up resources again
|
||||
if isDataDownloadInFinalState(dd) {
|
||||
r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
} else if dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseAccepted {
|
||||
if dd.Spec.Cancel {
|
||||
log.Debugf("Data download is been canceled %s in Phase %s", dd.GetName(), dd.Status.Phase)
|
||||
r.OnDataDownloadCancelled(ctx, dd.GetNamespace(), dd.GetName())
|
||||
r.TryCancelDataDownload(ctx, dd)
|
||||
} else if dd.Status.StartTimestamp != nil {
|
||||
if time.Since(dd.Status.StartTimestamp.Time) >= r.preparingTimeout {
|
||||
r.onPrepareTimeout(ctx, dd)
|
||||
@@ -249,7 +307,15 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
} else {
|
||||
log.Debugf("Data download now is in %s phase and do nothing by current %s controller", dd.Status.Phase, r.nodeName)
|
||||
// put the finalizer removal here because every CR eventually reaches a final status, so we can check for the finalizer and remove it in the final status
|
||||
// instead of intermediate state
|
||||
if isDataDownloadInFinalState(dd) && !dd.DeletionTimestamp.IsZero() && controllerutil.ContainsFinalizer(dd, dataUploadDownloadFinalizer) {
|
||||
original := dd.DeepCopy()
|
||||
controllerutil.RemoveFinalizer(dd, dataUploadDownloadFinalizer)
|
||||
if err := r.client.Patch(ctx, dd, client.MergeFrom(original)); err != nil {
|
||||
log.WithError(err).Error("error to remove finalizer")
|
||||
}
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
}
|
||||
@@ -353,6 +419,32 @@ func (r *DataDownloadReconciler) OnDataDownloadCancelled(ctx context.Context, na
|
||||
}
|
||||
}
|
||||
|
||||
func (r *DataDownloadReconciler) TryCancelDataDownload(ctx context.Context, dd *velerov2alpha1api.DataDownload) {
|
||||
log := r.logger.WithField("datadownload", dd.Name)
|
||||
log.Warn("Async fs backup data path canceled")
|
||||
|
||||
succeeded, err := r.exclusiveUpdateDataDownload(ctx, dd, func(dataDownload *velerov2alpha1api.DataDownload) {
|
||||
dataDownload.Status.Phase = velerov2alpha1api.DataDownloadPhaseCanceled
|
||||
if dataDownload.Status.StartTimestamp.IsZero() {
|
||||
dataDownload.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
|
||||
}
|
||||
dataDownload.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error updating datadownload status")
|
||||
return
|
||||
} else if !succeeded {
|
||||
log.Warn("conflict in updating datadownload status and will try it again later")
|
||||
return
|
||||
}
|
||||
|
||||
// success update
|
||||
r.metrics.RegisterDataDownloadCancel(r.nodeName)
|
||||
r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))
|
||||
r.closeDataPath(ctx, dd.Name)
|
||||
}
|
||||
|
||||
func (r *DataDownloadReconciler) OnDataDownloadProgress(ctx context.Context, namespace string, ddName string, progress *uploader.Progress) {
|
||||
log := r.logger.WithField("datadownload", ddName)
|
||||
|
||||
@@ -515,16 +607,28 @@ func (r *DataDownloadReconciler) acceptDataDownload(ctx context.Context, dd *vel
|
||||
|
||||
// For all data download controller in each node-agent will try to update download CR, and only one controller will success,
|
||||
// and the success one could handle later logic
|
||||
succeeded, err := r.exclusiveUpdateDataDownload(ctx, dd, func(dd *velerov2alpha1api.DataDownload) {
|
||||
dd.Status.Phase = velerov2alpha1api.DataDownloadPhaseAccepted
|
||||
dd.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
|
||||
})
|
||||
|
||||
updated := dd.DeepCopy()
|
||||
|
||||
updateFunc := func(datadownload *velerov2alpha1api.DataDownload) {
|
||||
datadownload.Status.Phase = velerov2alpha1api.DataDownloadPhaseAccepted
|
||||
datadownload.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
|
||||
labels := datadownload.GetLabels()
|
||||
if labels == nil {
|
||||
labels = make(map[string]string)
|
||||
}
|
||||
labels[acceptNodeLabelKey] = r.nodeName
|
||||
datadownload.SetLabels(labels)
|
||||
}
|
||||
|
||||
succeeded, err := r.exclusiveUpdateDataDownload(ctx, updated, updateFunc)
|
||||
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if succeeded {
|
||||
updateFunc(dd) // If update success, it's need to update du values in memory
|
||||
r.logger.WithField("DataDownload", dd.Name).Infof("This datadownload has been accepted by %s", r.nodeName)
|
||||
return true, nil
|
||||
}
|
||||
@@ -537,7 +641,6 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
|
||||
log := r.logger.WithField("DataDownload", dd.Name)
|
||||
|
||||
log.Info("Timeout happened for preparing datadownload")
|
||||
|
||||
succeeded, err := r.exclusiveUpdateDataDownload(ctx, dd, func(dd *velerov2alpha1api.DataDownload) {
|
||||
dd.Status.Phase = velerov2alpha1api.DataDownloadPhaseFailed
|
||||
dd.Status.Message = "timeout on preparing data download"
|
||||
@@ -562,13 +665,15 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
|
||||
|
||||
func (r *DataDownloadReconciler) exclusiveUpdateDataDownload(ctx context.Context, dd *velerov2alpha1api.DataDownload,
|
||||
updateFunc func(*velerov2alpha1api.DataDownload)) (bool, error) {
|
||||
updated := dd.DeepCopy()
|
||||
updateFunc(updated)
|
||||
updateFunc(dd)
|
||||
|
||||
err := r.client.Update(ctx, dd)
|
||||
|
||||
err := r.client.Update(ctx, updated)
|
||||
if err == nil {
|
||||
return true, nil
|
||||
} else if apierrors.IsConflict(err) {
|
||||
}
|
||||
// it won't rollback dd in memory when error
|
||||
if apierrors.IsConflict(err) {
|
||||
return false, nil
|
||||
} else {
|
||||
return false, err
|
||||
@@ -614,3 +719,31 @@ func findDataDownloadByPod(client client.Client, pod v1.Pod) (*velerov2alpha1api
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func isDataDownloadInFinalState(dd *velerov2alpha1api.DataDownload) bool {
|
||||
return dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseFailed ||
|
||||
dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseCanceled ||
|
||||
dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseCompleted
|
||||
}
|
||||
|
||||
func UpdateDataDownloadWithRetry(ctx context.Context, client client.Client, namespacedName types.NamespacedName, log *logrus.Entry, updateFunc func(dataDownload *velerov2alpha1api.DataDownload)) error {
|
||||
return wait.PollUntilWithContext(ctx, time.Second, func(ctx context.Context) (done bool, err error) {
|
||||
dd := &velerov2alpha1api.DataDownload{}
|
||||
if err := client.Get(ctx, namespacedName, dd); err != nil {
|
||||
return false, errors.Wrap(err, "getting DataDownload")
|
||||
}
|
||||
|
||||
updateFunc(dd)
|
||||
updateErr := client.Update(ctx, dd)
|
||||
if updateErr != nil {
|
||||
if apierrors.IsConflict(updateErr) {
|
||||
log.Warnf("failed to update datadownload for %s/%s and will retry it", dd.Namespace, dd.Name)
|
||||
return false, nil
|
||||
}
|
||||
log.Errorf("failed to update datadownload with error %s for %s/%s", updateErr.Error(), dd.Namespace, dd.Name)
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user