Issue 8344: constrain data path exposure (#9064)
Some checks failed
Run the E2E test on kind / build (push) Failing after 7m38s
Run the E2E test on kind / setup-test-matrix (push) Successful in 4s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / Build (push) Failing after 39s
Close stale issues and PRs / stale (push) Successful in 22s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m32s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 1m41s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 1m30s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 1m18s

* issue 8344: constrain data path exposure.

Signed-off-by: Lyndon-Li <lyonghui@vmware.com>
This commit is contained in:
lyndon-li
2025-07-18 13:32:45 +08:00
committed by GitHub
parent 29a8bc4492
commit 06d305ea47
16 changed files with 671 additions and 57 deletions

View File

@@ -64,6 +64,7 @@ type DataDownloadReconciler struct {
restoreExposer exposer.GenericRestoreExposer
nodeName string
dataPathMgr *datapath.Manager
vgdpCounter *exposer.VgdpCounter
loadAffinity []*kube.LoadAffinity
restorePVCConfig nodeagent.RestorePVC
podResources corev1api.ResourceRequirements
@@ -77,6 +78,7 @@ func NewDataDownloadReconciler(
mgr manager.Manager,
kubeClient kubernetes.Interface,
dataPathMgr *datapath.Manager,
counter *exposer.VgdpCounter,
loadAffinity []*kube.LoadAffinity,
restorePVCConfig nodeagent.RestorePVC,
podResources corev1api.ResourceRequirements,
@@ -95,6 +97,7 @@ func NewDataDownloadReconciler(
restoreExposer: exposer.NewGenericRestoreExposer(kubeClient, logger),
restorePVCConfig: restorePVCConfig,
dataPathMgr: dataPathMgr,
vgdpCounter: counter,
loadAffinity: loadAffinity,
podResources: podResources,
preparingTimeout: preparingTimeout,
@@ -220,13 +223,26 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
}
if dd.Status.Phase == "" || dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseNew {
log.Info("Data download starting")
if dd.Spec.Cancel {
log.Debugf("Data download is canceled in Phase %s", dd.Status.Phase)
r.tryCancelDataDownload(ctx, dd, "")
return ctrl.Result{}, nil
}
if r.vgdpCounter != nil && r.vgdpCounter.IsConstrained(ctx, r.logger) {
log.Debug("Data path initiation is constrained, requeue later")
return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 5}, nil
}
if _, err := r.getTargetPVC(ctx, dd); err != nil {
log.WithField("error", err).Debugf("Cannot find target PVC for DataDownload yet. Retry later.")
return ctrl.Result{Requeue: true}, nil
}
log.Info("Data download starting")
accepted, err := r.acceptDataDownload(ctx, dd)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error accepting the data download %s", dd.Name)
@@ -239,12 +255,6 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Info("Data download is accepted")
if dd.Spec.Cancel {
log.Debugf("Data download is been canceled %s in Phase %s", dd.GetName(), dd.Status.Phase)
r.OnDataDownloadCancelled(ctx, dd.GetNamespace(), dd.GetName())
return ctrl.Result{}, nil
}
exposeParam, err := r.setupExposeParam(dd)
if err != nil {
return r.errorOut(ctx, dd, err, "failed to set exposer parameters", log)
@@ -312,7 +322,7 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
dd.Name, dd.Namespace, result.ByPod.HostingPod.Name, result.ByPod.HostingContainer, dd.Name, callbacks, false, log)
if err != nil {
if err == datapath.ConcurrentLimitExceed {
log.Info("Data path instance is concurrent limited requeue later")
log.Debug("Data path instance is concurrent limited requeue later")
return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 5}, nil
} else {
return r.errorOut(ctx, dd, err, "error to create data path", log)
@@ -337,6 +347,8 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
dd.Status.Phase = velerov2alpha1api.DataDownloadPhaseInProgress
dd.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
delete(dd.Labels, exposer.ExposeOnGoingLabel)
return true
}); err != nil {
log.WithError(err).Warnf("Failed to update datadownload %s to InProgress, will data path close and retry", dd.Name)
@@ -454,6 +466,8 @@ func (r *DataDownloadReconciler) OnDataDownloadCompleted(ctx context.Context, na
dd.Status.Phase = velerov2alpha1api.DataDownloadPhaseCompleted
dd.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
delete(dd.Labels, exposer.ExposeOnGoingLabel)
return true
}); err != nil {
log.WithError(err).Error("error updating data download status")
@@ -504,6 +518,8 @@ func (r *DataDownloadReconciler) OnDataDownloadCancelled(ctx context.Context, na
}
dd.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
delete(dd.Labels, exposer.ExposeOnGoingLabel)
return true
}); err != nil {
log.WithError(err).Error("error updating data download status")
@@ -525,6 +541,8 @@ func (r *DataDownloadReconciler) tryCancelDataDownload(ctx context.Context, dd *
if message != "" {
dataDownload.Status.Message = message
}
delete(dataDownload.Labels, exposer.ExposeOnGoingLabel)
})
if err != nil {
@@ -702,6 +720,8 @@ func (r *DataDownloadReconciler) updateStatusToFailed(ctx context.Context, dd *v
dd.Status.Message = errors.WithMessage(err, msg).Error()
dd.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
delete(dd.Labels, exposer.ExposeOnGoingLabel)
return true
}); patchErr != nil {
log.WithError(patchErr).Error("error updating DataDownload status")
@@ -724,6 +744,11 @@ func (r *DataDownloadReconciler) acceptDataDownload(ctx context.Context, dd *vel
datadownload.Status.Phase = velerov2alpha1api.DataDownloadPhaseAccepted
datadownload.Status.AcceptedByNode = r.nodeName
datadownload.Status.AcceptedTimestamp = &metav1.Time{Time: r.Clock.Now()}
if datadownload.Labels == nil {
datadownload.Labels = make(map[string]string)
}
datadownload.Labels[exposer.ExposeOnGoingLabel] = "true"
}
succeeded, err := funcExclusiveUpdateDataDownload(ctx, r.client, updated, updateFunc)
@@ -749,6 +774,8 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
succeeded, err := funcExclusiveUpdateDataDownload(ctx, r.client, dd, func(dd *velerov2alpha1api.DataDownload) {
dd.Status.Phase = velerov2alpha1api.DataDownloadPhaseFailed
dd.Status.Message = "timeout on preparing data download"
delete(dd.Labels, exposer.ExposeOnGoingLabel)
})
if err != nil {