Merge pull request #7437 from Lyndon-Li/issue-fix-7036

Issue 7036: node selection for data mover backup
This commit is contained in:
lyndon-li
2024-03-29 17:04:40 +08:00
committed by GitHub
17 changed files with 678 additions and 139 deletions

View File

@@ -214,7 +214,10 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
} else if dd.Status.Phase == velerov2alpha1api.DataDownloadPhaseAccepted {
if dd.Spec.Cancel {
log.Debugf("Data download is been canceled %s in Phase %s", dd.GetName(), dd.Status.Phase)
r.TryCancelDataDownload(ctx, dd)
r.TryCancelDataDownload(ctx, dd, "")
} else if peekErr := r.restoreExposer.PeekExposed(ctx, getDataDownloadOwnerObject(dd)); peekErr != nil {
r.TryCancelDataDownload(ctx, dd, fmt.Sprintf("found a dataupload %s/%s with expose error: %s. mark it as cancel", dd.Namespace, dd.Name, peekErr))
log.Errorf("Cancel dd %s/%s because of expose error %s", dd.Namespace, dd.Name, peekErr)
} else if dd.Status.StartTimestamp != nil {
if time.Since(dd.Status.StartTimestamp.Time) >= r.preparingTimeout {
r.onPrepareTimeout(ctx, dd)
@@ -418,7 +421,7 @@ func (r *DataDownloadReconciler) OnDataDownloadCancelled(ctx context.Context, na
}
}
func (r *DataDownloadReconciler) TryCancelDataDownload(ctx context.Context, dd *velerov2alpha1api.DataDownload) {
func (r *DataDownloadReconciler) TryCancelDataDownload(ctx context.Context, dd *velerov2alpha1api.DataDownload, message string) {
log := r.logger.WithField("datadownload", dd.Name)
log.Warn("Async fs backup data path canceled")
@@ -428,6 +431,7 @@ func (r *DataDownloadReconciler) TryCancelDataDownload(ctx context.Context, dd *
dataDownload.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
}
dataDownload.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
dataDownload.Status.Message = message
})
if err != nil {

View File

@@ -174,6 +174,7 @@ func TestDataDownloadReconcile(t *testing.T) {
needCreateFSBR bool
isExposeErr bool
isGetExposeErr bool
isPeekExposeErr bool
isNilExposer bool
isFSBRInitErr bool
isFSBRRestoreErr bool
@@ -302,6 +303,12 @@ func TestDataDownloadReconcile(t *testing.T) {
dd: dataDownloadBuilder().Phase(velerov2alpha1api.DataDownloadPhaseAccepted).StartTimestamp(&metav1.Time{Time: time.Now().Add(-time.Minute * 5)}).Result(),
expected: dataDownloadBuilder().Phase(velerov2alpha1api.DataDownloadPhaseFailed).Result(),
},
{
name: "peek error",
dd: dataDownloadBuilder().Phase(velerov2alpha1api.DataDownloadPhaseAccepted).Result(),
isPeekExposeErr: true,
expected: dataDownloadBuilder().Phase(velerov2alpha1api.DataDownloadPhaseCanceled).Result(),
},
{
name: "dataDownload with enabled cancel",
dd: func() *velerov2alpha1api.DataDownload {
@@ -369,7 +376,7 @@ func TestDataDownloadReconcile(t *testing.T) {
return fsBR
}
if test.isExposeErr || test.isGetExposeErr || test.isNilExposer || test.notNilExpose {
if test.isExposeErr || test.isGetExposeErr || test.isPeekExposeErr || test.isNilExposer || test.notNilExpose {
if test.isNilExposer {
r.restoreExposer = nil
} else {
@@ -383,6 +390,8 @@ func TestDataDownloadReconcile(t *testing.T) {
ep.On("GetExposed", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&exposer.ExposeResult{ByPod: exposer.ExposeByPod{HostingPod: hostingPod, VolumeName: "test-pvc"}}, nil)
} else if test.isGetExposeErr {
ep.On("GetExposed", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil, errors.New("Error to get restore exposer"))
} else if test.isPeekExposeErr {
ep.On("PeekExposed", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(errors.New("fake-peek-error"))
}
if !test.notMockCleanUp {
@@ -801,7 +810,7 @@ func TestTryCancelDataDownload(t *testing.T) {
err = r.client.Create(ctx, test.dd)
require.NoError(t, err)
r.TryCancelDataDownload(ctx, test.dd)
r.TryCancelDataDownload(ctx, test.dd, "")
if test.expectedErr == "" {
assert.NoError(t, err)

View File

@@ -48,6 +48,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/datapath"
"github.com/vmware-tanzu/velero/pkg/exposer"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
@@ -74,12 +75,13 @@ type DataUploadReconciler struct {
logger logrus.FieldLogger
snapshotExposerList map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer
dataPathMgr *datapath.Manager
loadAffinity *nodeagent.LoadAffinity
preparingTimeout time.Duration
metrics *metrics.ServerMetrics
}
func NewDataUploadReconciler(client client.Client, kubeClient kubernetes.Interface, csiSnapshotClient snapshotter.SnapshotV1Interface,
dataPathMgr *datapath.Manager, repoEnsurer *repository.Ensurer, clock clocks.WithTickerAndDelayedExecution,
dataPathMgr *datapath.Manager, loadAffinity *nodeagent.LoadAffinity, repoEnsurer *repository.Ensurer, clock clocks.WithTickerAndDelayedExecution,
cred *credentials.CredentialGetter, nodeName string, fs filesystem.Interface, preparingTimeout time.Duration, log logrus.FieldLogger, metrics *metrics.ServerMetrics) *DataUploadReconciler {
return &DataUploadReconciler{
client: client,
@@ -93,6 +95,7 @@ func NewDataUploadReconciler(client client.Client, kubeClient kubernetes.Interfa
repoEnsurer: repoEnsurer,
snapshotExposerList: map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer{velerov2alpha1api.SnapshotTypeCSI: exposer.NewCSISnapshotExposer(kubeClient, csiSnapshotClient, log)},
dataPathMgr: dataPathMgr,
loadAffinity: loadAffinity,
preparingTimeout: preparingTimeout,
metrics: metrics,
}
@@ -224,7 +227,10 @@ func (r *DataUploadReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// we don't want to update CR into cancel status forcely as it may conflict with CR update in Expose action
// we could retry when the CR requeue in periodcally
log.Debugf("Data upload is been canceled %s in Phase %s", du.GetName(), du.Status.Phase)
r.TryCancelDataUpload(ctx, du)
r.TryCancelDataUpload(ctx, du, "")
} else if peekErr := ep.PeekExposed(ctx, getOwnerObject(du)); peekErr != nil {
r.TryCancelDataUpload(ctx, du, fmt.Sprintf("found a dataupload %s/%s with expose error: %s. mark it as cancel", du.Namespace, du.Name, peekErr))
log.Errorf("Cancel du %s/%s because of expose error %s", du.Namespace, du.Name, peekErr)
} else if du.Status.StartTimestamp != nil {
if time.Since(du.Status.StartTimestamp.Time) >= r.preparingTimeout {
r.onPrepareTimeout(ctx, du)
@@ -440,7 +446,7 @@ func (r *DataUploadReconciler) OnDataUploadCancelled(ctx context.Context, namesp
}
// TryCancelDataUpload clear up resources only when update success
func (r *DataUploadReconciler) TryCancelDataUpload(ctx context.Context, du *velerov2alpha1api.DataUpload) {
func (r *DataUploadReconciler) TryCancelDataUpload(ctx context.Context, du *velerov2alpha1api.DataUpload, message string) {
log := r.logger.WithField("dataupload", du.Name)
log.Warn("Async fs backup data path canceled")
succeeded, err := r.exclusiveUpdateDataUpload(ctx, du, func(dataUpload *velerov2alpha1api.DataUpload) {
@@ -449,6 +455,7 @@ func (r *DataUploadReconciler) TryCancelDataUpload(ctx context.Context, du *vele
dataUpload.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
}
dataUpload.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
dataUpload.Status.Message = message
})
if err != nil {
@@ -825,6 +832,7 @@ func (r *DataUploadReconciler) setupExposeParam(du *velerov2alpha1api.DataUpload
OperationTimeout: du.Spec.OperationTimeout.Duration,
ExposeTimeout: r.preparingTimeout,
VolumeSize: pvc.Spec.Resources.Requests[corev1.ResourceStorage],
Affinity: r.loadAffinity,
}, nil
}
return nil, nil

View File

@@ -232,7 +232,7 @@ func initDataUploaderReconcilerWithError(needError ...error) (*DataUploadReconci
if err != nil {
return nil, err
}
return NewDataUploadReconciler(fakeClient, fakeKubeClient, fakeSnapshotClient.SnapshotV1(), dataPathMgr, nil,
return NewDataUploadReconciler(fakeClient, fakeKubeClient, fakeSnapshotClient.SnapshotV1(), dataPathMgr, nil, nil,
testclocks.NewFakeClock(now), &credentials.CredentialGetter{FromFile: credentialFileStore}, "test_node", fakeFS, time.Minute*5, velerotest.NewLogger(), metrics.NewServerMetrics()), nil
}
@@ -252,6 +252,7 @@ func dataUploadBuilder() *builder.DataUploadBuilder {
type fakeSnapshotExposer struct {
kubeClient kbclient.Client
clock clock.WithTickerAndDelayedExecution
peekErr error
}
func (f *fakeSnapshotExposer) Expose(ctx context.Context, ownerObject corev1.ObjectReference, param interface{}) error {
@@ -283,6 +284,10 @@ func (f *fakeSnapshotExposer) GetExposed(ctx context.Context, du corev1.ObjectRe
return &exposer.ExposeResult{ByPod: exposer.ExposeByPod{HostingPod: pod, VolumeName: dataUploadName}}, nil
}
func (f *fakeSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev1.ObjectReference) error {
return f.peekErr
}
func (f *fakeSnapshotExposer) CleanUp(context.Context, corev1.ObjectReference, string, string) {
}
@@ -330,6 +335,7 @@ func TestReconcile(t *testing.T) {
expectedRequeue ctrl.Result
expectedErrMsg string
needErrs []bool
peekErr error
}{
{
name: "Dataupload is not initialized",
@@ -420,6 +426,13 @@ func TestReconcile(t *testing.T) {
du: dataUploadBuilder().Phase(velerov2alpha1api.DataUploadPhaseAccepted).SnapshotType(fakeSnapshotType).StartTimestamp(&metav1.Time{Time: time.Now().Add(-time.Minute * 5)}).Result(),
expected: dataUploadBuilder().Phase(velerov2alpha1api.DataUploadPhaseFailed).Result(),
},
{
name: "peek error",
du: dataUploadBuilder().Phase(velerov2alpha1api.DataUploadPhaseAccepted).SnapshotType(fakeSnapshotType).Result(),
peekErr: errors.New("fake-peek-error"),
expectedProcessed: true,
expected: dataUploadBuilder().Phase(velerov2alpha1api.DataUploadPhaseCanceled).Result(),
},
{
name: "Dataupload with enabled cancel",
pod: builder.ForPod(velerov1api.DefaultNamespace, dataUploadName).Volumes(&corev1.Volume{Name: "dataupload-1"}).Result(),
@@ -494,7 +507,7 @@ func TestReconcile(t *testing.T) {
}
if test.du.Spec.SnapshotType == fakeSnapshotType {
r.snapshotExposerList = map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer{fakeSnapshotType: &fakeSnapshotExposer{r.client, r.Clock}}
r.snapshotExposerList = map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer{fakeSnapshotType: &fakeSnapshotExposer{r.client, r.Clock, test.peekErr}}
} else if test.du.Spec.SnapshotType == velerov2alpha1api.SnapshotTypeCSI {
r.snapshotExposerList = map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer{velerov2alpha1api.SnapshotTypeCSI: exposer.NewCSISnapshotExposer(r.kubeClient, r.csiSnapshotClient, velerotest.NewLogger())}
}
@@ -874,7 +887,7 @@ func TestTryCancelDataUpload(t *testing.T) {
err = r.client.Create(ctx, test.dd)
require.NoError(t, err)
r.TryCancelDataUpload(ctx, test.dd)
r.TryCancelDataUpload(ctx, test.dd, "")
if test.expectedErr == "" {
assert.NoError(t, err)