Merge pull request #8591 from reasonerjt/finalize-async-op

Skip patching the PV in finalization for failed operation
This commit is contained in:
lyndon-li
2025-01-10 14:02:42 +08:00
committed by GitHub
6 changed files with 415 additions and 83 deletions

View File

@@ -22,6 +22,12 @@ import (
"sync"
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
storagev1api "k8s.io/api/storage/v1"
"github.com/pkg/errors"
@@ -155,6 +161,12 @@ func (r *restoreFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Req
restoredPVCList := volume.RestoredPVCFromRestoredResourceList(restoredResourceList)
restoreItemOperations, err := backupStore.GetRestoreItemOperations(restore.Name)
if err != nil {
log.WithError(err).Error("error getting itemOperationList")
return ctrl.Result{}, errors.Wrap(err, "error getting itemOperationList")
}
finalizerCtx := &finalizerContext{
logger: log,
restore: restore,
@@ -163,6 +175,9 @@ func (r *restoreFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Req
restoredPVCList: restoredPVCList,
multiHookTracker: r.multiHookTracker,
resourceTimeout: r.resourceTimeout,
restoreItemOperationList: restoreItemOperationList{
items: restoreItemOperations,
},
}
warnings, errs := finalizerCtx.execute()
@@ -239,16 +254,44 @@ func (r *restoreFinalizerReconciler) finishProcessing(restorePhase velerov1api.R
return kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.Client)
}
type restoreItemOperationList struct {
items []*itemoperation.RestoreOperation
}
func (r *restoreItemOperationList) selectByResource(group, resource, ns, name string) []*itemoperation.RestoreOperation {
var res []*itemoperation.RestoreOperation
rid := velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: group,
Resource: resource,
},
Namespace: ns,
Name: name,
}
for _, item := range r.items {
if item != nil && item.Spec.ResourceIdentifier == rid {
res = append(res, item)
}
}
return res
}
// SelectByPVC filters the restore item operation list by PVC namespace and name.
func (r *restoreItemOperationList) SelectByPVC(ns, name string) []*itemoperation.RestoreOperation {
return r.selectByResource("", "persistentvolumeclaims", ns, name)
}
// finalizerContext includes all the dependencies required by finalization tasks and
// a function execute() to orderly implement task logic.
type finalizerContext struct {
logger logrus.FieldLogger
restore *velerov1api.Restore
crClient client.Client
volumeInfo []*volume.BackupVolumeInfo
restoredPVCList map[string]struct{}
multiHookTracker *hook.MultiHookTracker
resourceTimeout time.Duration
logger logrus.FieldLogger
restore *velerov1api.Restore
crClient client.Client
volumeInfo []*volume.BackupVolumeInfo
restoredPVCList map[string]struct{}
restoreItemOperationList restoreItemOperationList
multiHookTracker *hook.MultiHookTracker
resourceTimeout time.Duration
}
func (ctx *finalizerContext) execute() (results.Result, results.Result) { //nolint:unparam //temporarily ignore the lint report: result 0 is always nil (unparam)
@@ -310,6 +353,17 @@ func (ctx *finalizerContext) patchDynamicPVWithVolumeInfo() (errs results.Result
return false, err
}
// Check whether the async operation to populate the PVC is successful. If it's not, will skip patching the PV, instead of waiting.
operations := ctx.restoreItemOperationList.SelectByPVC(pvc.Namespace, pvc.Name)
for _, op := range operations {
if op.Spec.RestoreItemAction == constant.PluginCSIPVCRestoreRIA &&
op.Status.Phase != itemoperation.OperationPhaseCompleted {
log.Warnf("skipping PV patch, because the operation to restore the PVC is not completed, "+
"operation: %s, phase: %s", op.Spec.OperationID, op.Status.Phase)
return true, nil
}
}
// We are handling a common but specific scenario where a PVC is in a pending state and uses a storage class with
// VolumeBindingMode set to WaitForFirstConsumer. In this case, the PV patch step is skipped to avoid
// failures due to the PVC not being bound, which could cause a timeout and result in a failed restore.

View File

@@ -23,6 +23,11 @@ import (
"testing"
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
@@ -145,6 +150,7 @@ func TestRestoreFinalizerReconcile(t *testing.T) {
if test.restore != nil && test.restore.Namespace == velerov1api.DefaultNamespace {
require.NoError(t, r.Client.Create(context.Background(), test.restore))
backupStore.On("GetRestoredResourceList", test.restore.Name).Return(map[string][]string{}, nil)
backupStore.On("GetRestoreItemOperations", test.restore.Name).Return([]*itemoperation.RestoreOperation{}, nil)
}
if test.backup != nil {
assert.NoError(t, r.Client.Create(context.Background(), test.backup))
@@ -627,3 +633,112 @@ func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) {
})
}
}
func TestRestoreOperationList(t *testing.T) {
var empty []*itemoperation.RestoreOperation
tests := []struct {
name string
items []*itemoperation.RestoreOperation
inputPVCNS string
inputPVCName string
expected []*itemoperation.RestoreOperation
}{
{
name: "no restore operations",
items: []*itemoperation.RestoreOperation{},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: empty,
},
{
name: "one operation with matched info and a nil element",
items: []*itemoperation.RestoreOperation{
nil,
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "persistentvolumeclaims",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: []*itemoperation.RestoreOperation{
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "persistentvolumeclaims",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
},
{
name: "one operation with incorrect resource type",
items: []*itemoperation.RestoreOperation{
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "configmaps",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: empty,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
l := restoreItemOperationList{
items: tt.items,
}
assert.Equal(t, tt.expected, l.SelectByPVC(tt.inputPVCNS, tt.inputPVCName))
})
}
}