mirror of
https://github.com/vmware-tanzu/velero.git
synced 2026-05-02 13:25:48 +00:00
Fix DBR stuck when CSI snapshot no longer exists in cloud provider (#9581)
Some checks failed
Run the E2E test on kind / get-go-version (push) Failing after 1m17s
Run the E2E test on kind / build (push) Has been skipped
Run the E2E test on kind / setup-test-matrix (push) Successful in 3s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / get-go-version (push) Successful in 19s
Main CI / Build (push) Failing after 37s
Close stale issues and PRs / stale (push) Successful in 18s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m29s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 40s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 50s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 53s
Some checks failed
Run the E2E test on kind / get-go-version (push) Failing after 1m17s
Run the E2E test on kind / build (push) Has been skipped
Run the E2E test on kind / setup-test-matrix (push) Successful in 3s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / get-go-version (push) Successful in 19s
Main CI / Build (push) Failing after 37s
Close stale issues and PRs / stale (push) Successful in 18s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m29s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 40s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 50s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 53s
* Fix DBR stuck when CSI snapshot no longer exists in cloud provider During backup deletion, VolumeSnapshotContentDeleteItemAction creates a new VSC with the snapshot handle from the backup and polls for readiness. If the underlying snapshot no longer exists (e.g., deleted externally), the CSI driver reports Status.Error but checkVSCReadiness() only checks ReadyToUse, causing it to poll for the full 10-minute timeout instead of failing fast. Additionally, the newly created VSC is never cleaned up on failure, leaving orphaned resources in the cluster. This commit: - Adds Status.Error detection in checkVSCReadiness() to fail immediately on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) - Cleans up the dangling VSC when readiness polling fails Fixes #9579 Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Add changelog for PR #9581 Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Fix typo in pod_volume_test.go: colume -> volume Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> --------- Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
This commit is contained in:
committed by
GitHub
parent
2145c57642
commit
a31f4abcb3
1
changelogs/unreleased/9581-shubham-pampattiwar
Normal file
1
changelogs/unreleased/9581-shubham-pampattiwar
Normal file
@@ -0,0 +1 @@
|
||||
Fix DBR stuck when CSI snapshot no longer exists in cloud provider
|
||||
@@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
|
||||
return checkVSCReadiness(ctx, &snapCont, p.crClient)
|
||||
},
|
||||
); err != nil {
|
||||
// Clean up the VSC we created since it can't become ready
|
||||
if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
|
||||
p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name)
|
||||
}
|
||||
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
|
||||
}
|
||||
|
||||
@@ -167,6 +171,13 @@ var checkVSCReadiness = func(
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound)
|
||||
if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil {
|
||||
return false, errors.Errorf(
|
||||
"VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message,
|
||||
)
|
||||
}
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) {
|
||||
return false, errors.Errorf("test error case")
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Error case with CSI error, dangling VSC should be cleaned up",
|
||||
vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
|
||||
backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(),
|
||||
expectErr: true,
|
||||
function: func(
|
||||
ctx context.Context,
|
||||
vsc *snapshotv1api.VolumeSnapshotContent,
|
||||
client crclient.Client,
|
||||
) (bool, error) {
|
||||
return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name)
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) {
|
||||
expectErr: false,
|
||||
ready: false,
|
||||
},
|
||||
{
|
||||
name: "VSC has error from CSI driver",
|
||||
vsc: &snapshotv1api.VolumeSnapshotContent{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "vsc-1",
|
||||
Namespace: "velero",
|
||||
},
|
||||
Status: &snapshotv1api.VolumeSnapshotContentStatus{
|
||||
ReadyToUse: boolPtr(false),
|
||||
Error: &snapshotv1api.VolumeSnapshotError{
|
||||
Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."),
|
||||
},
|
||||
},
|
||||
},
|
||||
createVSC: true,
|
||||
expectErr: true,
|
||||
ready: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func boolPtr(b bool) *bool {
|
||||
return &b
|
||||
}
|
||||
|
||||
func stringPtr(s string) *string {
|
||||
return &s
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user