Fix DBR stuck when CSI snapshot no longer exists in cloud provider (#9581)
Some checks failed
Run the E2E test on kind / get-go-version (push) Failing after 1m17s
Run the E2E test on kind / build (push) Has been skipped
Run the E2E test on kind / setup-test-matrix (push) Successful in 3s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / get-go-version (push) Successful in 19s
Main CI / Build (push) Failing after 37s
Close stale issues and PRs / stale (push) Successful in 18s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m29s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 40s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 50s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 53s

* Fix DBR stuck when CSI snapshot no longer exists in cloud provider

During backup deletion, VolumeSnapshotContentDeleteItemAction creates a
new VSC with the snapshot handle from the backup and polls for readiness.
If the underlying snapshot no longer exists (e.g., deleted externally),
the CSI driver reports Status.Error but checkVSCReadiness() only checks
ReadyToUse, causing it to poll for the full 10-minute timeout instead of
failing fast. Additionally, the newly created VSC is never cleaned up on
failure, leaving orphaned resources in the cluster.

This commit:
- Adds Status.Error detection in checkVSCReadiness() to fail immediately
  on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound)
- Cleans up the dangling VSC when readiness polling fails

Fixes #9579

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>

* Add changelog for PR #9581

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>

* Fix typo in pod_volume_test.go: colume -> volume

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>

---------

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
This commit is contained in:
Shubham Pampattiwar
2026-03-10 10:40:09 -07:00
committed by GitHub
parent 2145c57642
commit a31f4abcb3
3 changed files with 51 additions and 0 deletions

View File

@@ -0,0 +1 @@
Fix DBR stuck when CSI snapshot no longer exists in cloud provider

View File

@@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
return checkVSCReadiness(ctx, &snapCont, p.crClient)
},
); err != nil {
// Clean up the VSC we created since it can't become ready
if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name)
}
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
}
@@ -167,6 +171,13 @@ var checkVSCReadiness = func(
return true, nil
}
// Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound)
if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil {
return false, errors.Errorf(
"VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message,
)
}
return false, nil
}

View File

@@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) {
return false, errors.Errorf("test error case")
},
},
{
name: "Error case with CSI error, dangling VSC should be cleaned up",
vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(),
expectErr: true,
function: func(
ctx context.Context,
vsc *snapshotv1api.VolumeSnapshotContent,
client crclient.Client,
) (bool, error) {
return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name)
},
},
}
for _, test := range tests {
@@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) {
expectErr: false,
ready: false,
},
{
name: "VSC has error from CSI driver",
vsc: &snapshotv1api.VolumeSnapshotContent{
ObjectMeta: metav1.ObjectMeta{
Name: "vsc-1",
Namespace: "velero",
},
Status: &snapshotv1api.VolumeSnapshotContentStatus{
ReadyToUse: boolPtr(false),
Error: &snapshotv1api.VolumeSnapshotError{
Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."),
},
},
},
createVSC: true,
expectErr: true,
ready: false,
},
}
for _, test := range tests {
@@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) {
})
}
}
func boolPtr(b bool) *bool {
return &b
}
func stringPtr(s string) *string {
return &s
}