From 4cc1779fecd16904903f0ee2dae2ad79e853567d Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 10 Mar 2026 10:40:09 -0700 Subject: [PATCH 1/3] Fix DBR stuck when CSI snapshot no longer exists in cloud provider (#9581) * Fix DBR stuck when CSI snapshot no longer exists in cloud provider During backup deletion, VolumeSnapshotContentDeleteItemAction creates a new VSC with the snapshot handle from the backup and polls for readiness. If the underlying snapshot no longer exists (e.g., deleted externally), the CSI driver reports Status.Error but checkVSCReadiness() only checks ReadyToUse, causing it to poll for the full 10-minute timeout instead of failing fast. Additionally, the newly created VSC is never cleaned up on failure, leaving orphaned resources in the cluster. This commit: - Adds Status.Error detection in checkVSCReadiness() to fail immediately on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) - Cleans up the dangling VSC when readiness polling fails Fixes #9579 Signed-off-by: Shubham Pampattiwar * Add changelog for PR #9581 Signed-off-by: Shubham Pampattiwar * Fix typo in pod_volume_test.go: colume -> volume Signed-off-by: Shubham Pampattiwar --------- Signed-off-by: Shubham Pampattiwar --- .../unreleased/9581-shubham-pampattiwar | 1 + .../csi/volumesnapshotcontent_action.go | 11 ++++++ .../csi/volumesnapshotcontent_action_test.go | 39 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 changelogs/unreleased/9581-shubham-pampattiwar diff --git a/changelogs/unreleased/9581-shubham-pampattiwar b/changelogs/unreleased/9581-shubham-pampattiwar new file mode 100644 index 000000000..f369a8af5 --- /dev/null +++ b/changelogs/unreleased/9581-shubham-pampattiwar @@ -0,0 +1 @@ +Fix DBR stuck when CSI snapshot no longer exists in cloud provider diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action.go b/internal/delete/actions/csi/volumesnapshotcontent_action.go index d12c7c43a..7a6724df1 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action.go @@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( return checkVSCReadiness(ctx, &snapCont, p.crClient) }, ); err != nil { + // Clean up the VSC we created since it can't become ready + if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) { + p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name) + } return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name) } @@ -167,6 +171,13 @@ var checkVSCReadiness = func( return true, nil } + // Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) + if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil { + return false, errors.Errorf( + "VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message, + ) + } + return false, nil } diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go index 24baccdb2..7dbd6d7ff 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go @@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) { return false, errors.Errorf("test error case") }, }, + { + name: "Error case with CSI error, dangling VSC should be cleaned up", + vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), + backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), + expectErr: true, + function: func( + ctx context.Context, + vsc *snapshotv1api.VolumeSnapshotContent, + client crclient.Client, + ) (bool, error) { + return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name) + }, + }, } for _, test := range tests { @@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) { expectErr: false, ready: false, }, + { + name: "VSC has error from CSI driver", + vsc: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vsc-1", + Namespace: "velero", + }, + Status: &snapshotv1api.VolumeSnapshotContentStatus{ + ReadyToUse: boolPtr(false), + Error: &snapshotv1api.VolumeSnapshotError{ + Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."), + }, + }, + }, + createVSC: true, + expectErr: true, + ready: false, + }, } for _, test := range tests { @@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) { }) } } + +func boolPtr(b bool) *bool { + return &b +} + +func stringPtr(s string) *string { + return &s +} From 7cfd4af733a3ca8c4662accd2b57c76f8d4efb6c Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 10 Mar 2026 19:09:57 -0700 Subject: [PATCH 2/3] Add changelog for PR #9604 Signed-off-by: Shubham Pampattiwar --- changelogs/unreleased/9604-shubham-pampattiwar | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9604-shubham-pampattiwar diff --git a/changelogs/unreleased/9604-shubham-pampattiwar b/changelogs/unreleased/9604-shubham-pampattiwar new file mode 100644 index 000000000..f369a8af5 --- /dev/null +++ b/changelogs/unreleased/9604-shubham-pampattiwar @@ -0,0 +1 @@ +Fix DBR stuck when CSI snapshot no longer exists in cloud provider From 883befcdde406193cfe8bded8c7f694e4198e88e Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Wed, 11 Mar 2026 09:19:06 -0700 Subject: [PATCH 3/3] Remove cherry-picked changelog file from upstream PR Signed-off-by: Shubham Pampattiwar --- changelogs/unreleased/9581-shubham-pampattiwar | 1 - 1 file changed, 1 deletion(-) delete mode 100644 changelogs/unreleased/9581-shubham-pampattiwar diff --git a/changelogs/unreleased/9581-shubham-pampattiwar b/changelogs/unreleased/9581-shubham-pampattiwar deleted file mode 100644 index f369a8af5..000000000 --- a/changelogs/unreleased/9581-shubham-pampattiwar +++ /dev/null @@ -1 +0,0 @@ -Fix DBR stuck when CSI snapshot no longer exists in cloud provider