From 6c3d81a146ceda77d73150bc58e63a5c8329b0cf Mon Sep 17 00:00:00 2001 From: Quang Ngo Date: Mon, 2 Mar 2026 10:19:16 +1100 Subject: [PATCH 1/7] Add schedule_expected_interval_seconds metric Add a new Prometheus gauge metric that exposes the expected interval between consecutive scheduled backups. This enables dynamic alerting thresholds per schedule backups. Signed-off-by: Quang Ngo --- pkg/controller/schedule_controller.go | 7 +++ pkg/metrics/metrics.go | 22 +++++++ pkg/metrics/metrics_test.go | 84 +++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/pkg/controller/schedule_controller.go b/pkg/controller/schedule_controller.go index ec8894571..443b3c08b 100644 --- a/pkg/controller/schedule_controller.go +++ b/pkg/controller/schedule_controller.go @@ -129,6 +129,13 @@ func (c *scheduleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c } else { schedule.Status.Phase = velerov1.SchedulePhaseEnabled schedule.Status.ValidationErrors = nil + + // Compute expected interval between consecutive scheduled backup runs. + // Only meaningful when the cron expression is valid. 
+ now := c.clock.Now() + nextRun := cronSchedule.Next(now) + nextNextRun := cronSchedule.Next(nextRun) + c.metrics.SetScheduleExpectedIntervalSeconds(schedule.Name, nextNextRun.Sub(nextRun).Seconds()) } scheduleNeedsPatch := false diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 30e67a7b6..86d78028c 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -80,6 +80,9 @@ const ( DataDownloadFailureTotal = "data_download_failure_total" DataDownloadCancelTotal = "data_download_cancel_total" + // schedule metrics + scheduleExpectedIntervalSeconds = "schedule_expected_interval_seconds" + // repo maintenance metrics repoMaintenanceSuccessTotal = "repo_maintenance_success_total" repoMaintenanceFailureTotal = "repo_maintenance_failure_total" @@ -347,6 +350,14 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel, backupNameLabel}, ), + scheduleExpectedIntervalSeconds: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: scheduleExpectedIntervalSeconds, + Help: "Expected interval between consecutive scheduled backups, in seconds", + }, + []string{scheduleLabel}, + ), repoMaintenanceSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, @@ -644,6 +655,9 @@ func (m *ServerMetrics) RemoveSchedule(scheduleName string) { if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok { c.DeleteLabelValues(scheduleName, "") } + if g, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec); ok { + g.DeleteLabelValues(scheduleName) + } } // InitMetricsForNode initializes counter metrics for a node. @@ -758,6 +772,14 @@ func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string, } } +// SetScheduleExpectedIntervalSeconds records the expected interval in seconds, +// between consecutive backups for a schedule. 
+func (m *ServerMetrics) SetScheduleExpectedIntervalSeconds(scheduleName string, seconds float64) { + if g, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec); ok { + g.WithLabelValues(scheduleName).Set(seconds) + } +} + // SetBackupTotal records the current number of existent backups. func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) { if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok { diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 184e496ab..a24f2bf33 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -259,6 +259,90 @@ func TestMultipleAdhocBackupsShareMetrics(t *testing.T) { assert.Equal(t, float64(1), validationFailureMetric, "All adhoc validation failures should be counted together") } +// TestSetScheduleExpectedIntervalSeconds verifies that the expected interval metric +// is properly recorded for schedules. +func TestSetScheduleExpectedIntervalSeconds(t *testing.T) { + tests := []struct { + name string + scheduleName string + intervalSeconds float64 + description string + }{ + { + name: "every 5 minutes schedule", + scheduleName: "frequent-backup", + intervalSeconds: 300, + description: "Expected interval should be 5m in seconds", + }, + { + name: "daily schedule", + scheduleName: "daily-backup", + intervalSeconds: 86400, + description: "Expected interval should be 24h in seconds", + }, + { + name: "monthly schedule", + scheduleName: "monthly-backup", + intervalSeconds: 2678400, // 31 days in seconds + description: "Expected interval should be 31 days in seconds", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + m := NewServerMetrics() + m.SetScheduleExpectedIntervalSeconds(tc.scheduleName, tc.intervalSeconds) + + metric := getMetricValue(t, m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec), tc.scheduleName) + assert.Equal(t, tc.intervalSeconds, metric, tc.description) + }) + } +} + +// 
TestScheduleExpectedIntervalNotInitializedByDefault verifies that the expected +// interval metric is not initialized by InitSchedule, so it only appears for +// schedules with a valid cron expression. +func TestScheduleExpectedIntervalNotInitializedByDefault(t *testing.T) { + m := NewServerMetrics() + m.InitSchedule("test-schedule") + + // The metric should not have any values after InitSchedule + ch := make(chan prometheus.Metric, 1) + m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec).Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should not be initialized by InitSchedule") +} + +// TestRemoveScheduleCleansUpExpectedInterval verifies that RemoveSchedule +// cleans up the expected interval metric. +func TestRemoveScheduleCleansUpExpectedInterval(t *testing.T) { + m := NewServerMetrics() + m.InitSchedule("test-schedule") + m.SetScheduleExpectedIntervalSeconds("test-schedule", 3600) + + // Verify metric exists + metric := getMetricValue(t, m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec), "test-schedule") + assert.Equal(t, float64(3600), metric) + + // Remove schedule and verify metric is cleaned up + m.RemoveSchedule("test-schedule") + + ch := make(chan prometheus.Metric, 1) + m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec).Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should be removed after RemoveSchedule") +} + // TestInitScheduleWithEmptyName verifies that InitSchedule works correctly // with an empty schedule name (for adhoc backups). 
func TestInitScheduleWithEmptyName(t *testing.T) { From 1c08af84614e7f1197d9fe5b9a117668851c1afd Mon Sep 17 00:00:00 2001 From: Quang Ngo Date: Mon, 2 Mar 2026 10:49:14 +1100 Subject: [PATCH 2/7] Add changelog for #9570 Signed-off-by: Quang Ngo --- changelogs/unreleased/9570-H-M-Quang-Ngo | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9570-H-M-Quang-Ngo diff --git a/changelogs/unreleased/9570-H-M-Quang-Ngo b/changelogs/unreleased/9570-H-M-Quang-Ngo new file mode 100644 index 000000000..603cd75e5 --- /dev/null +++ b/changelogs/unreleased/9570-H-M-Quang-Ngo @@ -0,0 +1 @@ +Add schedule_expected_interval_seconds metric for dynamic backup alerting thresholds (#9559) From bbec46f6ee2d4bf4c7ca513c45c700f66d032fef Mon Sep 17 00:00:00 2001 From: dongqingcc Date: Thu, 26 Feb 2026 16:38:53 +0800 Subject: [PATCH 3/7] Add e2e test case for PR 9366: Use hookIndex for recording multiple restore exec hooks. Signed-off-by: dongqingcc --- test/e2e/basic/restore_exec_hooks.go | 150 +++++++++++++++++++++++++++ test/e2e/e2e_suite_test.go | 6 ++ 2 files changed, 156 insertions(+) create mode 100644 test/e2e/basic/restore_exec_hooks.go diff --git a/test/e2e/basic/restore_exec_hooks.go b/test/e2e/basic/restore_exec_hooks.go new file mode 100644 index 000000000..fc1b8ceb8 --- /dev/null +++ b/test/e2e/basic/restore_exec_hooks.go @@ -0,0 +1,150 @@ +/* +Copyright the Velero contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package basic + +import ( + "fmt" + "path" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/vmware-tanzu/velero/test/e2e/test" + . "github.com/vmware-tanzu/velero/test/e2e/test" + "github.com/vmware-tanzu/velero/test/util/common" + . "github.com/vmware-tanzu/velero/test/util/k8s" +) + +// RestoreExecHooks tests that a pod with multiple restore exec hooks does not hang +// at the Finalizing phase during restore (Issue #9359 / PR #9366). +type RestoreExecHooks struct { + TestCase + podName string +} + +var RestoreExecHooksTest func() = test.TestFunc(&RestoreExecHooks{}) + +func (r *RestoreExecHooks) Init() error { + Expect(r.TestCase.Init()).To(Succeed()) + r.CaseBaseName = "restore-exec-hooks-" + r.UUIDgen + r.BackupName = "backup-" + r.CaseBaseName + r.RestoreName = "restore-" + r.CaseBaseName + r.podName = "pod-multiple-hooks" + r.NamespacesTotal = 1 + r.NSIncluded = &[]string{} + + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + *r.NSIncluded = append(*r.NSIncluded, createNSName) + } + + r.TestMsg = &test.TestMSG{ + Desc: "Restore pod with multiple restore exec hooks", + Text: "Should successfully backup and restore without hanging at Finalizing phase", + FailedMSG: "Failed to successfully backup and restore pod with multiple hooks", + } + + r.BackupArgs = []string{ + "create", "--namespace", r.VeleroCfg.VeleroNamespace, "backup", r.BackupName, + "--include-namespaces", strings.Join(*r.NSIncluded, ","), + "--default-volumes-to-fs-backup", "--wait", + } + + r.RestoreArgs = []string{ + "create", "--namespace", r.VeleroCfg.VeleroNamespace, "restore", r.RestoreName, + "--from-backup", r.BackupName, "--wait", + } + + return nil +} + +func (r *RestoreExecHooks) CreateResources() error { + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + + By(fmt.Sprintf("Creating namespace 
%s", createNSName), func() { + Expect(CreateNamespace(r.Ctx, r.Client, createNSName)). + To(Succeed(), fmt.Sprintf("Failed to create namespace %s", createNSName)) + }) + + // Prepare images and commands adaptively for the target OS + imageAddress := LinuxTestImage + initCommand := `["/bin/sh", "-c", "echo init-hook-done"]` + execCommand1 := `["/bin/sh", "-c", "echo hook1"]` + execCommand2 := `["/bin/sh", "-c", "echo hook2"]` + + if r.VeleroCfg.WorkerOS == common.WorkerOSLinux && r.VeleroCfg.ImageRegistryProxy != "" { + imageAddress = path.Join(r.VeleroCfg.ImageRegistryProxy, LinuxTestImage) + } else if r.VeleroCfg.WorkerOS == common.WorkerOSWindows { + imageAddress = WindowTestImage + initCommand = `["cmd", "/c", "echo init-hook-done"]` + execCommand1 = `["cmd", "/c", "echo hook1"]` + execCommand2 = `["cmd", "/c", "echo hook2"]` + } + + // Mix an InitContainer restore hook with multiple exec post-restore hooks. + // This guarantees that the loop index 'i' does not match 'hook.hookIndex' (Issue #9359), + // so that the bug is properly reproduced and the fix is verified. 
+ ann := map[string]string{ + // Inject InitContainer Restore Hook + "init.hook.restore.velero.io/container-image": imageAddress, + "init.hook.restore.velero.io/container-name": "test-init-hook", + "init.hook.restore.velero.io/command": initCommand, + + // Inject multiple Exec Restore Hooks + "post.hook.restore.velero.io/test1.command": execCommand1, + "post.hook.restore.velero.io/test1.container": r.podName, + "post.hook.restore.velero.io/test2.command": execCommand2, + "post.hook.restore.velero.io/test2.container": r.podName, + } + + By(fmt.Sprintf("Creating pod %s with multiple restore hooks in namespace %s", r.podName, createNSName), func() { + _, err := CreatePod( + r.Client, + createNSName, + r.podName, + "", // No storage class needed + "", // No PVC needed + []string{}, // No volumes + nil, + ann, + r.VeleroCfg.ImageRegistryProxy, + r.VeleroCfg.WorkerOS, + ) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to create pod with hooks in namespace %s", createNSName)) + }) + + By(fmt.Sprintf("Waiting for pod %s to be ready", r.podName), func() { + err := WaitForPods(r.Ctx, r.Client, createNSName, []string{r.podName}) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to wait for pod %s in namespace %s", r.podName, createNSName)) + }) + } + return nil +} + +func (r *RestoreExecHooks) Verify() error { + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + + By(fmt.Sprintf("Verifying pod %s in namespace %s after restore", r.podName, createNSName), func() { + err := WaitForPods(r.Ctx, r.Client, createNSName, []string{r.podName}) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to verify pod %s in namespace %s after restore", r.podName, createNSName)) + }) + } + return nil +} diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index f0d1c9c2e..c19c2d52f 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -440,6 +440,12 @@ var _ = Describe( 
StorageClasssChangingTest, ) +var _ = Describe( + "Restore phase does not block at Finalizing when a container has multiple exec hooks", + Label("Basic", "Hooks"), + RestoreExecHooksTest, +) + var _ = Describe( "Backup/restore of 2500 namespaces", Label("Scale", "LongTime"), From d1cc30355385093a1e8839662dda8ac93ce5d04c Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Mon, 9 Mar 2026 15:38:29 +0800 Subject: [PATCH 4/7] issue 9586: set latest doc to 1.18 Signed-off-by: Lyndon-Li --- pkg/util/podvolume/pod_volume_test.go | 2 +- site/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index a3484c2e3..87a6f3a0f 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -156,7 +156,7 @@ func TestGetVolumesByPod(t *testing.T) { Volumes: []corev1api.Volume{ // PVB Volumes {Name: "pvbPV1"}, {Name: "pvbPV2"}, {Name: "pvbPV3"}, - /// Excluded from PVB because colume mounting default service account token + /// Excluded from PVB because volume mounting default service account token {Name: "default-token-5xq45"}, }, }, diff --git a/site/config.yaml b/site/config.yaml index ed80914a4..8eded5b59 100644 --- a/site/config.yaml +++ b/site/config.yaml @@ -12,7 +12,7 @@ params: hero: backgroundColor: med-blue versioning: true - latest: v1.17 + latest: v1.18 versions: - main - v1.18 From a9b3cfa062f61970d2980785cd4244d20834fdb0 Mon Sep 17 00:00:00 2001 From: Xun Jiang Date: Tue, 10 Mar 2026 15:42:27 +0800 Subject: [PATCH 5/7] Disable Algolia docs search. Revert PR 6105. 
Signed-off-by: Xun Jiang --- site/algolia-crawler.json | 90 ---------------------------- site/layouts/docs/docs.html | 20 ------- site/layouts/partials/head-docs.html | 2 - 3 files changed, 112 deletions(-) delete mode 100644 site/algolia-crawler.json diff --git a/site/algolia-crawler.json b/site/algolia-crawler.json deleted file mode 100644 index 06dc083d7..000000000 --- a/site/algolia-crawler.json +++ /dev/null @@ -1,90 +0,0 @@ -new Crawler({ - rateLimit: 8, - maxDepth: 10, - startUrls: ["https://velero.io/docs", "https://velero.io/"], - renderJavaScript: false, - sitemaps: ["https://velero.io/sitemap.xml"], - ignoreCanonicalTo: false, - discoveryPatterns: ["https://velero.io/**"], - schedule: "at 6:39 PM on Friday", - actions: [ - { - indexName: "velero_new", - pathsToMatch: ["https://velero.io/docs**/**"], - recordExtractor: ({ helpers }) => { - return helpers.docsearch({ - recordProps: { - lvl1: ["header h1", "article h1", "main h1", "h1", "head > title"], - content: ["article p, article li", "main p, main li", "p, li"], - lvl0: { - defaultValue: "Documentation", - }, - lvl2: ["article h2", "main h2", "h2"], - lvl3: ["article h3", "main h3", "h3"], - lvl4: ["article h4", "main h4", "h4"], - lvl5: ["article h5", "main h5", "h5"], - lvl6: ["article h6", "main h6", "h6"], - version: "#dropdownMenuButton", - }, - aggregateContent: true, - recordVersion: "v3", - }); - }, - }, - ], - initialIndexSettings: { - velero_new: { - attributesForFaceting: ["type", "lang", "version"], - attributesToRetrieve: [ - "hierarchy", - "content", - "anchor", - "url", - "url_without_anchor", - "type", - "version", - ], - attributesToHighlight: ["hierarchy", "content"], - attributesToSnippet: ["content:10"], - camelCaseAttributes: ["hierarchy", "content"], - searchableAttributes: [ - "unordered(hierarchy.lvl0)", - "unordered(hierarchy.lvl1)", - "unordered(hierarchy.lvl2)", - "unordered(hierarchy.lvl3)", - "unordered(hierarchy.lvl4)", - "unordered(hierarchy.lvl5)", - 
"unordered(hierarchy.lvl6)", - "content", - ], - distinct: true, - attributeForDistinct: "url", - customRanking: [ - "desc(weight.pageRank)", - "desc(weight.level)", - "asc(weight.position)", - ], - ranking: [ - "words", - "filters", - "typo", - "attribute", - "proximity", - "exact", - "custom", - ], - highlightPreTag: '', - highlightPostTag: "", - minWordSizefor1Typo: 3, - minWordSizefor2Typos: 7, - allowTyposOnNumericTokens: false, - minProximity: 1, - ignorePlurals: true, - advancedSyntax: true, - attributeCriteriaComputedByMinProximity: true, - removeWordsIfNoResults: "allOptional", - }, - }, - appId: "9ASKQJ1HR3", - apiKey: "6392a5916af73b73df2406d3aef5ca45", -}); \ No newline at end of file diff --git a/site/layouts/docs/docs.html b/site/layouts/docs/docs.html index 6d2a3f57f..11e6cf9e9 100644 --- a/site/layouts/docs/docs.html +++ b/site/layouts/docs/docs.html @@ -27,16 +27,6 @@
{{ .Render "versions" }}
-
- -
{{ .Render "nav" }}
@@ -58,16 +48,6 @@ {{ .Render "footer" }}
- - diff --git a/site/layouts/partials/head-docs.html b/site/layouts/partials/head-docs.html index 5ebae8c24..c92837b2f 100644 --- a/site/layouts/partials/head-docs.html +++ b/site/layouts/partials/head-docs.html @@ -8,6 +8,4 @@ {{ $styles := resources.Get "styles.scss" | toCSS $options | resources.Fingerprint }} {{/* TODO {% seo %}*/}} - - From a31f4abcb392bdf20d61e2d75602f01b522b8743 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 10 Mar 2026 10:40:09 -0700 Subject: [PATCH 6/7] Fix DBR stuck when CSI snapshot no longer exists in cloud provider (#9581) * Fix DBR stuck when CSI snapshot no longer exists in cloud provider During backup deletion, VolumeSnapshotContentDeleteItemAction creates a new VSC with the snapshot handle from the backup and polls for readiness. If the underlying snapshot no longer exists (e.g., deleted externally), the CSI driver reports Status.Error but checkVSCReadiness() only checks ReadyToUse, causing it to poll for the full 10-minute timeout instead of failing fast. Additionally, the newly created VSC is never cleaned up on failure, leaving orphaned resources in the cluster. 
This commit: - Adds Status.Error detection in checkVSCReadiness() to fail immediately on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) - Cleans up the dangling VSC when readiness polling fails Fixes #9579 Signed-off-by: Shubham Pampattiwar * Add changelog for PR #9581 Signed-off-by: Shubham Pampattiwar * Fix typo in pod_volume_test.go: colume -> volume Signed-off-by: Shubham Pampattiwar --------- Signed-off-by: Shubham Pampattiwar --- .../unreleased/9581-shubham-pampattiwar | 1 + .../csi/volumesnapshotcontent_action.go | 11 ++++++ .../csi/volumesnapshotcontent_action_test.go | 39 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 changelogs/unreleased/9581-shubham-pampattiwar diff --git a/changelogs/unreleased/9581-shubham-pampattiwar b/changelogs/unreleased/9581-shubham-pampattiwar new file mode 100644 index 000000000..f369a8af5 --- /dev/null +++ b/changelogs/unreleased/9581-shubham-pampattiwar @@ -0,0 +1 @@ +Fix DBR stuck when CSI snapshot no longer exists in cloud provider diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action.go b/internal/delete/actions/csi/volumesnapshotcontent_action.go index d12c7c43a..7a6724df1 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action.go @@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( return checkVSCReadiness(ctx, &snapCont, p.crClient) }, ); err != nil { + // Clean up the VSC we created since it can't become ready + if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) { + p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name) + } return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name) } @@ -167,6 +171,13 @@ var checkVSCReadiness = func( return true, nil } + // Fail fast on permanent CSI driver errors (e.g., 
InvalidSnapshot.NotFound) + if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil { + return false, errors.Errorf( + "VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message, + ) + } + return false, nil } diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go index 24baccdb2..7dbd6d7ff 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go @@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) { return false, errors.Errorf("test error case") }, }, + { + name: "Error case with CSI error, dangling VSC should be cleaned up", + vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), + backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), + expectErr: true, + function: func( + ctx context.Context, + vsc *snapshotv1api.VolumeSnapshotContent, + client crclient.Client, + ) (bool, error) { + return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name) + }, + }, } for _, test := range tests { @@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) { expectErr: false, ready: false, }, + { + name: "VSC has error from CSI driver", + vsc: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vsc-1", + Namespace: "velero", + }, + Status: &snapshotv1api.VolumeSnapshotContentStatus{ + ReadyToUse: boolPtr(false), + Error: &snapshotv1api.VolumeSnapshotError{ + Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."), + }, + }, + }, + createVSC: true, + 
expectErr: true, + ready: false, + }, } for _, test := range tests { @@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) { }) } } + +func boolPtr(b bool) *bool { + return &b +} + +func stringPtr(s string) *string { + return &s +} From afe7df17d4af4a067eb77ca2af9d0ffbceac9b81 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 10 Mar 2026 13:12:47 -0700 Subject: [PATCH 7/7] Add itemOperationTimeout to Schedule API type docs (#9599) The itemOperationTimeout field was missing from the Schedule API type documentation even though it is supported in the Schedule CRD template. This led users to believe the field was not available per-schedule. Fixes #9598 Signed-off-by: Shubham Pampattiwar --- site/content/docs/main/api-types/schedule.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/site/content/docs/main/api-types/schedule.md b/site/content/docs/main/api-types/schedule.md index c89fe60d7..ef3df4324 100644 --- a/site/content/docs/main/api-types/schedule.md +++ b/site/content/docs/main/api-types/schedule.md @@ -63,6 +63,10 @@ spec: # CSI VolumeSnapshot status turns to ReadyToUse during creation, before # returning error as timeout. The default value is 10 minute. csiSnapshotTimeout: 10m + # ItemOperationTimeout specifies the time used to wait for + # asynchronous BackupItemAction operations. + # The default value is 4 hours. + itemOperationTimeout: 4h # resourcePolicy specifies the referenced resource policies that backup should follow # optional resourcePolicy: