diff --git a/changelogs/unreleased/9570-H-M-Quang-Ngo b/changelogs/unreleased/9570-H-M-Quang-Ngo new file mode 100644 index 000000000..603cd75e5 --- /dev/null +++ b/changelogs/unreleased/9570-H-M-Quang-Ngo @@ -0,0 +1 @@ +Add schedule_expected_interval_seconds metric for dynamic backup alerting thresholds (#9559) diff --git a/changelogs/unreleased/9581-shubham-pampattiwar b/changelogs/unreleased/9581-shubham-pampattiwar new file mode 100644 index 000000000..f369a8af5 --- /dev/null +++ b/changelogs/unreleased/9581-shubham-pampattiwar @@ -0,0 +1 @@ +Fix DBR stuck when CSI snapshot no longer exists in cloud provider diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action.go b/internal/delete/actions/csi/volumesnapshotcontent_action.go index d12c7c43a..7a6724df1 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action.go @@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( return checkVSCReadiness(ctx, &snapCont, p.crClient) }, ); err != nil { + // Clean up the VSC we created since it can't become ready + if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) { + p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name) + } return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name) } @@ -167,6 +171,13 @@ var checkVSCReadiness = func( return true, nil } + // Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) + if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil { + return false, errors.Errorf( + "VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message, + ) + } + return false, nil } diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go index 24baccdb2..7dbd6d7ff 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go @@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) { return false, errors.Errorf("test error case") }, }, + { + name: "Error case with CSI error, dangling VSC should be cleaned up", + vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), + backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), + expectErr: true, + function: func( + ctx context.Context, + vsc *snapshotv1api.VolumeSnapshotContent, + client crclient.Client, + ) (bool, error) { + return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name) + }, + }, } for _, test := range tests { @@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) { expectErr: false, ready: false, }, + { + name: "VSC has error from CSI driver", + vsc: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vsc-1", + Namespace: "velero", + }, + Status: &snapshotv1api.VolumeSnapshotContentStatus{ + ReadyToUse: boolPtr(false), + Error: &snapshotv1api.VolumeSnapshotError{ + Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."), + }, + }, + }, + createVSC: true, + expectErr: true, + ready: false, + }, } for _, test := range tests { @@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) { }) } } + +func boolPtr(b bool) *bool { + return &b +} + +func stringPtr(s string) *string { + return &s +} diff --git a/pkg/controller/schedule_controller.go b/pkg/controller/schedule_controller.go index ec8894571..443b3c08b 100644 --- a/pkg/controller/schedule_controller.go +++ b/pkg/controller/schedule_controller.go @@ -129,6 +129,13 @@ func (c *scheduleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c } else { schedule.Status.Phase = velerov1.SchedulePhaseEnabled schedule.Status.ValidationErrors = nil + + // Compute expected interval between consecutive scheduled backup runs. + // Only meaningful when the cron expression is valid. + now := c.clock.Now() + nextRun := cronSchedule.Next(now) + nextNextRun := cronSchedule.Next(nextRun) + c.metrics.SetScheduleExpectedIntervalSeconds(schedule.Name, nextNextRun.Sub(nextRun).Seconds()) } scheduleNeedsPatch := false diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 30e67a7b6..86d78028c 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -80,6 +80,9 @@ const ( DataDownloadFailureTotal = "data_download_failure_total" DataDownloadCancelTotal = "data_download_cancel_total" + // schedule metrics + scheduleExpectedIntervalSeconds = "schedule_expected_interval_seconds" + // repo maintenance metrics repoMaintenanceSuccessTotal = "repo_maintenance_success_total" repoMaintenanceFailureTotal = "repo_maintenance_failure_total" @@ -347,6 +350,14 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel, backupNameLabel}, ), + scheduleExpectedIntervalSeconds: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: scheduleExpectedIntervalSeconds, + Help: "Expected interval between consecutive scheduled backups, in seconds", + }, + []string{scheduleLabel}, + ), repoMaintenanceSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, @@ -644,6 +655,9 @@ func (m *ServerMetrics) RemoveSchedule(scheduleName string) { if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok { c.DeleteLabelValues(scheduleName, "") } + if g, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec); ok { + g.DeleteLabelValues(scheduleName) + } } // InitMetricsForNode initializes counter metrics for a node. @@ -758,6 +772,14 @@ func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string, } } +// SetScheduleExpectedIntervalSeconds records the expected interval in seconds, +// between consecutive backups for a schedule. +func (m *ServerMetrics) SetScheduleExpectedIntervalSeconds(scheduleName string, seconds float64) { + if g, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec); ok { + g.WithLabelValues(scheduleName).Set(seconds) + } +} + // SetBackupTotal records the current number of existent backups. func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) { if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok { diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 184e496ab..a24f2bf33 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -259,6 +259,90 @@ func TestMultipleAdhocBackupsShareMetrics(t *testing.T) { assert.Equal(t, float64(1), validationFailureMetric, "All adhoc validation failures should be counted together") } +// TestSetScheduleExpectedIntervalSeconds verifies that the expected interval metric +// is properly recorded for schedules. +func TestSetScheduleExpectedIntervalSeconds(t *testing.T) { + tests := []struct { + name string + scheduleName string + intervalSeconds float64 + description string + }{ + { + name: "every 5 minutes schedule", + scheduleName: "frequent-backup", + intervalSeconds: 300, + description: "Expected interval should be 5m in seconds", + }, + { + name: "daily schedule", + scheduleName: "daily-backup", + intervalSeconds: 86400, + description: "Expected interval should be 24h in seconds", + }, + { + name: "monthly schedule", + scheduleName: "monthly-backup", + intervalSeconds: 2678400, // 31 days in seconds + description: "Expected interval should be 31 days in seconds", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + m := NewServerMetrics() + m.SetScheduleExpectedIntervalSeconds(tc.scheduleName, tc.intervalSeconds) + + metric := getMetricValue(t, m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec), tc.scheduleName) + assert.Equal(t, tc.intervalSeconds, metric, tc.description) + }) + } +} + +// TestScheduleExpectedIntervalNotInitializedByDefault verifies that the expected +// interval metric is not initialized by InitSchedule, so it only appears for +// schedules with a valid cron expression. +func TestScheduleExpectedIntervalNotInitializedByDefault(t *testing.T) { + m := NewServerMetrics() + m.InitSchedule("test-schedule") + + // The metric should not have any values after InitSchedule + ch := make(chan prometheus.Metric, 1) + m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec).Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should not be initialized by InitSchedule") +} + +// TestRemoveScheduleCleansUpExpectedInterval verifies that RemoveSchedule +// cleans up the expected interval metric. +func TestRemoveScheduleCleansUpExpectedInterval(t *testing.T) { + m := NewServerMetrics() + m.InitSchedule("test-schedule") + m.SetScheduleExpectedIntervalSeconds("test-schedule", 3600) + + // Verify metric exists + metric := getMetricValue(t, m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec), "test-schedule") + assert.Equal(t, float64(3600), metric) + + // Remove schedule and verify metric is cleaned up + m.RemoveSchedule("test-schedule") + + ch := make(chan prometheus.Metric, 1) + m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec).Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should be removed after RemoveSchedule") +} + // TestInitScheduleWithEmptyName verifies that InitSchedule works correctly // with an empty schedule name (for adhoc backups). func TestInitScheduleWithEmptyName(t *testing.T) { diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index a3484c2e3..87a6f3a0f 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -156,7 +156,7 @@ func TestGetVolumesByPod(t *testing.T) { Volumes: []corev1api.Volume{ // PVB Volumes {Name: "pvbPV1"}, {Name: "pvbPV2"}, {Name: "pvbPV3"}, - /// Excluded from PVB because colume mounting default service account token + /// Excluded from PVB because volume mounting default service account token {Name: "default-token-5xq45"}, }, }, diff --git a/site/algolia-crawler.json b/site/algolia-crawler.json deleted file mode 100644 index 06dc083d7..000000000 --- a/site/algolia-crawler.json +++ /dev/null @@ -1,90 +0,0 @@ -new Crawler({ - rateLimit: 8, - maxDepth: 10, - startUrls: ["https://velero.io/docs", "https://velero.io/"], - renderJavaScript: false, - sitemaps: ["https://velero.io/sitemap.xml"], - ignoreCanonicalTo: false, - discoveryPatterns: ["https://velero.io/**"], - schedule: "at 6:39 PM on Friday", - actions: [ - { - indexName: "velero_new", - pathsToMatch: ["https://velero.io/docs**/**"], - recordExtractor: ({ helpers }) => { - return helpers.docsearch({ - recordProps: { - lvl1: ["header h1", "article h1", "main h1", "h1", "head > title"], - content: ["article p, article li", "main p, main li", "p, li"], - lvl0: { - defaultValue: "Documentation", - }, - lvl2: ["article h2", "main h2", "h2"], - lvl3: ["article h3", "main h3", "h3"], - lvl4: ["article h4", "main h4", "h4"], - lvl5: ["article h5", "main h5", "h5"], - lvl6: ["article h6", "main h6", "h6"], - version: "#dropdownMenuButton", - }, - aggregateContent: true, - recordVersion: "v3", - }); - }, - }, - ], - initialIndexSettings: { - velero_new: { - attributesForFaceting: ["type", "lang", "version"], - attributesToRetrieve: [ - "hierarchy", - "content", - "anchor", - "url", - "url_without_anchor", - "type", - "version", - ], - attributesToHighlight: ["hierarchy", "content"], - attributesToSnippet: ["content:10"], - camelCaseAttributes: ["hierarchy", "content"], - searchableAttributes: [ - "unordered(hierarchy.lvl0)", - "unordered(hierarchy.lvl1)", - "unordered(hierarchy.lvl2)", - "unordered(hierarchy.lvl3)", - "unordered(hierarchy.lvl4)", - "unordered(hierarchy.lvl5)", - "unordered(hierarchy.lvl6)", - "content", - ], - distinct: true, - attributeForDistinct: "url", - customRanking: [ - "desc(weight.pageRank)", - "desc(weight.level)", - "asc(weight.position)", - ], - ranking: [ - "words", - "filters", - "typo", - "attribute", - "proximity", - "exact", - "custom", - ], - highlightPreTag: '', - highlightPostTag: "", - minWordSizefor1Typo: 3, - minWordSizefor2Typos: 7, - allowTyposOnNumericTokens: false, - minProximity: 1, - ignorePlurals: true, - advancedSyntax: true, - attributeCriteriaComputedByMinProximity: true, - removeWordsIfNoResults: "allOptional", - }, - }, - appId: "9ASKQJ1HR3", - apiKey: "6392a5916af73b73df2406d3aef5ca45", -}); \ No newline at end of file diff --git a/site/config.yaml b/site/config.yaml index ed80914a4..8eded5b59 100644 --- a/site/config.yaml +++ b/site/config.yaml @@ -12,7 +12,7 @@ params: hero: backgroundColor: med-blue versioning: true - latest: v1.17 + latest: v1.18 versions: - main - v1.18 diff --git a/site/content/docs/main/api-types/schedule.md b/site/content/docs/main/api-types/schedule.md index c89fe60d7..ef3df4324 100644 --- a/site/content/docs/main/api-types/schedule.md +++ b/site/content/docs/main/api-types/schedule.md @@ -63,6 +63,10 @@ spec: # CSI VolumeSnapshot status turns to ReadyToUse during creation, before # returning error as timeout. The default value is 10 minute. csiSnapshotTimeout: 10m + # ItemOperationTimeout specifies the time used to wait for + # asynchronous BackupItemAction operations + # The default value is 4 hour. + itemOperationTimeout: 4h # resourcePolicy specifies the referenced resource policies that backup should follow # optional resourcePolicy: diff --git a/site/layouts/docs/docs.html b/site/layouts/docs/docs.html index 6d2a3f57f..11e6cf9e9 100644 --- a/site/layouts/docs/docs.html +++ b/site/layouts/docs/docs.html @@ -27,16 +27,6 @@
{{ .Render "versions" }}
-
- -
{{ .Render "nav" }}
@@ -58,16 +48,6 @@ {{ .Render "footer" }}
- - diff --git a/site/layouts/partials/head-docs.html b/site/layouts/partials/head-docs.html index 5ebae8c24..c92837b2f 100644 --- a/site/layouts/partials/head-docs.html +++ b/site/layouts/partials/head-docs.html @@ -8,6 +8,4 @@ {{ $styles := resources.Get "styles.scss" | toCSS $options | resources.Fingerprint }} {{/* TODO {% seo %}*/}} - - diff --git a/test/e2e/basic/restore_exec_hooks.go b/test/e2e/basic/restore_exec_hooks.go new file mode 100644 index 000000000..fc1b8ceb8 --- /dev/null +++ b/test/e2e/basic/restore_exec_hooks.go @@ -0,0 +1,150 @@ +/* +Copyright the Velero contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package basic + +import ( + "fmt" + "path" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/vmware-tanzu/velero/test/e2e/test" + . "github.com/vmware-tanzu/velero/test/e2e/test" + "github.com/vmware-tanzu/velero/test/util/common" + . "github.com/vmware-tanzu/velero/test/util/k8s" +) + +// RestoreExecHooks tests that a pod with multiple restore exec hooks does not hang +// at the Finalizing phase during restore (Issue #9359 / PR #9366). +type RestoreExecHooks struct { + TestCase + podName string +} + +var RestoreExecHooksTest func() = test.TestFunc(&RestoreExecHooks{}) + +func (r *RestoreExecHooks) Init() error { + Expect(r.TestCase.Init()).To(Succeed()) + r.CaseBaseName = "restore-exec-hooks-" + r.UUIDgen + r.BackupName = "backup-" + r.CaseBaseName + r.RestoreName = "restore-" + r.CaseBaseName + r.podName = "pod-multiple-hooks" + r.NamespacesTotal = 1 + r.NSIncluded = &[]string{} + + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + *r.NSIncluded = append(*r.NSIncluded, createNSName) + } + + r.TestMsg = &test.TestMSG{ + Desc: "Restore pod with multiple restore exec hooks", + Text: "Should successfully backup and restore without hanging at Finalizing phase", + FailedMSG: "Failed to successfully backup and restore pod with multiple hooks", + } + + r.BackupArgs = []string{ + "create", "--namespace", r.VeleroCfg.VeleroNamespace, "backup", r.BackupName, + "--include-namespaces", strings.Join(*r.NSIncluded, ","), + "--default-volumes-to-fs-backup", "--wait", + } + + r.RestoreArgs = []string{ + "create", "--namespace", r.VeleroCfg.VeleroNamespace, "restore", r.RestoreName, + "--from-backup", r.BackupName, "--wait", + } + + return nil +} + +func (r *RestoreExecHooks) CreateResources() error { + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + + By(fmt.Sprintf("Creating namespace %s", createNSName), func() { + Expect(CreateNamespace(r.Ctx, r.Client, createNSName)). + To(Succeed(), fmt.Sprintf("Failed to create namespace %s", createNSName)) + }) + + // Prepare images and commands adaptively for the target OS + imageAddress := LinuxTestImage + initCommand := `["/bin/sh", "-c", "echo init-hook-done"]` + execCommand1 := `["/bin/sh", "-c", "echo hook1"]` + execCommand2 := `["/bin/sh", "-c", "echo hook2"]` + + if r.VeleroCfg.WorkerOS == common.WorkerOSLinux && r.VeleroCfg.ImageRegistryProxy != "" { + imageAddress = path.Join(r.VeleroCfg.ImageRegistryProxy, LinuxTestImage) + } else if r.VeleroCfg.WorkerOS == common.WorkerOSWindows { + imageAddress = WindowTestImage + initCommand = `["cmd", "/c", "echo init-hook-done"]` + execCommand1 = `["cmd", "/c", "echo hook1"]` + execCommand2 = `["cmd", "/c", "echo hook2"]` + } + + // Inject mixing InitContainer hook and multiple Exec post-restore hooks. + // This guarantees that the loop index 'i' mismatched 'hook.hookIndex' (Issue #9359), + // ensuring the bug is properly reproduced and the fix is verified. + ann := map[string]string{ + // Inject InitContainer Restore Hook + "init.hook.restore.velero.io/container-image": imageAddress, + "init.hook.restore.velero.io/container-name": "test-init-hook", + "init.hook.restore.velero.io/command": initCommand, + + // Inject multiple Exec Restore Hooks + "post.hook.restore.velero.io/test1.command": execCommand1, + "post.hook.restore.velero.io/test1.container": r.podName, + "post.hook.restore.velero.io/test2.command": execCommand2, + "post.hook.restore.velero.io/test2.container": r.podName, + } + + By(fmt.Sprintf("Creating pod %s with multiple restore hooks in namespace %s", r.podName, createNSName), func() { + _, err := CreatePod( + r.Client, + createNSName, + r.podName, + "", // No storage class needed + "", // No PVC needed + []string{}, // No volumes + nil, + ann, + r.VeleroCfg.ImageRegistryProxy, + r.VeleroCfg.WorkerOS, + ) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to create pod with hooks in namespace %s", createNSName)) + }) + + By(fmt.Sprintf("Waiting for pod %s to be ready", r.podName), func() { + err := WaitForPods(r.Ctx, r.Client, createNSName, []string{r.podName}) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to wait for pod %s in namespace %s", r.podName, createNSName)) + }) + } + return nil +} + +func (r *RestoreExecHooks) Verify() error { + for nsNum := 0; nsNum < r.NamespacesTotal; nsNum++ { + createNSName := fmt.Sprintf("%s-%00000d", r.CaseBaseName, nsNum) + + By(fmt.Sprintf("Verifying pod %s in namespace %s after restore", r.podName, createNSName), func() { + err := WaitForPods(r.Ctx, r.Client, createNSName, []string{r.podName}) + Expect(err).To(Succeed(), fmt.Sprintf("Failed to verify pod %s in namespace %s after restore", r.podName, createNSName)) + }) + } + return nil +} diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index b60fee361..eef56b756 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -440,6 +440,12 @@ var _ = Describe( StorageClasssChangingTest, ) +var _ = Describe( + "Restore phase does not block at Finalizing when a container has multiple exec hooks", + Label("Basic", "Hooks"), + RestoreExecHooksTest, +) + var _ = Describe( "Backup/restore of 2500 namespaces", Label("Scale", "LongTime"),