Merge branch 'main' into add-e2e-tests-for-issue7725

This commit is contained in:
lyndon-li
2026-03-16 15:22:59 +08:00
committed by GitHub
15 changed files with 327 additions and 114 deletions

View File

@@ -0,0 +1 @@
Add schedule_expected_interval_seconds metric for dynamic backup alerting thresholds (#9559)

View File

@@ -0,0 +1 @@
Fix DeleteBackupRequest (DBR) getting stuck when the CSI snapshot no longer exists in the cloud provider

View File

@@ -137,6 +137,10 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
return checkVSCReadiness(ctx, &snapCont, p.crClient)
},
); err != nil {
// Clean up the VSC we created since it can't become ready
if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name)
}
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
}
@@ -167,6 +171,13 @@ var checkVSCReadiness = func(
return true, nil
}
// Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound)
if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil {
return false, errors.Errorf(
"VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message,
)
}
return false, nil
}

View File

@@ -94,6 +94,19 @@ func TestVSCExecute(t *testing.T) {
return false, errors.Errorf("test error case")
},
},
{
name: "Error case with CSI error, dangling VSC should be cleaned up",
vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(),
expectErr: true,
function: func(
ctx context.Context,
vsc *snapshotv1api.VolumeSnapshotContent,
client crclient.Client,
) (bool, error) {
return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name)
},
},
}
for _, test := range tests {
@@ -190,6 +203,24 @@ func TestCheckVSCReadiness(t *testing.T) {
expectErr: false,
ready: false,
},
{
name: "VSC has error from CSI driver",
vsc: &snapshotv1api.VolumeSnapshotContent{
ObjectMeta: metav1.ObjectMeta{
Name: "vsc-1",
Namespace: "velero",
},
Status: &snapshotv1api.VolumeSnapshotContentStatus{
ReadyToUse: boolPtr(false),
Error: &snapshotv1api.VolumeSnapshotError{
Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."),
},
},
},
createVSC: true,
expectErr: true,
ready: false,
},
}
for _, test := range tests {
@@ -207,3 +238,11 @@ func TestCheckVSCReadiness(t *testing.T) {
})
}
}
// boolPtr returns a pointer to a copy of b, for filling *bool fields in test
// fixtures.
func boolPtr(b bool) *bool {
	value := b
	return &value
}
// stringPtr returns a pointer to a copy of s, for filling *string fields in
// test fixtures.
func stringPtr(s string) *string {
	value := s
	return &value
}

View File

@@ -129,6 +129,13 @@ func (c *scheduleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
} else {
schedule.Status.Phase = velerov1.SchedulePhaseEnabled
schedule.Status.ValidationErrors = nil
// Compute expected interval between consecutive scheduled backup runs.
// Only meaningful when the cron expression is valid.
now := c.clock.Now()
nextRun := cronSchedule.Next(now)
nextNextRun := cronSchedule.Next(nextRun)
c.metrics.SetScheduleExpectedIntervalSeconds(schedule.Name, nextNextRun.Sub(nextRun).Seconds())
}
scheduleNeedsPatch := false

View File

@@ -80,6 +80,9 @@ const (
DataDownloadFailureTotal = "data_download_failure_total"
DataDownloadCancelTotal = "data_download_cancel_total"
// schedule metrics
scheduleExpectedIntervalSeconds = "schedule_expected_interval_seconds"
// repo maintenance metrics
repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
@@ -347,6 +350,14 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel, backupNameLabel},
),
scheduleExpectedIntervalSeconds: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricNamespace,
Name: scheduleExpectedIntervalSeconds,
Help: "Expected interval between consecutive scheduled backups, in seconds",
},
[]string{scheduleLabel},
),
repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
@@ -644,6 +655,9 @@ func (m *ServerMetrics) RemoveSchedule(scheduleName string) {
if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok {
c.DeleteLabelValues(scheduleName, "")
}
if g, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec); ok {
g.DeleteLabelValues(scheduleName)
}
}
// InitMetricsForNode initializes counter metrics for a node.
@@ -758,6 +772,14 @@ func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string,
}
}
// SetScheduleExpectedIntervalSeconds stores, for the given schedule, the
// expected number of seconds between two consecutive backups.
//
// The call is a no-op when the metric is not registered as a *prometheus.GaugeVec.
func (m *ServerMetrics) SetScheduleExpectedIntervalSeconds(scheduleName string, seconds float64) {
	gauge, ok := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec)
	if !ok {
		return
	}
	gauge.WithLabelValues(scheduleName).Set(seconds)
}
// SetBackupTotal records the current number of existent backups.
func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) {
if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok {

View File

@@ -259,6 +259,90 @@ func TestMultipleAdhocBackupsShareMetrics(t *testing.T) {
assert.Equal(t, float64(1), validationFailureMetric, "All adhoc validation failures should be counted together")
}
// TestSetScheduleExpectedIntervalSeconds checks that a value written through
// SetScheduleExpectedIntervalSeconds can be read back from the gauge under the
// schedule's label.
func TestSetScheduleExpectedIntervalSeconds(t *testing.T) {
	type intervalCase struct {
		name            string
		scheduleName    string
		intervalSeconds float64
		description     string
	}

	cases := []intervalCase{
		{
			name:            "every 5 minutes schedule",
			scheduleName:    "frequent-backup",
			intervalSeconds: 300,
			description:     "Expected interval should be 5m in seconds",
		},
		{
			name:            "daily schedule",
			scheduleName:    "daily-backup",
			intervalSeconds: 86400,
			description:     "Expected interval should be 24h in seconds",
		},
		{
			name:            "monthly schedule",
			scheduleName:    "monthly-backup",
			intervalSeconds: 2678400, // 31 days in seconds
			description:     "Expected interval should be 31 days in seconds",
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			// A fresh ServerMetrics per subtest keeps the cases independent.
			serverMetrics := NewServerMetrics()
			serverMetrics.SetScheduleExpectedIntervalSeconds(tt.scheduleName, tt.intervalSeconds)

			gauge := serverMetrics.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec)
			got := getMetricValue(t, gauge, tt.scheduleName)
			assert.Equal(t, tt.intervalSeconds, got, tt.description)
		})
	}
}
// TestScheduleExpectedIntervalNotInitializedByDefault verifies that the expected
// interval metric is not initialized by InitSchedule, so it only appears for
// schedules with a valid cron expression.
func TestScheduleExpectedIntervalNotInitializedByDefault(t *testing.T) {
	m := NewServerMetrics()
	m.InitSchedule("test-schedule")

	gauge := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec)

	// Collect sends every series on the channel and blocks until each send is
	// received. Draining concurrently (instead of relying on a fixed-size
	// buffer) means a regression that emits several series fails the assertion
	// below rather than hanging the test.
	ch := make(chan prometheus.Metric)
	go func() {
		defer close(ch)
		gauge.Collect(ch)
	}()

	// The metric should not have any values after InitSchedule.
	count := 0
	for range ch {
		count++
	}
	assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should not be initialized by InitSchedule")
}
// TestRemoveScheduleCleansUpExpectedInterval verifies that RemoveSchedule
// cleans up the expected interval metric.
func TestRemoveScheduleCleansUpExpectedInterval(t *testing.T) {
	m := NewServerMetrics()
	m.InitSchedule("test-schedule")
	m.SetScheduleExpectedIntervalSeconds("test-schedule", 3600)

	gauge := m.metrics[scheduleExpectedIntervalSeconds].(*prometheus.GaugeVec)

	// Verify metric exists
	metric := getMetricValue(t, gauge, "test-schedule")
	assert.Equal(t, float64(3600), metric)

	// Remove schedule and verify metric is cleaned up
	m.RemoveSchedule("test-schedule")

	// Collect sends every series on the channel and blocks until each send is
	// received. Draining concurrently (instead of relying on a fixed-size
	// buffer) means leftover series fail the assertion below rather than
	// risking a hung test.
	ch := make(chan prometheus.Metric)
	go func() {
		defer close(ch)
		gauge.Collect(ch)
	}()

	count := 0
	for range ch {
		count++
	}
	assert.Equal(t, 0, count, "scheduleExpectedIntervalSeconds should be removed after RemoveSchedule")
}
// TestInitScheduleWithEmptyName verifies that InitSchedule works correctly
// with an empty schedule name (for adhoc backups).
func TestInitScheduleWithEmptyName(t *testing.T) {

View File

@@ -156,7 +156,7 @@ func TestGetVolumesByPod(t *testing.T) {
Volumes: []corev1api.Volume{
// PVB Volumes
{Name: "pvbPV1"}, {Name: "pvbPV2"}, {Name: "pvbPV3"},
/// Excluded from PVB because colume mounting default service account token
/// Excluded from PVB because volume mounting default service account token
{Name: "default-token-5xq45"},
},
},

View File

@@ -1,90 +0,0 @@
// Crawler configuration that indexes the velero.io website into the
// "velero_new" index using the DocSearch record extractor.
new Crawler({
rateLimit: 8,
maxDepth: 10,
// Crawl entry points; the sitemap below supplies additional URLs.
startUrls: ["https://velero.io/docs", "https://velero.io/"],
renderJavaScript: false,
sitemaps: ["https://velero.io/sitemap.xml"],
ignoreCanonicalTo: false,
discoveryPatterns: ["https://velero.io/**"],
schedule: "at 6:39 PM on Friday",
actions: [
{
// Only URLs matching the docs path pattern are turned into records.
indexName: "velero_new",
pathsToMatch: ["https://velero.io/docs**/**"],
recordExtractor: ({ helpers }) => {
return helpers.docsearch({
// Selector lists for each hierarchy level and for the record content.
recordProps: {
lvl1: ["header h1", "article h1", "main h1", "h1", "head > title"],
content: ["article p, article li", "main p, main li", "p, li"],
lvl0: {
defaultValue: "Documentation",
},
lvl2: ["article h2", "main h2", "h2"],
lvl3: ["article h3", "main h3", "h3"],
lvl4: ["article h4", "main h4", "h4"],
lvl5: ["article h5", "main h5", "h5"],
lvl6: ["article h6", "main h6", "h6"],
// The version attribute is taken from the element matching this selector.
version: "#dropdownMenuButton",
},
aggregateContent: true,
recordVersion: "v3",
});
},
},
],
// Settings for the velero_new index.
// NOTE(review): presumably applied only when the index is first created — confirm.
initialIndexSettings: {
velero_new: {
// "version" is listed here so searches can be filtered by documentation version.
attributesForFaceting: ["type", "lang", "version"],
attributesToRetrieve: [
"hierarchy",
"content",
"anchor",
"url",
"url_without_anchor",
"type",
"version",
],
attributesToHighlight: ["hierarchy", "content"],
attributesToSnippet: ["content:10"],
camelCaseAttributes: ["hierarchy", "content"],
searchableAttributes: [
"unordered(hierarchy.lvl0)",
"unordered(hierarchy.lvl1)",
"unordered(hierarchy.lvl2)",
"unordered(hierarchy.lvl3)",
"unordered(hierarchy.lvl4)",
"unordered(hierarchy.lvl5)",
"unordered(hierarchy.lvl6)",
"content",
],
distinct: true,
attributeForDistinct: "url",
customRanking: [
"desc(weight.pageRank)",
"desc(weight.level)",
"asc(weight.position)",
],
ranking: [
"words",
"filters",
"typo",
"attribute",
"proximity",
"exact",
"custom",
],
highlightPreTag: '<span class="algolia-docsearch-suggestion--highlight">',
highlightPostTag: "</span>",
minWordSizefor1Typo: 3,
minWordSizefor2Typos: 7,
allowTyposOnNumericTokens: false,
minProximity: 1,
ignorePlurals: true,
advancedSyntax: true,
attributeCriteriaComputedByMinProximity: true,
removeWordsIfNoResults: "allOptional",
},
},
// NOTE(review): the application ID and API key are committed in plain text —
// confirm the key's permissions are suitably restricted.
appId: "9ASKQJ1HR3",
apiKey: "6392a5916af73b73df2406d3aef5ca45",
});

View File

@@ -12,7 +12,7 @@ params:
hero:
backgroundColor: med-blue
versioning: true
latest: v1.17
latest: v1.18
versions:
- main
- v1.18

View File

@@ -63,6 +63,10 @@ spec:
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minutes.
csiSnapshotTimeout: 10m
# itemOperationTimeout specifies the time used to wait for
# asynchronous BackupItemAction operations to complete.
# The default value is 4 hours.
itemOperationTimeout: 4h
# resourcePolicy specifies the referenced resource policies that backup should follow
# optional
resourcePolicy:

View File

@@ -27,16 +27,6 @@
<div class="col-md-3 toc">
{{ .Render "versions" }}
<br/>
<div id="docsearch">
<!-- <form class="d-flex align-items-center">
<span class="algolia-autocomplete" style="position: relative; display: inline-block; direction: ltr;">
<input type="search" class="form-control docsearch" id="search-input" placeholder="Search..."
aria-label="Search for..." autocomplete="off" spellcheck="false" role="combobox"
aria-autocomplete="list" aria-expanded="false" aria-owns="algolia-autocomplete-listbox-0"
dir="auto" style="position: relative; vertical-align: top;">
</span>
</form> -->
</div>
{{ .Render "nav" }}
</div>
<div class="col-md-8">
@@ -58,16 +48,6 @@
{{ .Render "footer" }}
</div>
</div>
<script src="https://cdn.jsdelivr.net/npm/@docsearch/js@3"></script>
<script type="text/javascript"> docsearch({
appId: '9ASKQJ1HR3',
apiKey: '170ba79bfa16cebfdf10726ae4771d7e',
indexName: 'velero_new',
container: '#docsearch',
searchParameters: {
facetFilters: ["version:{{ .CurrentSection.Params.version }}"]},
});
</script>
</body>
</html>

View File

@@ -8,6 +8,4 @@
{{ $styles := resources.Get "styles.scss" | toCSS $options | resources.Fingerprint }}
<link rel="stylesheet" href="{{ $styles.RelPermalink }}" integrity="{{ $styles.Data.Integrity }}">
{{/* TODO {% seo %}*/}}
<link rel="preconnect" href="https://9ASKQJ1HR3-dsn.algolia.net" crossorigin />
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@docsearch/css@3" />
</head>

View File

@@ -0,0 +1,150 @@
/*
Copyright the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package basic
import (
"fmt"
"path"
"strings"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/vmware-tanzu/velero/test/e2e/test"
. "github.com/vmware-tanzu/velero/test/e2e/test"
"github.com/vmware-tanzu/velero/test/util/common"
. "github.com/vmware-tanzu/velero/test/util/k8s"
)
// RestoreExecHooks tests that a pod with multiple restore exec hooks does not hang
// at the Finalizing phase during restore (Issue #9359 / PR #9366).
type RestoreExecHooks struct {
// TestCase is embedded; Init calls r.TestCase.Init() before filling in the
// case-specific names and CLI arguments.
TestCase
// podName is the name of the pod created by CreateResources. It is also used
// as the value of the ".container" exec hook annotations.
podName string
}
// RestoreExecHooksTest is the test entry point built by test.TestFunc from a
// zero-value RestoreExecHooks.
var RestoreExecHooksTest func() = test.TestFunc(&RestoreExecHooks{})
// Init runs the embedded TestCase initialisation and then sets up everything
// specific to this case: the backup/restore names derived from the generated
// UUID, the pod name, the list of namespaces to include, the test messages and
// the argument lists for the backup and restore commands.
//
// It always returns nil; a failure of the embedded Init is reported through the
// Expect assertion.
func (r *RestoreExecHooks) Init() error {
	Expect(r.TestCase.Init()).To(Succeed())

	r.CaseBaseName = "restore-exec-hooks-" + r.UUIDgen
	r.BackupName = "backup-" + r.CaseBaseName
	r.RestoreName = "restore-" + r.CaseBaseName
	r.podName = "pod-multiple-hooks"
	r.NamespacesTotal = 1

	// Assemble the namespace names locally, then publish the finished list.
	namespaces := []string{}
	for idx := 0; idx < r.NamespacesTotal; idx++ {
		namespaces = append(namespaces, fmt.Sprintf("%s-%00000d", r.CaseBaseName, idx))
	}
	r.NSIncluded = &namespaces

	r.TestMsg = &test.TestMSG{
		Desc:      "Restore pod with multiple restore exec hooks",
		Text:      "Should successfully backup and restore without hanging at Finalizing phase",
		FailedMSG: "Failed to successfully backup and restore pod with multiple hooks",
	}

	veleroNamespace := r.VeleroCfg.VeleroNamespace
	includedNamespaces := strings.Join(namespaces, ",")

	r.BackupArgs = []string{
		"create", "--namespace", veleroNamespace, "backup", r.BackupName,
		"--include-namespaces", includedNamespaces,
		"--default-volumes-to-fs-backup", "--wait",
	}
	r.RestoreArgs = []string{
		"create", "--namespace", veleroNamespace, "restore", r.RestoreName,
		"--from-backup", r.BackupName, "--wait",
	}

	return nil
}
// CreateResources creates, for every test namespace, the namespace itself and a
// pod annotated with one init-container restore hook plus two exec post-restore
// hooks, then waits for that pod.
//
// Failures are reported through Expect assertions; the method itself always
// returns nil.
func (r *RestoreExecHooks) CreateResources() error {
	for idx := 0; idx < r.NamespacesTotal; idx++ {
		namespace := fmt.Sprintf("%s-%00000d", r.CaseBaseName, idx)

		By(fmt.Sprintf("Creating namespace %s", namespace), func() {
			Expect(CreateNamespace(r.Ctx, r.Client, namespace)).
				To(Succeed(), fmt.Sprintf("Failed to create namespace %s", namespace))
		})

		// Start from the Linux defaults, then adapt the image and hook
		// commands to the target worker OS.
		hookImage := LinuxTestImage
		initHookCmd := `["/bin/sh", "-c", "echo init-hook-done"]`
		firstExecCmd := `["/bin/sh", "-c", "echo hook1"]`
		secondExecCmd := `["/bin/sh", "-c", "echo hook2"]`

		switch {
		case r.VeleroCfg.WorkerOS == common.WorkerOSLinux && r.VeleroCfg.ImageRegistryProxy != "":
			// Pull the Linux image through the configured registry proxy.
			hookImage = path.Join(r.VeleroCfg.ImageRegistryProxy, LinuxTestImage)
		case r.VeleroCfg.WorkerOS == common.WorkerOSWindows:
			hookImage = WindowTestImage
			initHookCmd = `["cmd", "/c", "echo init-hook-done"]`
			firstExecCmd = `["cmd", "/c", "echo hook1"]`
			secondExecCmd = `["cmd", "/c", "echo hook2"]`
		}

		// Mix an InitContainer hook with multiple Exec post-restore hooks.
		// This guarantees that the loop index 'i' does not match
		// 'hook.hookIndex' (Issue #9359), so the bug is properly reproduced
		// and the fix is verified.
		hookAnnotations := map[string]string{
			// InitContainer restore hook
			"init.hook.restore.velero.io/container-image": hookImage,
			"init.hook.restore.velero.io/container-name":  "test-init-hook",
			"init.hook.restore.velero.io/command":         initHookCmd,
			// Multiple exec restore hooks
			"post.hook.restore.velero.io/test1.command":   firstExecCmd,
			"post.hook.restore.velero.io/test1.container": r.podName,
			"post.hook.restore.velero.io/test2.command":   secondExecCmd,
			"post.hook.restore.velero.io/test2.container": r.podName,
		}

		By(fmt.Sprintf("Creating pod %s with multiple restore hooks in namespace %s", r.podName, namespace), func() {
			_, createErr := CreatePod(
				r.Client,
				namespace,
				r.podName,
				"",         // No storage class needed
				"",         // No PVC needed
				[]string{}, // No volumes
				nil,
				hookAnnotations,
				r.VeleroCfg.ImageRegistryProxy,
				r.VeleroCfg.WorkerOS,
			)
			Expect(createErr).To(Succeed(), fmt.Sprintf("Failed to create pod with hooks in namespace %s", namespace))
		})

		By(fmt.Sprintf("Waiting for pod %s to be ready", r.podName), func() {
			waitErr := WaitForPods(r.Ctx, r.Client, namespace, []string{r.podName})
			Expect(waitErr).To(Succeed(), fmt.Sprintf("Failed to wait for pod %s in namespace %s", r.podName, namespace))
		})
	}

	return nil
}
// Verify waits, in every test namespace, for the hooked pod after the restore.
//
// Failures are reported through Expect assertions; the method itself always
// returns nil.
func (r *RestoreExecHooks) Verify() error {
	for idx := 0; idx < r.NamespacesTotal; idx++ {
		namespace := fmt.Sprintf("%s-%00000d", r.CaseBaseName, idx)
		step := fmt.Sprintf("Verifying pod %s in namespace %s after restore", r.podName, namespace)

		By(step, func() {
			waitErr := WaitForPods(r.Ctx, r.Client, namespace, []string{r.podName})
			Expect(waitErr).To(Succeed(), fmt.Sprintf("Failed to verify pod %s in namespace %s after restore", r.podName, namespace))
		})
	}

	return nil
}

View File

@@ -440,6 +440,12 @@ var _ = Describe(
StorageClasssChangingTest,
)
// Registers RestoreExecHooksTest with the suite under the "Basic" and "Hooks" labels.
var _ = Describe(
"Restore phase does not block at Finalizing when a container has multiple exec hooks",
Label("Basic", "Hooks"),
RestoreExecHooksTest,
)
var _ = Describe(
"Backup/restore of 2500 namespaces",
Label("Scale", "LongTime"),