/*
Copyright the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestBackupMetricsWithAdhocBackups verifies that metrics are properly recorded
// for both scheduled and adhoc (non-scheduled) backups.
func TestBackupMetricsWithAdhocBackups(t *testing.T) {
	tests := []struct {
		name          string
		scheduleName  string
		expectedLabel string
		description   string
	}{
		{
			name:          "scheduled backup metrics",
			scheduleName:  "daily-backup",
			expectedLabel: "daily-backup",
			description:   "Metrics should be recorded with the schedule name label",
		},
		{
			name:          "adhoc backup metrics with empty schedule",
			scheduleName:  "",
			expectedLabel: "",
			description:   "Metrics should be recorded with empty schedule label for adhoc backups",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			// Create a new metrics instance
			m := NewServerMetrics()

			// Test backup attempt metric
			t.Run("RegisterBackupAttempt", func(t *testing.T) {
				m.RegisterBackupAttempt(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupAttemptTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup success metric
			t.Run("RegisterBackupSuccess", func(t *testing.T) {
				m.RegisterBackupSuccess(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupSuccessTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup failure metric
			t.Run("RegisterBackupFailed", func(t *testing.T) {
				m.RegisterBackupFailed(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup partial failure metric
			t.Run("RegisterBackupPartialFailure", func(t *testing.T) {
				m.RegisterBackupPartialFailure(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup validation failure metric
			t.Run("RegisterBackupValidationFailure", func(t *testing.T) {
				m.RegisterBackupValidationFailure(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup warning metric
			t.Run("RegisterBackupWarning", func(t *testing.T) {
				m.RegisterBackupWarning(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[backupWarningTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test backup items total gauge
			t.Run("RegisterBackupItemsTotalGauge", func(t *testing.T) {
				m.RegisterBackupItemsTotalGauge(tc.scheduleName, 100)

				metric := getMetricValue(t, m.metrics[backupItemsTotalGauge].(*prometheus.GaugeVec), tc.expectedLabel)
				assert.Equal(t, float64(100), metric, tc.description)
			})

			// Test backup items errors gauge
			t.Run("RegisterBackupItemsErrorsGauge", func(t *testing.T) {
				m.RegisterBackupItemsErrorsGauge(tc.scheduleName, 5)

				metric := getMetricValue(t, m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec), tc.expectedLabel)
				assert.Equal(t, float64(5), metric, tc.description)
			})

			// Test backup duration metric
			t.Run("RegisterBackupDuration", func(t *testing.T) {
				m.RegisterBackupDuration(tc.scheduleName, 120.5)

				// For histogram, we check the count
				metric := getHistogramCount(t, m.metrics[backupDurationSeconds].(*prometheus.HistogramVec), tc.expectedLabel)
				assert.Equal(t, uint64(1), metric, tc.description)
			})

			// Test backup last status metric
			t.Run("RegisterBackupLastStatus", func(t *testing.T) {
				m.RegisterBackupLastStatus(tc.scheduleName, BackupLastStatusSucc)

				metric := getMetricValue(t, m.metrics[backupLastStatus].(*prometheus.GaugeVec), tc.expectedLabel)
				assert.Equal(t, float64(BackupLastStatusSucc), metric, tc.description)
			})

			// Test backup tarball size metric
			t.Run("SetBackupTarballSizeBytesGauge", func(t *testing.T) {
				m.SetBackupTarballSizeBytesGauge(tc.scheduleName, 1024*1024)

				metric := getMetricValue(t, m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec), tc.expectedLabel)
				assert.Equal(t, float64(1024*1024), metric, tc.description)
			})

			// Test backup last successful timestamp
			t.Run("SetBackupLastSuccessfulTimestamp", func(t *testing.T) {
				testTime := time.Now()
				m.SetBackupLastSuccessfulTimestamp(tc.scheduleName, testTime)

				metric := getMetricValue(t, m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec), tc.expectedLabel)
				assert.Equal(t, float64(testTime.Unix()), metric, tc.description)
			})
		})
	}
}
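
// For orientation only: once these vectors are registered with a Prometheus
// registry, the two cases above surface as separate label sets on the same
// metric family. An illustrative exposition-format scrape (the metric name
// follows Velero's convention and is not asserted by this test) would contain:
//
//	velero_backup_attempt_total{schedule="daily-backup"} 1
//	velero_backup_attempt_total{schedule=""} 1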

// TestRestoreMetricsWithAdhocBackups verifies that restore metrics are properly recorded
// for restores from both scheduled and adhoc backups.
func TestRestoreMetricsWithAdhocBackups(t *testing.T) {
	tests := []struct {
		name          string
		scheduleName  string
		expectedLabel string
		description   string
	}{
		{
			name:          "restore from scheduled backup",
			scheduleName:  "daily-backup",
			expectedLabel: "daily-backup",
			description:   "Restore metrics should use the backup's schedule name",
		},
		{
			name:          "restore from adhoc backup",
			scheduleName:  "",
			expectedLabel: "",
			description:   "Restore metrics should have empty schedule label for adhoc backup restores",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			m := NewServerMetrics()

			// Test restore attempt metric
			t.Run("RegisterRestoreAttempt", func(t *testing.T) {
				m.RegisterRestoreAttempt(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[restoreAttemptTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test restore success metric
			t.Run("RegisterRestoreSuccess", func(t *testing.T) {
				m.RegisterRestoreSuccess(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[restoreSuccessTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test restore failed metric
			t.Run("RegisterRestoreFailed", func(t *testing.T) {
				m.RegisterRestoreFailed(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[restoreFailedTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test restore partial failure metric
			t.Run("RegisterRestorePartialFailure", func(t *testing.T) {
				m.RegisterRestorePartialFailure(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test restore validation failed metric
			t.Run("RegisterRestoreValidationFailed", func(t *testing.T) {
				m.RegisterRestoreValidationFailed(tc.scheduleName)

				metric := getMetricValue(t, m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec), tc.expectedLabel)
				assert.Equal(t, float64(1), metric, tc.description)
			})
		})
	}
}

// TestMultipleAdhocBackupsShareMetrics verifies that multiple adhoc backups
// accumulate metrics under the same empty schedule label.
func TestMultipleAdhocBackupsShareMetrics(t *testing.T) {
	m := NewServerMetrics()

	// Simulate multiple adhoc backup attempts
	for i := 0; i < 5; i++ {
		m.RegisterBackupAttempt("")
	}

	// Simulate some successes and failures
	m.RegisterBackupSuccess("")
	m.RegisterBackupSuccess("")
	m.RegisterBackupFailed("")
	m.RegisterBackupPartialFailure("")
	m.RegisterBackupValidationFailure("")

	// Verify accumulated metrics
	attemptMetric := getMetricValue(t, m.metrics[backupAttemptTotal].(*prometheus.CounterVec), "")
	assert.Equal(t, float64(5), attemptMetric, "All adhoc backup attempts should be counted together")

	successMetric := getMetricValue(t, m.metrics[backupSuccessTotal].(*prometheus.CounterVec), "")
	assert.Equal(t, float64(2), successMetric, "All adhoc backup successes should be counted together")

	failureMetric := getMetricValue(t, m.metrics[backupFailureTotal].(*prometheus.CounterVec), "")
	assert.Equal(t, float64(1), failureMetric, "All adhoc backup failures should be counted together")

	partialFailureMetric := getMetricValue(t, m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec), "")
	assert.Equal(t, float64(1), partialFailureMetric, "All adhoc partial failures should be counted together")

	validationFailureMetric := getMetricValue(t, m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec), "")
	assert.Equal(t, float64(1), validationFailureMetric, "All adhoc validation failures should be counted together")
}

// TestInitScheduleWithEmptyName verifies that InitSchedule works correctly
// with an empty schedule name (for adhoc backups).
func TestInitScheduleWithEmptyName(t *testing.T) {
	m := NewServerMetrics()

	// Initialize metrics for empty schedule (adhoc backups)
	m.InitSchedule("")

	// Verify all metrics are initialized with 0
	metrics := []string{
		backupAttemptTotal,
		backupSuccessTotal,
		backupPartialFailureTotal,
		backupFailureTotal,
		backupValidationFailureTotal,
		backupDeletionAttemptTotal,
		backupDeletionSuccessTotal,
		backupDeletionFailureTotal,
		backupItemsTotalGauge,
		backupItemsErrorsGauge,
		backupWarningTotal,
		restoreAttemptTotal,
		restorePartialFailureTotal,
		restoreFailedTotal,
		restoreSuccessTotal,
		restoreValidationFailedTotal,
		volumeSnapshotSuccessTotal,
		volumeSnapshotAttemptTotal,
		volumeSnapshotFailureTotal,
	}

	for _, metricName := range metrics {
		t.Run(metricName, func(t *testing.T) {
			var value float64
			switch vec := m.metrics[metricName].(type) {
			case *prometheus.CounterVec:
				value = getMetricValue(t, vec, "")
			case *prometheus.GaugeVec:
				value = getMetricValue(t, vec, "")
			}
			assert.Equal(t, float64(0), value, "Metric %s should be initialized to 0 for empty schedule", metricName)
		})
	}

	// Special case: backupLastStatus should be initialized to 1 (success)
	lastStatusValue := getMetricValue(t, m.metrics[backupLastStatus].(*prometheus.GaugeVec), "")
	assert.Equal(t, float64(1), lastStatusValue, "backupLastStatus should be initialized to 1 for empty schedule")
}
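
// Pre-initialising the empty-schedule series matters because a labelled child
// of a vector is only exported once it has been touched; without
// InitSchedule("") an adhoc backup would produce no series at all until its
// first increment. After the call, a scrape already reports zero-valued series
// such as (illustrative):
//
//	velero_backup_attempt_total{schedule=""} 0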

// Helper function to get metric value from a CounterVec or GaugeVec
func getMetricValue(t *testing.T, vec prometheus.Collector, scheduleLabel string) float64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected schedule label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "schedule" && *label.Value == scheduleLabel {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel {
			if dto.Counter != nil {
				return *dto.Counter.Value
			}
			if dto.Gauge != nil {
				return *dto.Gauge.Value
			}
		}
	}

	t.Fatalf("Metric with schedule label '%s' not found", scheduleLabel)
	return 0
}
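
// A shorter lookup exists in client_golang's testutil package. A minimal
// sketch (not used here; it would add a testutil import, and it panics if the
// collector does not yield exactly one metric):
//
//	value := testutil.ToFloat64(
//		m.metrics[backupAttemptTotal].(*prometheus.CounterVec).WithLabelValues(""),
//	)
//
// The hand-rolled helper above is kept because WithLabelValues silently creates
// a zero-valued child for an unknown label, whereas getMetricValue fails the
// test when the expected schedule label was never registered.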

// Helper function to get histogram count
func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel string) uint64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected schedule label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "schedule" && *label.Value == scheduleLabel {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel && dto.Histogram != nil {
			return *dto.Histogram.SampleCount
		}
	}

	t.Fatalf("Histogram with schedule label '%s' not found", scheduleLabel)
	return 0
}
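
// Should a test ever need the observed values rather than the observation
// count, the same dto.Metric carries them as Histogram.SampleSum (plus
// per-bucket cumulative counts in Histogram.Bucket). For example, the 120.5s
// duration recorded earlier could be checked with
// assert.InDelta(t, 120.5, *dto.Histogram.SampleSum, 0.001) inside the loop
// above; only the sample count is needed for the current tests.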

// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
	tests := []struct {
		name           string
		repositoryName string
		description    string
	}{
		{
			name:           "maintenance job metrics for repository",
			repositoryName: "default-restic-abcd",
			description:    "Metrics should be recorded with the repository name label",
		},
		{
			name:           "maintenance job metrics for different repository",
			repositoryName: "velero-backup-repo-xyz",
			description:    "Metrics should be recorded with different repository name",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			m := NewServerMetrics()

			// Test repo maintenance success metric
			t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
				m.RegisterRepoMaintenanceSuccess(tc.repositoryName)

				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test repo maintenance failure metric
			t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
				m.RegisterRepoMaintenanceFailure(tc.repositoryName)

				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test repo maintenance duration metric
			t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
				m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)

				// For histogram, we check the count
				metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
				assert.Equal(t, uint64(1), metric, tc.description)
			})
		})
	}
}
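
// For context, the vectors asserted on above are ordinary client_golang
// vectors partitioned by a repository_name label. A sketch of that shape
// (field values are illustrative; the real names and help text live in the
// metrics package, not in this test file):
//
//	prometheus.NewCounterVec(
//		prometheus.CounterOpts{
//			Namespace: "velero",
//			Name:      "repo_maintenance_success_total",
//			Help:      "Total number of successful repo maintenance runs",
//		},
//		[]string{"repository_name"},
//	)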

// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
	m := NewServerMetrics()
	repoName := "default-restic-test"

	// Simulate multiple repo maintenance job executions
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceFailure(repoName)
	m.RegisterRepoMaintenanceFailure(repoName)

	// Record multiple durations
	m.ObserveRepoMaintenanceDuration(repoName, 120.5)
	m.ObserveRepoMaintenanceDuration(repoName, 180.3)
	m.ObserveRepoMaintenanceDuration(repoName, 90.7)

	// Verify accumulated metrics
	successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
	assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")

	failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
	assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")

	durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
	assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}

// Helper function to get metric value from a CounterVec with repository_name label
func getMaintenanceMetricValue(t *testing.T, vec prometheus.Collector, repositoryName string) float64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected repository_name label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repositoryName {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel {
			if dto.Counter != nil {
				return *dto.Counter.Value
			}
			if dto.Gauge != nil {
				return *dto.Gauge.Value
			}
		}
	}

	t.Fatalf("Metric with repository_name label '%s' not found", repositoryName)
	return 0
}

// Helper function to get histogram count with repository_name label
func getMaintenanceHistogramCount(t *testing.T, vec *prometheus.HistogramVec, repositoryName string) uint64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected repository_name label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repositoryName {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel && dto.Histogram != nil {
			return *dto.Histogram.SampleCount
		}
	}

	t.Fatalf("Histogram with repository_name label '%s' not found", repositoryName)
	return 0
}