velero/pkg/metrics/metrics_test.go
Shubham Pampattiwar 27ca08b5a5 Address review comments: rename metrics to repo_maintenance_*
- Rename metric constants from maintenance_job_* to repo_maintenance_*
- Update metric help text to clarify these are for repo maintenance
- Rename functions: RegisterMaintenanceJob* → RegisterRepoMaintenance*
- Update all test references to use new names

Addresses review comments from @Lyndon-Li on PR #9414

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
2025-12-02 11:36:15 -08:00
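
For reference, a minimal caller-side sketch of the renamed repo maintenance metric API exercised by the tests below (the import path and the wiring are assumptions; the real calls live in Velero's repository maintenance code, not in this test file):

package main

import "github.com/vmware-tanzu/velero/pkg/metrics"

func main() {
	m := metrics.NewServerMetrics()

	// Previously RegisterMaintenanceJob* / maintenance_job_*; renamed by this commit.
	m.RegisterRepoMaintenanceSuccess("default-restic-abcd")        // count one successful maintenance run
	m.RegisterRepoMaintenanceFailure("default-restic-abcd")        // count one failed maintenance run
	m.ObserveRepoMaintenanceDuration("default-restic-abcd", 300.5) // record the run's duration in seconds
}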

/*
Copyright the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestBackupMetricsWithAdhocBackups verifies that metrics are properly recorded
// for both scheduled and adhoc (non-scheduled) backups.
func TestBackupMetricsWithAdhocBackups(t *testing.T) {
tests := []struct {
name string
scheduleName string
expectedLabel string
description string
}{
{
name: "scheduled backup metrics",
scheduleName: "daily-backup",
expectedLabel: "daily-backup",
description: "Metrics should be recorded with the schedule name label",
},
{
name: "adhoc backup metrics with empty schedule",
scheduleName: "",
expectedLabel: "",
description: "Metrics should be recorded with empty schedule label for adhoc backups",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
// Create a new metrics instance
m := NewServerMetrics()
// Test backup attempt metric
t.Run("RegisterBackupAttempt", func(t *testing.T) {
m.RegisterBackupAttempt(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupAttemptTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup success metric
t.Run("RegisterBackupSuccess", func(t *testing.T) {
m.RegisterBackupSuccess(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupSuccessTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup failure metric
t.Run("RegisterBackupFailed", func(t *testing.T) {
m.RegisterBackupFailed(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup partial failure metric
t.Run("RegisterBackupPartialFailure", func(t *testing.T) {
m.RegisterBackupPartialFailure(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup validation failure metric
t.Run("RegisterBackupValidationFailure", func(t *testing.T) {
m.RegisterBackupValidationFailure(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup warning metric
t.Run("RegisterBackupWarning", func(t *testing.T) {
m.RegisterBackupWarning(tc.scheduleName)
metric := getMetricValue(t, m.metrics[backupWarningTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test backup items total gauge
t.Run("RegisterBackupItemsTotalGauge", func(t *testing.T) {
m.RegisterBackupItemsTotalGauge(tc.scheduleName, 100)
metric := getMetricValue(t, m.metrics[backupItemsTotalGauge].(*prometheus.GaugeVec), tc.expectedLabel)
assert.Equal(t, float64(100), metric, tc.description)
})
// Test backup items errors gauge
t.Run("RegisterBackupItemsErrorsGauge", func(t *testing.T) {
m.RegisterBackupItemsErrorsGauge(tc.scheduleName, 5)
metric := getMetricValue(t, m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec), tc.expectedLabel)
assert.Equal(t, float64(5), metric, tc.description)
})
// Test backup duration metric
t.Run("RegisterBackupDuration", func(t *testing.T) {
m.RegisterBackupDuration(tc.scheduleName, 120.5)
// For histograms, assert on the sample count rather than a specific value
metric := getHistogramCount(t, m.metrics[backupDurationSeconds].(*prometheus.HistogramVec), tc.expectedLabel)
assert.Equal(t, uint64(1), metric, tc.description)
})
// Test backup last status metric
t.Run("RegisterBackupLastStatus", func(t *testing.T) {
m.RegisterBackupLastStatus(tc.scheduleName, BackupLastStatusSucc)
metric := getMetricValue(t, m.metrics[backupLastStatus].(*prometheus.GaugeVec), tc.expectedLabel)
assert.Equal(t, float64(BackupLastStatusSucc), metric, tc.description)
})
// Test backup tarball size metric
t.Run("SetBackupTarballSizeBytesGauge", func(t *testing.T) {
m.SetBackupTarballSizeBytesGauge(tc.scheduleName, 1024*1024)
metric := getMetricValue(t, m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec), tc.expectedLabel)
assert.Equal(t, float64(1024*1024), metric, tc.description)
})
// Test backup last successful timestamp
t.Run("SetBackupLastSuccessfulTimestamp", func(t *testing.T) {
testTime := time.Now()
m.SetBackupLastSuccessfulTimestamp(tc.scheduleName, testTime)
metric := getMetricValue(t, m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec), tc.expectedLabel)
assert.Equal(t, float64(testTime.Unix()), metric, tc.description)
})
})
}
}

// TestRestoreMetricsWithAdhocBackups verifies that restore metrics are properly recorded
// for restores from both scheduled and adhoc backups.
func TestRestoreMetricsWithAdhocBackups(t *testing.T) {
tests := []struct {
name string
scheduleName string
expectedLabel string
description string
}{
{
name: "restore from scheduled backup",
scheduleName: "daily-backup",
expectedLabel: "daily-backup",
description: "Restore metrics should use the backup's schedule name",
},
{
name: "restore from adhoc backup",
scheduleName: "",
expectedLabel: "",
description: "Restore metrics should have empty schedule label for adhoc backup restores",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics()
// Test restore attempt metric
t.Run("RegisterRestoreAttempt", func(t *testing.T) {
m.RegisterRestoreAttempt(tc.scheduleName)
metric := getMetricValue(t, m.metrics[restoreAttemptTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test restore success metric
t.Run("RegisterRestoreSuccess", func(t *testing.T) {
m.RegisterRestoreSuccess(tc.scheduleName)
metric := getMetricValue(t, m.metrics[restoreSuccessTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test restore failed metric
t.Run("RegisterRestoreFailed", func(t *testing.T) {
m.RegisterRestoreFailed(tc.scheduleName)
metric := getMetricValue(t, m.metrics[restoreFailedTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test restore partial failure metric
t.Run("RegisterRestorePartialFailure", func(t *testing.T) {
m.RegisterRestorePartialFailure(tc.scheduleName)
metric := getMetricValue(t, m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test restore validation failed metric
t.Run("RegisterRestoreValidationFailed", func(t *testing.T) {
m.RegisterRestoreValidationFailed(tc.scheduleName)
metric := getMetricValue(t, m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec), tc.expectedLabel)
assert.Equal(t, float64(1), metric, tc.description)
})
})
}
}

// TestMultipleAdhocBackupsShareMetrics verifies that multiple adhoc backups
// accumulate metrics under the same empty schedule label.
func TestMultipleAdhocBackupsShareMetrics(t *testing.T) {
m := NewServerMetrics()
// Simulate multiple adhoc backup attempts
for i := 0; i < 5; i++ {
m.RegisterBackupAttempt("")
}
// Simulate some successes and failures
m.RegisterBackupSuccess("")
m.RegisterBackupSuccess("")
m.RegisterBackupFailed("")
m.RegisterBackupPartialFailure("")
m.RegisterBackupValidationFailure("")
// Verify accumulated metrics
attemptMetric := getMetricValue(t, m.metrics[backupAttemptTotal].(*prometheus.CounterVec), "")
assert.Equal(t, float64(5), attemptMetric, "All adhoc backup attempts should be counted together")
successMetric := getMetricValue(t, m.metrics[backupSuccessTotal].(*prometheus.CounterVec), "")
assert.Equal(t, float64(2), successMetric, "All adhoc backup successes should be counted together")
failureMetric := getMetricValue(t, m.metrics[backupFailureTotal].(*prometheus.CounterVec), "")
assert.Equal(t, float64(1), failureMetric, "All adhoc backup failures should be counted together")
partialFailureMetric := getMetricValue(t, m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec), "")
assert.Equal(t, float64(1), partialFailureMetric, "All adhoc partial failures should be counted together")
validationFailureMetric := getMetricValue(t, m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec), "")
assert.Equal(t, float64(1), validationFailureMetric, "All adhoc validation failures should be counted together")
}

// TestInitScheduleWithEmptyName verifies that InitSchedule works correctly
// with an empty schedule name (for adhoc backups).
func TestInitScheduleWithEmptyName(t *testing.T) {
m := NewServerMetrics()
// Initialize metrics for empty schedule (adhoc backups)
m.InitSchedule("")
// Verify all metrics are initialized to 0
metrics := []string{
backupAttemptTotal,
backupSuccessTotal,
backupPartialFailureTotal,
backupFailureTotal,
backupValidationFailureTotal,
backupDeletionAttemptTotal,
backupDeletionSuccessTotal,
backupDeletionFailureTotal,
backupItemsTotalGauge,
backupItemsErrorsGauge,
backupWarningTotal,
restoreAttemptTotal,
restorePartialFailureTotal,
restoreFailedTotal,
restoreSuccessTotal,
restoreValidationFailedTotal,
volumeSnapshotSuccessTotal,
volumeSnapshotAttemptTotal,
volumeSnapshotFailureTotal,
}
for _, metricName := range metrics {
t.Run(metricName, func(t *testing.T) {
var value float64
switch vec := m.metrics[metricName].(type) {
case *prometheus.CounterVec:
value = getMetricValue(t, vec, "")
case *prometheus.GaugeVec:
value = getMetricValue(t, vec, "")
}
assert.Equal(t, float64(0), value, "Metric %s should be initialized to 0 for empty schedule", metricName)
})
}
// Special case: backupLastStatus should be initialized to 1 (success)
lastStatusValue := getMetricValue(t, m.metrics[backupLastStatus].(*prometheus.GaugeVec), "")
assert.Equal(t, float64(1), lastStatusValue, "backupLastStatus should be initialized to 1 for empty schedule")
}

// getMetricValue returns the current value of the series with the given
// schedule label from a CounterVec or GaugeVec.
func getMetricValue(t *testing.T, vec prometheus.Collector, scheduleLabel string) float64 {
t.Helper()
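// NOTE: a 1-slot buffer is enough only because every test in this file
// registers a single label value per vector; Collect would block if the
// collector held more child series than the channel can buffer.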
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
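// The local dto declared below shadows the imported client_model package for
// the rest of this loop body.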
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected schedule label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "schedule" && *label.Value == scheduleLabel {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel {
if dto.Counter != nil {
return *dto.Counter.Value
}
if dto.Gauge != nil {
return *dto.Gauge.Value
}
}
}
t.Fatalf("Metric with schedule label '%s' not found", scheduleLabel)
return 0
}
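
// An equivalent way to read a single labeled series in tests, shown here only
// as an alternative sketch (it is not used by this file), is client_golang's
// testutil package, assuming vec is a concrete *prometheus.CounterVec or
// *prometheus.GaugeVec rather than a bare Collector:
//
//	import "github.com/prometheus/client_golang/prometheus/testutil"
//
//	// WithLabelValues selects the child series for the given schedule label;
//	// testutil.ToFloat64 reads its current value.
//	value := testutil.ToFloat64(vec.WithLabelValues(scheduleLabel))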

// getHistogramCount returns the sample count of the histogram series with the
// given schedule label.
func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel string) uint64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected schedule label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "schedule" && *label.Value == scheduleLabel {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel && dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
t.Fatalf("Histogram with schedule label '%s' not found", scheduleLabel)
return 0
}

// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
tests := []struct {
name string
repositoryName string
description string
}{
{
name: "maintenance job metrics for repository",
repositoryName: "default-restic-abcd",
description: "Metrics should be recorded with the repository name label",
},
{
name: "maintenance job metrics for different repository",
repositoryName: "velero-backup-repo-xyz",
description: "Metrics should be recorded with different repository name",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics()
// Test repo maintenance success metric
t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance failure metric
t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
m.RegisterRepoMaintenanceFailure(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance duration metric
t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
// For histograms, assert on the sample count rather than a specific value
metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
assert.Equal(t, uint64(1), metric, tc.description)
})
})
}
}

// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
m := NewServerMetrics()
repoName := "default-restic-test"
// Simulate multiple repo maintenance job executions
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
// Record multiple durations
m.ObserveRepoMaintenanceDuration(repoName, 120.5)
m.ObserveRepoMaintenanceDuration(repoName, 180.3)
m.ObserveRepoMaintenanceDuration(repoName, 90.7)
// Verify accumulated metrics
successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}

// getMaintenanceMetricValue returns the current value of the series with the
// given repository_name label from a CounterVec or GaugeVec.
func getMaintenanceMetricValue(t *testing.T, vec prometheus.Collector, repositoryName string) float64 {
t.Helper()
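// Same single-series assumption as in getMetricValue: the 1-slot buffer works
// only because each test registers one repository per vector.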
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel {
if dto.Counter != nil {
return *dto.Counter.Value
}
if dto.Gauge != nil {
return *dto.Gauge.Value
}
}
}
t.Fatalf("Metric with repository_name label '%s' not found", repositoryName)
return 0
}

// getMaintenanceHistogramCount returns the sample count of the histogram
// series with the given repository_name label.
func getMaintenanceHistogramCount(t *testing.T, vec *prometheus.HistogramVec, repositoryName string) uint64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel && dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
t.Fatalf("Histogram with repository_name label '%s' not found", repositoryName)
return 0
}