Merge pull request #9414 from shubham-pampattiwar/add-maintenance-job-metrics

Add Prometheus metrics for maintenance jobs
Committed by Shubham Pampattiwar on 2025-12-08 09:23:44 -08:00 (committed via GitHub)
6 changed files with 442 additions and 0 deletions

View File

@@ -0,0 +1 @@
Add Prometheus metrics for maintenance jobs
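
For context, a minimal standalone sketch (not part of this change) of how the three new series surface on a scrape. The registry wiring, port, and repository name below are illustrative assumptions; the metric names, the "velero" namespace prefix, the repository_name label, and the ServerMetrics methods all come from the pkg/metrics diff in this PR.

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/vmware-tanzu/velero/pkg/metrics"
)

func main() {
	m := metrics.NewServerMetrics()

	// Record one sample outcome of each kind for an illustrative repository name.
	m.RegisterRepoMaintenanceSuccess("default-default-kopia-abcd")
	m.RegisterRepoMaintenanceFailure("default-default-kopia-abcd")
	m.ObserveRepoMaintenanceDuration("default-default-kopia-abcd", 300.5)

	// Register every collector on a fresh registry and expose it. The scrape
	// output then includes, among others:
	//   velero_repo_maintenance_success_total{repository_name="default-default-kopia-abcd"} 1
	//   velero_repo_maintenance_failure_total{repository_name="default-default-kopia-abcd"} 1
	//   velero_repo_maintenance_duration_seconds_count{repository_name="default-default-kopia-abcd"} 1
	reg := prometheus.NewRegistry()
	for _, c := range m.Metrics() {
		reg.MustRegister(c)
	}
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8085", nil)
}

In the Velero server itself this registration is already handled at startup; the sketch only shows what the added series look like once the counters and histogram are populated.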

View File

@@ -758,6 +758,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.RepoMaintenanceJobConfig,
s.logLevel,
s.config.LogFormat,
s.metrics,
).SetupWithManager(s.mgr); err != nil {
s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupRepo)
}

View File

@@ -42,6 +42,7 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/label"
"github.com/vmware-tanzu/velero/pkg/metrics"
repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
@@ -66,6 +67,7 @@ type BackupRepoReconciler struct {
repoMaintenanceConfig string
logLevel logrus.Level
logFormat *logging.FormatFlag
metrics *metrics.ServerMetrics
}
func NewBackupRepoReconciler(
@@ -78,6 +80,7 @@ func NewBackupRepoReconciler(
repoMaintenanceConfig string,
logLevel logrus.Level,
logFormat *logging.FormatFlag,
metrics *metrics.ServerMetrics,
) *BackupRepoReconciler {
c := &BackupRepoReconciler{
client,
@@ -90,6 +93,7 @@ func NewBackupRepoReconciler(
repoMaintenanceConfig,
logLevel,
logFormat,
metrics,
}
return c
@@ -491,6 +495,12 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.logLevel, r.logFormat, log)
if err != nil {
log.WithError(err).Warn("Starting repo maintenance failed")
// Record failure metric when job fails to start
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, &metav1.Time{Time: startTime}, nil, fmt.Sprintf("Failed to start maintenance job, err: %v", err))
})
@@ -505,11 +515,30 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
if status.Result == velerov1api.BackupRepositoryMaintenanceFailed {
log.WithError(err).Warn("Pruning repository failed")
// Record failure metric
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp, status.CompleteTimestamp, status.Message)
})
}
// Record success metric
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time}
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp, status.CompleteTimestamp, status.Message)

View File

@@ -19,6 +19,8 @@ import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
@@ -32,6 +34,7 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/builder"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomaintenance "github.com/vmware-tanzu/velero/pkg/repository/maintenance"
@@ -65,6 +68,7 @@ func mockBackupRepoReconciler(t *testing.T, mockOn string, arg any, ret ...any)
"",
logrus.InfoLevel,
nil,
nil,
)
}
@@ -584,6 +588,7 @@ func TestGetRepositoryMaintenanceFrequency(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
freq := reconciler.getRepositoryMaintenanceFrequency(test.repo)
@@ -716,6 +721,7 @@ func TestNeedInvalidBackupRepo(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
need := reconciler.needInvalidBackupRepo(test.oldBSL, test.newBSL)
@@ -1581,6 +1587,7 @@ func TestDeleteOldMaintenanceJobWithConfigMap(t *testing.T) {
repoMaintenanceConfigName,
logrus.InfoLevel,
nil,
nil,
)
_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: test.repo.Namespace, Name: "repo"}})
@@ -1638,6 +1645,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1689,6 +1697,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1739,6 +1748,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1750,3 +1760,189 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
assert.Equal(t, velerov1api.BackupRepositoryPhaseReady, rr.Status.Phase)
})
}
func TestRepoMaintenanceMetricsRecording(t *testing.T) {
now := time.Now().Round(time.Second)
tests := []struct {
name string
repo *velerov1api.BackupRepository
startJobFunc func(client.Client, context.Context, *velerov1api.BackupRepository, string, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error)
waitJobFunc func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error)
expectSuccess bool
expectFailure bool
expectDuration bool
}{
{
name: "metrics recorded on successful maintenance",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-success",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobSucceed,
waitJobFunc: waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceSucceeded, ""),
expectSuccess: true,
expectFailure: false,
expectDuration: true,
},
{
name: "metrics recorded on failed maintenance",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-failure",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobSucceed,
waitJobFunc: func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
return velerov1api.BackupRepositoryMaintenanceStatus{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Minute)}, // Job ran for 1 minute then failed
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "test error",
}, nil
},
expectSuccess: false,
expectFailure: true,
expectDuration: true,
},
{
name: "metrics recorded on job start failure",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-start-fail",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobFail,
expectSuccess: false,
expectFailure: true,
expectDuration: false, // No duration when job fails to start
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
// Create metrics instance
m := metrics.NewServerMetrics()
// Create reconciler with metrics
reconciler := mockBackupRepoReconciler(t, "", test.repo, nil)
reconciler.metrics = m
reconciler.clock = &fakeClock{now}
err := reconciler.Client.Create(t.Context(), test.repo)
require.NoError(t, err)
// Set up job functions
funcStartMaintenanceJob = test.startJobFunc
funcWaitMaintenanceJobComplete = test.waitJobFunc
// Run maintenance
_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
// Verify metrics were recorded
successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
if test.expectSuccess {
assert.Equal(t, float64(1), successCount, "Success metric should be recorded")
} else {
assert.Equal(t, float64(0), successCount, "Success metric should not be recorded")
}
if test.expectFailure {
assert.Equal(t, float64(1), failureCount, "Failure metric should be recorded")
} else {
assert.Equal(t, float64(0), failureCount, "Failure metric should not be recorded")
}
if test.expectDuration {
assert.Equal(t, uint64(1), durationCount, "Duration metric should be recorded")
} else {
assert.Equal(t, uint64(0), durationCount, "Duration metric should not be recorded")
}
})
}
}
// Helper to get maintenance metric value from ServerMetrics
func getMaintenanceMetricValue(t *testing.T, m *metrics.ServerMetrics, metricName, repoName string) float64 {
t.Helper()
metricMap := m.Metrics()
collector, ok := metricMap[metricName]
if !ok {
return 0
}
ch := make(chan prometheus.Metric, 1)
collector.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repoName {
if dto.Counter != nil {
return *dto.Counter.Value
}
}
}
}
return 0
}
// Helper to get maintenance duration histogram count
func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoName string) uint64 {
t.Helper()
metricMap := m.Metrics()
collector, ok := metricMap["repo_maintenance_duration_seconds"]
if !ok {
return 0
}
ch := make(chan prometheus.Metric, 1)
collector.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repoName {
if dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
}
}
return 0
}

View File

@@ -27,6 +27,11 @@ type ServerMetrics struct {
metrics map[string]prometheus.Collector
}
// Metrics returns the metrics map for testing purposes.
func (m *ServerMetrics) Metrics() map[string]prometheus.Collector {
return m.metrics
}
const (
metricNamespace = "velero"
podVolumeMetricsNamespace = "podVolume"
@@ -75,6 +80,14 @@ const (
DataDownloadFailureTotal = "data_download_failure_total"
DataDownloadCancelTotal = "data_download_cancel_total"
// repo maintenance metrics
repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
// repoMaintenanceDurationSeconds tracks the distribution of maintenance job durations.
// Each completed job's duration is recorded in the appropriate bucket, allowing
// analysis of individual job performance and trending over time.
repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
// Labels
nodeMetricLabel = "node"
podVolumeOperationLabel = "operation"
@@ -82,6 +95,7 @@ const (
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
repositoryNameLabel = "repository_name"
// metrics values
BackupLastStatusSucc int64 = 1
@@ -333,6 +347,41 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel, backupNameLabel},
),
repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: repoMaintenanceSuccessTotal,
Help: "Total number of successful repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
repoMaintenanceFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: repoMaintenanceFailureTotal,
Help: "Total number of failed repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metricNamespace,
Name: repoMaintenanceDurationSeconds,
Help: "Time taken to complete repo maintenance jobs, in seconds",
Buckets: []float64{
toSeconds(1 * time.Minute),
toSeconds(5 * time.Minute),
toSeconds(10 * time.Minute),
toSeconds(15 * time.Minute),
toSeconds(30 * time.Minute),
toSeconds(1 * time.Hour),
toSeconds(2 * time.Hour),
toSeconds(3 * time.Hour),
toSeconds(4 * time.Hour),
},
},
[]string{repositoryNameLabel},
),
},
}
}
@@ -912,3 +961,24 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
g.WithLabelValues(backupLocationName).Set(float64(0))
}
}
// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
h.WithLabelValues(repositoryName).Observe(seconds)
}
}
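
The toSeconds helper used for the histogram bucket boundaries above is pre-existing in pkg/metrics and is not part of this diff; a minimal equivalent, assuming it simply converts a time.Duration to a float64 number of seconds, would look like this:

import "time"

// Assumed shape of the pre-existing toSeconds helper referenced by the bucket
// definitions above: it converts a Duration into seconds for use as a
// histogram bucket boundary (e.g. toSeconds(5 * time.Minute) == 300).
func toSeconds(d time.Duration) float64 {
	return float64(d / time.Second)
}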

View File

@@ -372,3 +372,148 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
t.Fatalf("Histogram with schedule label '%s' not found", scheduleLabel)
return 0
}
// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
tests := []struct {
name string
repositoryName string
description string
}{
{
name: "maintenance job metrics for repository",
repositoryName: "default-restic-abcd",
description: "Metrics should be recorded with the repository name label",
},
{
name: "maintenance job metrics for different repository",
repositoryName: "velero-backup-repo-xyz",
description: "Metrics should be recorded with different repository name",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics()
// Test repo maintenance success metric
t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance failure metric
t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
m.RegisterRepoMaintenanceFailure(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance duration metric
t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
// For histogram, we check the count
metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
assert.Equal(t, uint64(1), metric, tc.description)
})
})
}
}
// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
m := NewServerMetrics()
repoName := "default-restic-test"
// Simulate multiple repo maintenance job executions
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
// Record multiple durations
m.ObserveRepoMaintenanceDuration(repoName, 120.5)
m.ObserveRepoMaintenanceDuration(repoName, 180.3)
m.ObserveRepoMaintenanceDuration(repoName, 90.7)
// Verify accumulated metrics
successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}
// Helper function to get metric value from a CounterVec with repository_name label
func getMaintenanceMetricValue(t *testing.T, vec prometheus.Collector, repositoryName string) float64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel {
if dto.Counter != nil {
return *dto.Counter.Value
}
if dto.Gauge != nil {
return *dto.Gauge.Value
}
}
}
t.Fatalf("Metric with repository_name label '%s' not found", repositoryName)
return 0
}
// Helper function to get histogram count with repository_name label
func getMaintenanceHistogramCount(t *testing.T, vec *prometheus.HistogramVec, repositoryName string) uint64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel && dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
t.Fatalf("Histogram with repository_name label '%s' not found", repositoryName)
return 0
}