Merge pull request #9414 from shubham-pampattiwar/add-maintenance-job-metrics

Add Prometheus metrics for maintenance jobs
Committed by Shubham Pampattiwar on 2025-12-08 09:23:44 -08:00 (committed via GitHub)
6 changed files with 442 additions and 0 deletions

View File

@@ -0,0 +1 @@
Add Prometheus metrics for maintenance jobs
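
For context, a minimal standalone sketch (not part of this change) of how the three new series surface on a scrape. The registry wiring, port, and repository name below are illustrative assumptions; the metric names, the "velero" namespace prefix, the repository_name label, and the ServerMetrics methods all come from the pkg/metrics diff in this PR.

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/vmware-tanzu/velero/pkg/metrics"
)

func main() {
	m := metrics.NewServerMetrics()

	// Record one sample outcome of each kind for an illustrative repository name.
	m.RegisterRepoMaintenanceSuccess("default-default-kopia-abcd")
	m.RegisterRepoMaintenanceFailure("default-default-kopia-abcd")
	m.ObserveRepoMaintenanceDuration("default-default-kopia-abcd", 300.5)

	// Register every collector on a fresh registry and expose it. The scrape
	// output then includes, among others:
	//   velero_repo_maintenance_success_total{repository_name="default-default-kopia-abcd"} 1
	//   velero_repo_maintenance_failure_total{repository_name="default-default-kopia-abcd"} 1
	//   velero_repo_maintenance_duration_seconds_count{repository_name="default-default-kopia-abcd"} 1
	reg := prometheus.NewRegistry()
	for _, c := range m.Metrics() {
		reg.MustRegister(c)
	}
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8085", nil)
}

In the Velero server itself this registration is already handled at startup; the sketch only shows what the added series look like once the counters and histogram are populated.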

View File

@@ -758,6 +758,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.RepoMaintenanceJobConfig,
s.logLevel,
s.config.LogFormat,
s.metrics,
).SetupWithManager(s.mgr); err != nil {
s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupRepo)
}

View File

@@ -42,6 +42,7 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/label"
"github.com/vmware-tanzu/velero/pkg/metrics"
repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
@@ -66,6 +67,7 @@ type BackupRepoReconciler struct {
repoMaintenanceConfig string
logLevel logrus.Level
logFormat *logging.FormatFlag
metrics *metrics.ServerMetrics
}
func NewBackupRepoReconciler(
@@ -78,6 +80,7 @@ func NewBackupRepoReconciler(
repoMaintenanceConfig string,
logLevel logrus.Level,
logFormat *logging.FormatFlag,
metrics *metrics.ServerMetrics,
) *BackupRepoReconciler {
c := &BackupRepoReconciler{
client,
@@ -90,6 +93,7 @@ func NewBackupRepoReconciler(
repoMaintenanceConfig,
logLevel,
logFormat,
metrics,
}
return c
@@ -491,6 +495,12 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.logLevel, r.logFormat, log)
if err != nil {
log.WithError(err).Warn("Starting repo maintenance failed")
// Record failure metric when job fails to start
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, &metav1.Time{Time: startTime}, nil, fmt.Sprintf("Failed to start maintenance job, err: %v", err))
})
@@ -505,11 +515,30 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
if status.Result == velerov1api.BackupRepositoryMaintenanceFailed {
log.WithError(err).Warn("Pruning repository failed")
// Record failure metric
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp, status.CompleteTimestamp, status.Message)
})
}
// Record success metric
if r.metrics != nil {
r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time}
updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp, status.CompleteTimestamp, status.Message)

View File

@@ -19,6 +19,8 @@ import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
@@ -32,6 +34,7 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/builder"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomaintenance "github.com/vmware-tanzu/velero/pkg/repository/maintenance"
@@ -65,6 +68,7 @@ func mockBackupRepoReconciler(t *testing.T, mockOn string, arg any, ret ...any)
"",
logrus.InfoLevel,
nil,
nil,
)
}
@@ -584,6 +588,7 @@ func TestGetRepositoryMaintenanceFrequency(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
freq := reconciler.getRepositoryMaintenanceFrequency(test.repo)
@@ -716,6 +721,7 @@ func TestNeedInvalidBackupRepo(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
need := reconciler.needInvalidBackupRepo(test.oldBSL, test.newBSL)
@@ -1581,6 +1587,7 @@ func TestDeleteOldMaintenanceJobWithConfigMap(t *testing.T) {
repoMaintenanceConfigName,
logrus.InfoLevel,
nil,
nil,
)
_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: test.repo.Namespace, Name: "repo"}})
@@ -1638,6 +1645,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1689,6 +1697,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1739,6 +1748,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
"",
logrus.InfoLevel,
nil,
nil,
)
err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1750,3 +1760,189 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
assert.Equal(t, velerov1api.BackupRepositoryPhaseReady, rr.Status.Phase)
})
}
func TestRepoMaintenanceMetricsRecording(t *testing.T) {
now := time.Now().Round(time.Second)
tests := []struct {
name string
repo *velerov1api.BackupRepository
startJobFunc func(client.Client, context.Context, *velerov1api.BackupRepository, string, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error)
waitJobFunc func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error)
expectSuccess bool
expectFailure bool
expectDuration bool
}{
{
name: "metrics recorded on successful maintenance",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-success",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobSucceed,
waitJobFunc: waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceSucceeded, ""),
expectSuccess: true,
expectFailure: false,
expectDuration: true,
},
{
name: "metrics recorded on failed maintenance",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-failure",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobSucceed,
waitJobFunc: func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
return velerov1api.BackupRepositoryMaintenanceStatus{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Minute)}, // Job ran for 1 minute then failed
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "test error",
}, nil
},
expectSuccess: false,
expectFailure: true,
expectDuration: true,
},
{
name: "metrics recorded on job start failure",
repo: &velerov1api.BackupRepository{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "test-repo-start-fail",
},
Spec: velerov1api.BackupRepositorySpec{
MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
},
Status: velerov1api.BackupRepositoryStatus{
LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
},
},
startJobFunc: startMaintenanceJobFail,
expectSuccess: false,
expectFailure: true,
expectDuration: false, // No duration when job fails to start
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
// Create metrics instance
m := metrics.NewServerMetrics()
// Create reconciler with metrics
reconciler := mockBackupRepoReconciler(t, "", test.repo, nil)
reconciler.metrics = m
reconciler.clock = &fakeClock{now}
err := reconciler.Client.Create(t.Context(), test.repo)
require.NoError(t, err)
// Set up job functions
funcStartMaintenanceJob = test.startJobFunc
funcWaitMaintenanceJobComplete = test.waitJobFunc
// Run maintenance
_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
// Verify metrics were recorded
successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
if test.expectSuccess {
assert.Equal(t, float64(1), successCount, "Success metric should be recorded")
} else {
assert.Equal(t, float64(0), successCount, "Success metric should not be recorded")
}
if test.expectFailure {
assert.Equal(t, float64(1), failureCount, "Failure metric should be recorded")
} else {
assert.Equal(t, float64(0), failureCount, "Failure metric should not be recorded")
}
if test.expectDuration {
assert.Equal(t, uint64(1), durationCount, "Duration metric should be recorded")
} else {
assert.Equal(t, uint64(0), durationCount, "Duration metric should not be recorded")
}
})
}
}
// Helper to get maintenance metric value from ServerMetrics
func getMaintenanceMetricValue(t *testing.T, m *metrics.ServerMetrics, metricName, repoName string) float64 {
t.Helper()
metricMap := m.Metrics()
collector, ok := metricMap[metricName]
if !ok {
return 0
}
ch := make(chan prometheus.Metric, 1)
collector.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repoName {
if dto.Counter != nil {
return *dto.Counter.Value
}
}
}
}
return 0
}
// Helper to get maintenance duration histogram count
func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoName string) uint64 {
t.Helper()
metricMap := m.Metrics()
collector, ok := metricMap["repo_maintenance_duration_seconds"]
if !ok {
return 0
}
ch := make(chan prometheus.Metric, 1)
collector.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repoName {
if dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
}
}
return 0
}

View File

@@ -27,6 +27,11 @@ type ServerMetrics struct {
metrics map[string]prometheus.Collector
}
// Metrics returns the metrics map for testing purposes.
func (m *ServerMetrics) Metrics() map[string]prometheus.Collector {
return m.metrics
}
const (
metricNamespace = "velero"
podVolumeMetricsNamespace = "podVolume"
@@ -75,6 +80,14 @@ const (
DataDownloadFailureTotal = "data_download_failure_total"
DataDownloadCancelTotal = "data_download_cancel_total"
// repo maintenance metrics
repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
// repoMaintenanceDurationSeconds tracks the distribution of maintenance job durations.
// Each completed job's duration is recorded in the appropriate bucket, allowing
// analysis of individual job performance and trending over time.
repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
// Labels
nodeMetricLabel = "node"
podVolumeOperationLabel = "operation"
@@ -82,6 +95,7 @@ const (
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
repositoryNameLabel = "repository_name"
// metrics values
BackupLastStatusSucc int64 = 1
@@ -333,6 +347,41 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel, backupNameLabel},
),
repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: repoMaintenanceSuccessTotal,
Help: "Total number of successful repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
repoMaintenanceFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: repoMaintenanceFailureTotal,
Help: "Total number of failed repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metricNamespace,
Name: repoMaintenanceDurationSeconds,
Help: "Time taken to complete repo maintenance jobs, in seconds",
Buckets: []float64{
toSeconds(1 * time.Minute),
toSeconds(5 * time.Minute),
toSeconds(10 * time.Minute),
toSeconds(15 * time.Minute),
toSeconds(30 * time.Minute),
toSeconds(1 * time.Hour),
toSeconds(2 * time.Hour),
toSeconds(3 * time.Hour),
toSeconds(4 * time.Hour),
},
},
[]string{repositoryNameLabel},
),
},
}
}
@@ -912,3 +961,24 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
g.WithLabelValues(backupLocationName).Set(float64(0))
}
}
// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
h.WithLabelValues(repositoryName).Observe(seconds)
}
}
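
The toSeconds helper used for the histogram bucket boundaries above is pre-existing in pkg/metrics and is not part of this diff; a minimal equivalent, assuming it simply converts a time.Duration to a float64 number of seconds, would look like this:

import "time"

// Assumed shape of the pre-existing toSeconds helper referenced by the bucket
// definitions above: it converts a Duration into seconds for use as a
// histogram bucket boundary (e.g. toSeconds(5 * time.Minute) == 300).
func toSeconds(d time.Duration) float64 {
	return float64(d / time.Second)
}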

View File

@@ -372,3 +372,148 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
t.Fatalf("Histogram with schedule label '%s' not found", scheduleLabel)
return 0
}
// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
tests := []struct {
name string
repositoryName string
description string
}{
{
name: "maintenance job metrics for repository",
repositoryName: "default-restic-abcd",
description: "Metrics should be recorded with the repository name label",
},
{
name: "maintenance job metrics for different repository",
repositoryName: "velero-backup-repo-xyz",
description: "Metrics should be recorded with different repository name",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics()
// Test repo maintenance success metric
t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance failure metric
t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
m.RegisterRepoMaintenanceFailure(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test repo maintenance duration metric
t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
// For histogram, we check the count
metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
assert.Equal(t, uint64(1), metric, tc.description)
})
})
}
}
// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
m := NewServerMetrics()
repoName := "default-restic-test"
// Simulate multiple repo maintenance job executions
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
// Record multiple durations
m.ObserveRepoMaintenanceDuration(repoName, 120.5)
m.ObserveRepoMaintenanceDuration(repoName, 180.3)
m.ObserveRepoMaintenanceDuration(repoName, 90.7)
// Verify accumulated metrics
successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}
// Helper function to get metric value from a CounterVec with repository_name label
func getMaintenanceMetricValue(t *testing.T, vec prometheus.Collector, repositoryName string) float64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel {
if dto.Counter != nil {
return *dto.Counter.Value
}
if dto.Gauge != nil {
return *dto.Gauge.Value
}
}
}
t.Fatalf("Metric with repository_name label '%s' not found", repositoryName)
return 0
}
// Helper function to get histogram count with repository_name label
func getMaintenanceHistogramCount(t *testing.T, vec *prometheus.HistogramVec, repositoryName string) uint64 {
t.Helper()
ch := make(chan prometheus.Metric, 1)
vec.Collect(ch)
close(ch)
for metric := range ch {
dto := &dto.Metric{}
err := metric.Write(dto)
require.NoError(t, err)
// Check if this metric has the expected repository_name label
hasCorrectLabel := false
for _, label := range dto.Label {
if *label.Name == "repository_name" && *label.Value == repositoryName {
hasCorrectLabel = true
break
}
}
if hasCorrectLabel && dto.Histogram != nil {
return *dto.Histogram.SampleCount
}
}
t.Fatalf("Histogram with repository_name label '%s' not found", repositoryName)
return 0
}