Address review comments: rename metrics to repo_maintenance_*

- Rename metric constants from maintenance_job_* to repo_maintenance_*
- Update metric help text to clarify these are for repo maintenance
- Rename functions: RegisterMaintenanceJob* → RegisterRepoMaintenance*
- Update all test references to use new names

Addresses review comments from @Lyndon-Li on PR #9414

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
This commit is contained in:
Shubham Pampattiwar
2025-11-18 12:17:31 -08:00
parent fdf439963c
commit 27ca08b5a5
4 changed files with 62 additions and 62 deletions

View File

@@ -498,7 +498,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record failure metric when job fails to start // Record failure metric when job fails to start
if r.metrics != nil { if r.metrics != nil {
r.metrics.RegisterMaintenanceJobFailure(req.Name) r.metrics.RegisterRepoMaintenanceFailure(req.Name)
} }
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
@@ -518,10 +518,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record failure metric // Record failure metric
if r.metrics != nil { if r.metrics != nil {
r.metrics.RegisterMaintenanceJobFailure(req.Name) r.metrics.RegisterRepoMaintenanceFailure(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil { if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds() duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveMaintenanceJobDuration(req.Name, duration) r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
} }
} }
@@ -532,10 +532,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record success metric // Record success metric
if r.metrics != nil { if r.metrics != nil {
r.metrics.RegisterMaintenanceJobSuccess(req.Name) r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil { if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds() duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveMaintenanceJobDuration(req.Name, duration) r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
} }
} }

View File

@@ -1761,7 +1761,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
}) })
} }
func TestMaintenanceJobMetricsRecording(t *testing.T) { func TestRepoMaintenanceMetricsRecording(t *testing.T) {
now := time.Now().Round(time.Second) now := time.Now().Round(time.Second)
tests := []struct { tests := []struct {
@@ -1862,8 +1862,8 @@ func TestMaintenanceJobMetricsRecording(t *testing.T) {
_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger()) _ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
// Verify metrics were recorded // Verify metrics were recorded
successCount := getMaintenanceMetricValue(t, m, "maintenance_job_success_total", test.repo.Name) successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
failureCount := getMaintenanceMetricValue(t, m, "maintenance_job_failure_total", test.repo.Name) failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
durationCount := getMaintenanceDurationCount(t, m, test.repo.Name) durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
if test.expectSuccess { if test.expectSuccess {
@@ -1922,7 +1922,7 @@ func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoNam
t.Helper() t.Helper()
metricMap := m.Metrics() metricMap := m.Metrics()
collector, ok := metricMap["maintenance_job_duration_seconds"] collector, ok := metricMap["repo_maintenance_duration_seconds"]
if !ok { if !ok {
return 0 return 0
} }

View File

@@ -80,10 +80,10 @@ const (
DataDownloadFailureTotal = "data_download_failure_total" DataDownloadFailureTotal = "data_download_failure_total"
DataDownloadCancelTotal = "data_download_cancel_total" DataDownloadCancelTotal = "data_download_cancel_total"
// maintenance job metrics // repo maintenance metrics
maintenanceJobSuccessTotal = "maintenance_job_success_total" repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
maintenanceJobFailureTotal = "maintenance_job_failure_total" repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
maintenanceJobDurationSeconds = "maintenance_job_duration_seconds" repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
// Labels // Labels
nodeMetricLabel = "node" nodeMetricLabel = "node"
@@ -344,27 +344,27 @@ func NewServerMetrics() *ServerMetrics {
}, },
[]string{scheduleLabel, backupNameLabel}, []string{scheduleLabel, backupNameLabel},
), ),
maintenanceJobSuccessTotal: prometheus.NewCounterVec( repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: metricNamespace, Namespace: metricNamespace,
Name: maintenanceJobSuccessTotal, Name: repoMaintenanceSuccessTotal,
Help: "Total number of successful maintenance jobs", Help: "Total number of successful repo maintenance jobs",
}, },
[]string{repositoryNameLabel}, []string{repositoryNameLabel},
), ),
maintenanceJobFailureTotal: prometheus.NewCounterVec( repoMaintenanceFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: metricNamespace, Namespace: metricNamespace,
Name: maintenanceJobFailureTotal, Name: repoMaintenanceFailureTotal,
Help: "Total number of failed maintenance jobs", Help: "Total number of failed repo maintenance jobs",
}, },
[]string{repositoryNameLabel}, []string{repositoryNameLabel},
), ),
maintenanceJobDurationSeconds: prometheus.NewHistogramVec( repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
prometheus.HistogramOpts{ prometheus.HistogramOpts{
Namespace: metricNamespace, Namespace: metricNamespace,
Name: maintenanceJobDurationSeconds, Name: repoMaintenanceDurationSeconds,
Help: "Time taken to complete maintenance jobs, in seconds", Help: "Time taken to complete repo maintenance jobs, in seconds",
Buckets: []float64{ Buckets: []float64{
toSeconds(1 * time.Minute), toSeconds(1 * time.Minute),
toSeconds(5 * time.Minute), toSeconds(5 * time.Minute),
@@ -959,23 +959,23 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
} }
} }
// RegisterMaintenanceJobSuccess records a successful maintenance job. // RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
func (m *ServerMetrics) RegisterMaintenanceJobSuccess(repositoryName string) { func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
if c, ok := m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec); ok { if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc() c.WithLabelValues(repositoryName).Inc()
} }
} }
// RegisterMaintenanceJobFailure records a failed maintenance job. // RegisterRepoMaintenanceFailure records a failed repo maintenance job.
func (m *ServerMetrics) RegisterMaintenanceJobFailure(repositoryName string) { func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
if c, ok := m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec); ok { if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc() c.WithLabelValues(repositoryName).Inc()
} }
} }
// ObserveMaintenanceJobDuration records the number of seconds a maintenance job took. // ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
func (m *ServerMetrics) ObserveMaintenanceJobDuration(repositoryName string, seconds float64) { func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
if h, ok := m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec); ok { if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
h.WithLabelValues(repositoryName).Observe(seconds) h.WithLabelValues(repositoryName).Observe(seconds)
} }
} }

View File

@@ -373,8 +373,8 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
return 0 return 0
} }
// TestMaintenanceJobMetrics verifies that maintenance job metrics are properly recorded. // TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestMaintenanceJobMetrics(t *testing.T) { func TestRepoMaintenanceMetrics(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
repositoryName string repositoryName string
@@ -396,61 +396,61 @@ func TestMaintenanceJobMetrics(t *testing.T) {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics() m := NewServerMetrics()
// Test maintenance job success metric // Test repo maintenance success metric
t.Run("RegisterMaintenanceJobSuccess", func(t *testing.T) { t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
m.RegisterMaintenanceJobSuccess(tc.repositoryName) m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), tc.repositoryName) metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description) assert.Equal(t, float64(1), metric, tc.description)
}) })
// Test maintenance job failure metric // Test repo maintenance failure metric
t.Run("RegisterMaintenanceJobFailure", func(t *testing.T) { t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
m.RegisterMaintenanceJobFailure(tc.repositoryName) m.RegisterRepoMaintenanceFailure(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), tc.repositoryName) metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description) assert.Equal(t, float64(1), metric, tc.description)
}) })
// Test maintenance job duration metric // Test repo maintenance duration metric
t.Run("ObserveMaintenanceJobDuration", func(t *testing.T) { t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
m.ObserveMaintenanceJobDuration(tc.repositoryName, 300.5) m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
// For histogram, we check the count // For histogram, we check the count
metric := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName) metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
assert.Equal(t, uint64(1), metric, tc.description) assert.Equal(t, uint64(1), metric, tc.description)
}) })
}) })
} }
} }
// TestMultipleMaintenanceJobsAccumulate verifies that multiple maintenance jobs // TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label. // accumulate metrics under the same repository label.
func TestMultipleMaintenanceJobsAccumulate(t *testing.T) { func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
m := NewServerMetrics() m := NewServerMetrics()
repoName := "default-restic-test" repoName := "default-restic-test"
// Simulate multiple maintenance job executions // Simulate multiple repo maintenance job executions
m.RegisterMaintenanceJobSuccess(repoName) m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterMaintenanceJobSuccess(repoName) m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterMaintenanceJobSuccess(repoName) m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterMaintenanceJobFailure(repoName) m.RegisterRepoMaintenanceFailure(repoName)
m.RegisterMaintenanceJobFailure(repoName) m.RegisterRepoMaintenanceFailure(repoName)
// Record multiple durations // Record multiple durations
m.ObserveMaintenanceJobDuration(repoName, 120.5) m.ObserveRepoMaintenanceDuration(repoName, 120.5)
m.ObserveMaintenanceJobDuration(repoName, 180.3) m.ObserveRepoMaintenanceDuration(repoName, 180.3)
m.ObserveMaintenanceJobDuration(repoName, 90.7) m.ObserveRepoMaintenanceDuration(repoName, 90.7)
// Verify accumulated metrics // Verify accumulated metrics
successMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), repoName) successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All maintenance job successes should be counted") assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), repoName) failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All maintenance job failures should be counted") assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
durationCount := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), repoName) durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All maintenance job durations should be observed") assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
} }
// Helper function to get metric value from a CounterVec with repository_name label // Helper function to get metric value from a CounterVec with repository_name label