Address review comments: rename metrics to repo_maintenance_*

- Rename metric constants from maintenance_job_* to repo_maintenance_*
- Update metric help text to clarify these are for repo maintenance
- Rename functions: RegisterMaintenanceJob* → RegisterRepoMaintenance*
- Update all test references to use new names

Addresses review comments from @Lyndon-Li on PR #9414

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
This commit is contained in:
Shubham Pampattiwar
2025-11-18 12:17:31 -08:00
parent fdf439963c
commit 27ca08b5a5
4 changed files with 62 additions and 62 deletions

View File

@@ -498,7 +498,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record failure metric when job fails to start
if r.metrics != nil {
r.metrics.RegisterMaintenanceJobFailure(req.Name)
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
}
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
@@ -518,10 +518,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record failure metric
if r.metrics != nil {
r.metrics.RegisterMaintenanceJobFailure(req.Name)
r.metrics.RegisterRepoMaintenanceFailure(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}
@@ -532,10 +532,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
// Record success metric
if r.metrics != nil {
r.metrics.RegisterMaintenanceJobSuccess(req.Name)
r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
}
}

View File

@@ -1761,7 +1761,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
})
}
func TestMaintenanceJobMetricsRecording(t *testing.T) {
func TestRepoMaintenanceMetricsRecording(t *testing.T) {
now := time.Now().Round(time.Second)
tests := []struct {
@@ -1862,8 +1862,8 @@ func TestMaintenanceJobMetricsRecording(t *testing.T) {
_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
// Verify metrics were recorded
successCount := getMaintenanceMetricValue(t, m, "maintenance_job_success_total", test.repo.Name)
failureCount := getMaintenanceMetricValue(t, m, "maintenance_job_failure_total", test.repo.Name)
successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
if test.expectSuccess {
@@ -1922,7 +1922,7 @@ func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoNam
t.Helper()
metricMap := m.Metrics()
collector, ok := metricMap["maintenance_job_duration_seconds"]
collector, ok := metricMap["repo_maintenance_duration_seconds"]
if !ok {
return 0
}

View File

@@ -80,10 +80,10 @@ const (
DataDownloadFailureTotal = "data_download_failure_total"
DataDownloadCancelTotal = "data_download_cancel_total"
// maintenance job metrics
maintenanceJobSuccessTotal = "maintenance_job_success_total"
maintenanceJobFailureTotal = "maintenance_job_failure_total"
maintenanceJobDurationSeconds = "maintenance_job_duration_seconds"
// repo maintenance metrics
repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
// Labels
nodeMetricLabel = "node"
@@ -344,27 +344,27 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel, backupNameLabel},
),
maintenanceJobSuccessTotal: prometheus.NewCounterVec(
repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: maintenanceJobSuccessTotal,
Help: "Total number of successful maintenance jobs",
Name: repoMaintenanceSuccessTotal,
Help: "Total number of successful repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
maintenanceJobFailureTotal: prometheus.NewCounterVec(
repoMaintenanceFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: maintenanceJobFailureTotal,
Help: "Total number of failed maintenance jobs",
Name: repoMaintenanceFailureTotal,
Help: "Total number of failed repo maintenance jobs",
},
[]string{repositoryNameLabel},
),
maintenanceJobDurationSeconds: prometheus.NewHistogramVec(
repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metricNamespace,
Name: maintenanceJobDurationSeconds,
Help: "Time taken to complete maintenance jobs, in seconds",
Name: repoMaintenanceDurationSeconds,
Help: "Time taken to complete repo maintenance jobs, in seconds",
Buckets: []float64{
toSeconds(1 * time.Minute),
toSeconds(5 * time.Minute),
@@ -959,23 +959,23 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
}
}
// RegisterMaintenanceJobSuccess records a successful maintenance job.
func (m *ServerMetrics) RegisterMaintenanceJobSuccess(repositoryName string) {
if c, ok := m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec); ok {
// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// RegisterMaintenanceJobFailure records a failed maintenance job.
func (m *ServerMetrics) RegisterMaintenanceJobFailure(repositoryName string) {
if c, ok := m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec); ok {
// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(repositoryName).Inc()
}
}
// ObserveMaintenanceJobDuration records the number of seconds a maintenance job took.
func (m *ServerMetrics) ObserveMaintenanceJobDuration(repositoryName string, seconds float64) {
if h, ok := m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec); ok {
// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
h.WithLabelValues(repositoryName).Observe(seconds)
}
}

View File

@@ -373,8 +373,8 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
return 0
}
// TestMaintenanceJobMetrics verifies that maintenance job metrics are properly recorded.
func TestMaintenanceJobMetrics(t *testing.T) {
// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
tests := []struct {
name string
repositoryName string
@@ -396,61 +396,61 @@ func TestMaintenanceJobMetrics(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
m := NewServerMetrics()
// Test maintenance job success metric
t.Run("RegisterMaintenanceJobSuccess", func(t *testing.T) {
m.RegisterMaintenanceJobSuccess(tc.repositoryName)
// Test repo maintenance success metric
t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test maintenance job failure metric
t.Run("RegisterMaintenanceJobFailure", func(t *testing.T) {
m.RegisterMaintenanceJobFailure(tc.repositoryName)
// Test repo maintenance failure metric
t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
m.RegisterRepoMaintenanceFailure(tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
assert.Equal(t, float64(1), metric, tc.description)
})
// Test maintenance job duration metric
t.Run("ObserveMaintenanceJobDuration", func(t *testing.T) {
m.ObserveMaintenanceJobDuration(tc.repositoryName, 300.5)
// Test repo maintenance duration metric
t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
// For histogram, we check the count
metric := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
assert.Equal(t, uint64(1), metric, tc.description)
})
})
}
}
// TestMultipleMaintenanceJobsAccumulate verifies that multiple maintenance jobs
// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleMaintenanceJobsAccumulate(t *testing.T) {
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
m := NewServerMetrics()
repoName := "default-restic-test"
// Simulate multiple maintenance job executions
m.RegisterMaintenanceJobSuccess(repoName)
m.RegisterMaintenanceJobSuccess(repoName)
m.RegisterMaintenanceJobSuccess(repoName)
m.RegisterMaintenanceJobFailure(repoName)
m.RegisterMaintenanceJobFailure(repoName)
// Simulate multiple repo maintenance job executions
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceSuccess(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
m.RegisterRepoMaintenanceFailure(repoName)
// Record multiple durations
m.ObserveMaintenanceJobDuration(repoName, 120.5)
m.ObserveMaintenanceJobDuration(repoName, 180.3)
m.ObserveMaintenanceJobDuration(repoName, 90.7)
m.ObserveRepoMaintenanceDuration(repoName, 120.5)
m.ObserveRepoMaintenanceDuration(repoName, 180.3)
m.ObserveRepoMaintenanceDuration(repoName, 90.7)
// Verify accumulated metrics
successMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All maintenance job successes should be counted")
successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All maintenance job failures should be counted")
failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
durationCount := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All maintenance job durations should be observed")
durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}
// Helper function to get metric value from a CounterVec with repository_name label