diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index eb622908e..4280f4f1e 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -498,7 +498,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel // Record failure metric when job fails to start if r.metrics != nil { - r.metrics.RegisterMaintenanceJobFailure(req.Name) + r.metrics.RegisterRepoMaintenanceFailure(req.Name) } return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { @@ -518,10 +518,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel // Record failure metric if r.metrics != nil { - r.metrics.RegisterMaintenanceJobFailure(req.Name) + r.metrics.RegisterRepoMaintenanceFailure(req.Name) if status.StartTimestamp != nil && status.CompleteTimestamp != nil { duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds() - r.metrics.ObserveMaintenanceJobDuration(req.Name, duration) + r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration) } } @@ -532,10 +532,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel // Record success metric if r.metrics != nil { - r.metrics.RegisterMaintenanceJobSuccess(req.Name) + r.metrics.RegisterRepoMaintenanceSuccess(req.Name) if status.StartTimestamp != nil && status.CompleteTimestamp != nil { duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds() - r.metrics.ObserveMaintenanceJobDuration(req.Name, duration) + r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration) } } diff --git a/pkg/controller/backup_repository_controller_test.go b/pkg/controller/backup_repository_controller_test.go index 1fc1e9199..81a973200 100644 --- a/pkg/controller/backup_repository_controller_test.go +++ b/pkg/controller/backup_repository_controller_test.go @@ -1761,7 +1761,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) { }) } -func TestMaintenanceJobMetricsRecording(t *testing.T) { +func TestRepoMaintenanceMetricsRecording(t *testing.T) { now := time.Now().Round(time.Second) tests := []struct { @@ -1862,8 +1862,8 @@ func TestMaintenanceJobMetricsRecording(t *testing.T) { _ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger()) // Verify metrics were recorded - successCount := getMaintenanceMetricValue(t, m, "maintenance_job_success_total", test.repo.Name) - failureCount := getMaintenanceMetricValue(t, m, "maintenance_job_failure_total", test.repo.Name) + successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name) + failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name) durationCount := getMaintenanceDurationCount(t, m, test.repo.Name) if test.expectSuccess { @@ -1922,7 +1922,7 @@ func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoNam t.Helper() metricMap := m.Metrics() - collector, ok := metricMap["maintenance_job_duration_seconds"] + collector, ok := metricMap["repo_maintenance_duration_seconds"] if !ok { return 0 } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 84508bed0..eb00299e6 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -80,10 +80,10 @@ const ( DataDownloadFailureTotal = "data_download_failure_total" DataDownloadCancelTotal = "data_download_cancel_total" - // maintenance job metrics - maintenanceJobSuccessTotal = "maintenance_job_success_total" - maintenanceJobFailureTotal = "maintenance_job_failure_total" - maintenanceJobDurationSeconds = "maintenance_job_duration_seconds" + // repo maintenance metrics + repoMaintenanceSuccessTotal = "repo_maintenance_success_total" + repoMaintenanceFailureTotal = "repo_maintenance_failure_total" + repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds" // Labels nodeMetricLabel = "node" @@ -344,27 +344,27 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel, backupNameLabel}, ), - maintenanceJobSuccessTotal: prometheus.NewCounterVec( + repoMaintenanceSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, - Name: maintenanceJobSuccessTotal, - Help: "Total number of successful maintenance jobs", + Name: repoMaintenanceSuccessTotal, + Help: "Total number of successful repo maintenance jobs", }, []string{repositoryNameLabel}, ), - maintenanceJobFailureTotal: prometheus.NewCounterVec( + repoMaintenanceFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, - Name: maintenanceJobFailureTotal, - Help: "Total number of failed maintenance jobs", + Name: repoMaintenanceFailureTotal, + Help: "Total number of failed repo maintenance jobs", }, []string{repositoryNameLabel}, ), - maintenanceJobDurationSeconds: prometheus.NewHistogramVec( + repoMaintenanceDurationSeconds: prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metricNamespace, - Name: maintenanceJobDurationSeconds, - Help: "Time taken to complete maintenance jobs, in seconds", + Name: repoMaintenanceDurationSeconds, + Help: "Time taken to complete repo maintenance jobs, in seconds", Buckets: []float64{ toSeconds(1 * time.Minute), toSeconds(5 * time.Minute), @@ -959,23 +959,23 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str } } -// RegisterMaintenanceJobSuccess records a successful maintenance job. -func (m *ServerMetrics) RegisterMaintenanceJobSuccess(repositoryName string) { - if c, ok := m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec); ok { +// RegisterRepoMaintenanceSuccess records a successful repo maintenance job. +func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) { + if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(repositoryName).Inc() } } -// RegisterMaintenanceJobFailure records a failed maintenance job. -func (m *ServerMetrics) RegisterMaintenanceJobFailure(repositoryName string) { - if c, ok := m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec); ok { +// RegisterRepoMaintenanceFailure records a failed repo maintenance job. +func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) { + if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(repositoryName).Inc() } } -// ObserveMaintenanceJobDuration records the number of seconds a maintenance job took. -func (m *ServerMetrics) ObserveMaintenanceJobDuration(repositoryName string, seconds float64) { - if h, ok := m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec); ok { +// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took. +func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) { + if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok { h.WithLabelValues(repositoryName).Observe(seconds) } } diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 005228417..184e496ab 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -373,8 +373,8 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel return 0 } -// TestMaintenanceJobMetrics verifies that maintenance job metrics are properly recorded. -func TestMaintenanceJobMetrics(t *testing.T) { +// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded. +func TestRepoMaintenanceMetrics(t *testing.T) { tests := []struct { name string repositoryName string @@ -396,61 +396,61 @@ func TestMaintenanceJobMetrics(t *testing.T) { t.Run(tc.name, func(t *testing.T) { m := NewServerMetrics() - // Test maintenance job success metric - t.Run("RegisterMaintenanceJobSuccess", func(t *testing.T) { - m.RegisterMaintenanceJobSuccess(tc.repositoryName) + // Test repo maintenance success metric + t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) { + m.RegisterRepoMaintenanceSuccess(tc.repositoryName) - metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), tc.repositoryName) + metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName) assert.Equal(t, float64(1), metric, tc.description) }) - // Test maintenance job failure metric - t.Run("RegisterMaintenanceJobFailure", func(t *testing.T) { - m.RegisterMaintenanceJobFailure(tc.repositoryName) + // Test repo maintenance failure metric + t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) { + m.RegisterRepoMaintenanceFailure(tc.repositoryName) - metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), tc.repositoryName) + metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName) assert.Equal(t, float64(1), metric, tc.description) }) - // Test maintenance job duration metric - t.Run("ObserveMaintenanceJobDuration", func(t *testing.T) { - m.ObserveMaintenanceJobDuration(tc.repositoryName, 300.5) + // Test repo maintenance duration metric + t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) { + m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5) // For histogram, we check the count - metric := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName) + metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName) assert.Equal(t, uint64(1), metric, tc.description) }) }) } } -// TestMultipleMaintenanceJobsAccumulate verifies that multiple maintenance jobs +// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs // accumulate metrics under the same repository label. -func TestMultipleMaintenanceJobsAccumulate(t *testing.T) { +func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) { m := NewServerMetrics() repoName := "default-restic-test" - // Simulate multiple maintenance job executions - m.RegisterMaintenanceJobSuccess(repoName) - m.RegisterMaintenanceJobSuccess(repoName) - m.RegisterMaintenanceJobSuccess(repoName) - m.RegisterMaintenanceJobFailure(repoName) - m.RegisterMaintenanceJobFailure(repoName) + // Simulate multiple repo maintenance job executions + m.RegisterRepoMaintenanceSuccess(repoName) + m.RegisterRepoMaintenanceSuccess(repoName) + m.RegisterRepoMaintenanceSuccess(repoName) + m.RegisterRepoMaintenanceFailure(repoName) + m.RegisterRepoMaintenanceFailure(repoName) // Record multiple durations - m.ObserveMaintenanceJobDuration(repoName, 120.5) - m.ObserveMaintenanceJobDuration(repoName, 180.3) - m.ObserveMaintenanceJobDuration(repoName, 90.7) + m.ObserveRepoMaintenanceDuration(repoName, 120.5) + m.ObserveRepoMaintenanceDuration(repoName, 180.3) + m.ObserveRepoMaintenanceDuration(repoName, 90.7) // Verify accumulated metrics - successMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), repoName) - assert.Equal(t, float64(3), successMetric, "All maintenance job successes should be counted") + successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName) + assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted") - failureMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), repoName) - assert.Equal(t, float64(2), failureMetric, "All maintenance job failures should be counted") + failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName) + assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted") - durationCount := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), repoName) - assert.Equal(t, uint64(3), durationCount, "All maintenance job durations should be observed") + durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName) + assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed") } // Helper function to get metric value from a CounterVec with repository_name label