Mirror of https://github.com/vmware-tanzu/velero.git
Address review comments: rename metrics to repo_maintenance_*

- Rename metric constants from maintenance_job_* to repo_maintenance_*
- Update metric help text to clarify these are for repo maintenance
- Rename functions: RegisterMaintenanceJob* → RegisterRepoMaintenance*
- Update all test references to use the new names

Addresses review comments from @Lyndon-Li on PR #9414

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
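For orientation (not part of the commit), here is a minimal caller-side sketch of the renamed API. It assumes the metrics package import path github.com/vmware-tanzu/velero/pkg/metrics, and "example-repo" is a placeholder repository name, not a value used by the PR:

    package main

    import "github.com/vmware-tanzu/velero/pkg/metrics"

    func main() {
        m := metrics.NewServerMetrics()

        // One call per maintenance outcome; the repository name becomes the metric label.
        m.RegisterRepoMaintenanceSuccess("example-repo")
        m.RegisterRepoMaintenanceFailure("example-repo")
        m.ObserveRepoMaintenanceDuration("example-repo", 42.0)
    }

Each helper quietly no-ops if the expected collector is missing, mirroring the type-assertion guards visible in the diff below.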
@@ -498,7 +498,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
     // Record failure metric when job fails to start
     if r.metrics != nil {
-        r.metrics.RegisterMaintenanceJobFailure(req.Name)
+        r.metrics.RegisterRepoMaintenanceFailure(req.Name)
     }
 
     return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
@@ -518,10 +518,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
     // Record failure metric
     if r.metrics != nil {
-        r.metrics.RegisterMaintenanceJobFailure(req.Name)
+        r.metrics.RegisterRepoMaintenanceFailure(req.Name)
         if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
             duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
-            r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
+            r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
         }
     }
 
@@ -532,10 +532,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
     // Record success metric
     if r.metrics != nil {
-        r.metrics.RegisterMaintenanceJobSuccess(req.Name)
+        r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
         if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
             duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
-            r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
+            r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
         }
     }
 
@@ -1761,7 +1761,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
         })
     }
 }
 
-func TestMaintenanceJobMetricsRecording(t *testing.T) {
+func TestRepoMaintenanceMetricsRecording(t *testing.T) {
     now := time.Now().Round(time.Second)
 
     tests := []struct {
@@ -1862,8 +1862,8 @@ func TestMaintenanceJobMetricsRecording(t *testing.T) {
             _ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
 
             // Verify metrics were recorded
-            successCount := getMaintenanceMetricValue(t, m, "maintenance_job_success_total", test.repo.Name)
-            failureCount := getMaintenanceMetricValue(t, m, "maintenance_job_failure_total", test.repo.Name)
+            successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
+            failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
             durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
 
             if test.expectSuccess {
@@ -1922,7 +1922,7 @@ func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoNam
     t.Helper()
 
     metricMap := m.Metrics()
-    collector, ok := metricMap["maintenance_job_duration_seconds"]
+    collector, ok := metricMap["repo_maintenance_duration_seconds"]
     if !ok {
         return 0
     }
 
@@ -80,10 +80,10 @@ const (
     DataDownloadFailureTotal = "data_download_failure_total"
     DataDownloadCancelTotal  = "data_download_cancel_total"
 
-    // maintenance job metrics
-    maintenanceJobSuccessTotal    = "maintenance_job_success_total"
-    maintenanceJobFailureTotal    = "maintenance_job_failure_total"
-    maintenanceJobDurationSeconds = "maintenance_job_duration_seconds"
+    // repo maintenance metrics
+    repoMaintenanceSuccessTotal    = "repo_maintenance_success_total"
+    repoMaintenanceFailureTotal    = "repo_maintenance_failure_total"
+    repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
 
     // Labels
     nodeMetricLabel = "node"
@@ -344,27 +344,27 @@ func NewServerMetrics() *ServerMetrics {
             },
             []string{scheduleLabel, backupNameLabel},
         ),
-        maintenanceJobSuccessTotal: prometheus.NewCounterVec(
+        repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
             prometheus.CounterOpts{
                 Namespace: metricNamespace,
-                Name:      maintenanceJobSuccessTotal,
-                Help:      "Total number of successful maintenance jobs",
+                Name:      repoMaintenanceSuccessTotal,
+                Help:      "Total number of successful repo maintenance jobs",
             },
             []string{repositoryNameLabel},
         ),
-        maintenanceJobFailureTotal: prometheus.NewCounterVec(
+        repoMaintenanceFailureTotal: prometheus.NewCounterVec(
             prometheus.CounterOpts{
                 Namespace: metricNamespace,
-                Name:      maintenanceJobFailureTotal,
-                Help:      "Total number of failed maintenance jobs",
+                Name:      repoMaintenanceFailureTotal,
+                Help:      "Total number of failed repo maintenance jobs",
             },
             []string{repositoryNameLabel},
        ),
-        maintenanceJobDurationSeconds: prometheus.NewHistogramVec(
+        repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
             prometheus.HistogramOpts{
                 Namespace: metricNamespace,
-                Name:      maintenanceJobDurationSeconds,
-                Help:      "Time taken to complete maintenance jobs, in seconds",
+                Name:      repoMaintenanceDurationSeconds,
+                Help:      "Time taken to complete repo maintenance jobs, in seconds",
                 Buckets: []float64{
                     toSeconds(1 * time.Minute),
                     toSeconds(5 * time.Minute),
@@ -959,23 +959,23 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
     }
 }
 
-// RegisterMaintenanceJobSuccess records a successful maintenance job.
-func (m *ServerMetrics) RegisterMaintenanceJobSuccess(repositoryName string) {
-    if c, ok := m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec); ok {
+// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
+func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
+    if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
         c.WithLabelValues(repositoryName).Inc()
     }
 }
 
-// RegisterMaintenanceJobFailure records a failed maintenance job.
-func (m *ServerMetrics) RegisterMaintenanceJobFailure(repositoryName string) {
-    if c, ok := m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec); ok {
+// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
+func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
+    if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
         c.WithLabelValues(repositoryName).Inc()
     }
 }
 
-// ObserveMaintenanceJobDuration records the number of seconds a maintenance job took.
-func (m *ServerMetrics) ObserveMaintenanceJobDuration(repositoryName string, seconds float64) {
-    if h, ok := m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec); ok {
+// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
+func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
+    if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
         h.WithLabelValues(repositoryName).Observe(seconds)
     }
 }
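For readers updating dashboards or alerts, here is a standalone sketch (not Velero code) that mirrors the renamed counter definition above and prints the fully qualified series name a scrape would expose. The "velero" namespace and the "repository_name" label string are assumptions here, since metricNamespace and repositoryNameLabel are defined elsewhere in the package:

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
        // Mirror of the renamed counter, registered in a throwaway registry.
        c := prometheus.NewCounterVec(prometheus.CounterOpts{
            Namespace: "velero", // assumed value of metricNamespace
            Name:      "repo_maintenance_success_total",
            Help:      "Total number of successful repo maintenance jobs",
        }, []string{"repository_name"}) // assumed value of repositoryNameLabel

        reg := prometheus.NewRegistry()
        reg.MustRegister(c)
        c.WithLabelValues("example-repo").Inc() // placeholder repository name

        families, err := reg.Gather()
        if err != nil {
            panic(err)
        }
        for _, mf := range families {
            fmt.Println(mf.GetName()) // prints: velero_repo_maintenance_success_total
        }
    }

Any PromQL that references the old maintenance_job_* series would need the same rename.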
@@ -373,8 +373,8 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
     return 0
 }
 
-// TestMaintenanceJobMetrics verifies that maintenance job metrics are properly recorded.
-func TestMaintenanceJobMetrics(t *testing.T) {
+// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
+func TestRepoMaintenanceMetrics(t *testing.T) {
     tests := []struct {
         name           string
         repositoryName string
@@ -396,61 +396,61 @@ func TestMaintenanceJobMetrics(t *testing.T) {
         t.Run(tc.name, func(t *testing.T) {
             m := NewServerMetrics()
 
-            // Test maintenance job success metric
-            t.Run("RegisterMaintenanceJobSuccess", func(t *testing.T) {
-                m.RegisterMaintenanceJobSuccess(tc.repositoryName)
+            // Test repo maintenance success metric
+            t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
+                m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
 
-                metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
+                metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
                 assert.Equal(t, float64(1), metric, tc.description)
             })
 
-            // Test maintenance job failure metric
-            t.Run("RegisterMaintenanceJobFailure", func(t *testing.T) {
-                m.RegisterMaintenanceJobFailure(tc.repositoryName)
+            // Test repo maintenance failure metric
+            t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
+                m.RegisterRepoMaintenanceFailure(tc.repositoryName)
 
-                metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
+                metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
                 assert.Equal(t, float64(1), metric, tc.description)
             })
 
-            // Test maintenance job duration metric
-            t.Run("ObserveMaintenanceJobDuration", func(t *testing.T) {
-                m.ObserveMaintenanceJobDuration(tc.repositoryName, 300.5)
+            // Test repo maintenance duration metric
+            t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
+                m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
 
                 // For histogram, we check the count
-                metric := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
+                metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
                 assert.Equal(t, uint64(1), metric, tc.description)
             })
         })
     }
 }
 
-// TestMultipleMaintenanceJobsAccumulate verifies that multiple maintenance jobs
+// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
 // accumulate metrics under the same repository label.
-func TestMultipleMaintenanceJobsAccumulate(t *testing.T) {
+func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
     m := NewServerMetrics()
     repoName := "default-restic-test"
 
-    // Simulate multiple maintenance job executions
-    m.RegisterMaintenanceJobSuccess(repoName)
-    m.RegisterMaintenanceJobSuccess(repoName)
-    m.RegisterMaintenanceJobSuccess(repoName)
-    m.RegisterMaintenanceJobFailure(repoName)
-    m.RegisterMaintenanceJobFailure(repoName)
+    // Simulate multiple repo maintenance job executions
+    m.RegisterRepoMaintenanceSuccess(repoName)
+    m.RegisterRepoMaintenanceSuccess(repoName)
+    m.RegisterRepoMaintenanceSuccess(repoName)
+    m.RegisterRepoMaintenanceFailure(repoName)
+    m.RegisterRepoMaintenanceFailure(repoName)
 
     // Record multiple durations
-    m.ObserveMaintenanceJobDuration(repoName, 120.5)
-    m.ObserveMaintenanceJobDuration(repoName, 180.3)
-    m.ObserveMaintenanceJobDuration(repoName, 90.7)
+    m.ObserveRepoMaintenanceDuration(repoName, 120.5)
+    m.ObserveRepoMaintenanceDuration(repoName, 180.3)
+    m.ObserveRepoMaintenanceDuration(repoName, 90.7)
 
    // Verify accumulated metrics
-    successMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), repoName)
-    assert.Equal(t, float64(3), successMetric, "All maintenance job successes should be counted")
+    successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
+    assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
 
-    failureMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), repoName)
-    assert.Equal(t, float64(2), failureMetric, "All maintenance job failures should be counted")
+    failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
+    assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
 
-    durationCount := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), repoName)
-    assert.Equal(t, uint64(3), durationCount, "All maintenance job durations should be observed")
+    durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
+    assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
 }
 
 // Helper function to get metric value from a CounterVec with repository_name label