Mirror of https://github.com/vmware-tanzu/velero.git (synced 2026-01-07 13:55:20 +00:00)
Address review comments: rename metrics to repo_maintenance_*
- Rename metric constants from maintenance_job_* to repo_maintenance_*
- Update metric help text to clarify these are for repo maintenance
- Rename functions: RegisterMaintenanceJob* → RegisterRepoMaintenance*
- Update all test references to use new names

Addresses review comments from @Lyndon-Li on PR #9414

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
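Taken together, the rename looks like this from a caller's point of view. A minimal sketch, assuming the metrics package lives at github.com/vmware-tanzu/velero/pkg/metrics (the import path itself is not shown in this diff); the constructor and the three method names all appear in the hunks below:

package main

import (
	"github.com/vmware-tanzu/velero/pkg/metrics" // assumed import path; not part of this diff
)

func main() {
	m := metrics.NewServerMetrics()

	// Record outcomes for one backup repository, using the renamed methods.
	m.RegisterRepoMaintenanceSuccess("default-restic-test")
	m.RegisterRepoMaintenanceFailure("default-restic-test")

	// Record how long a maintenance run took, in seconds.
	m.ObserveRepoMaintenanceDuration("default-restic-test", 120.5)
}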
@@ -498,7 +498,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
 	// Record failure metric when job fails to start
 	if r.metrics != nil {
-		r.metrics.RegisterMaintenanceJobFailure(req.Name)
+		r.metrics.RegisterRepoMaintenanceFailure(req.Name)
 	}
 
 	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
@@ -518,10 +518,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
 		// Record failure metric
 		if r.metrics != nil {
-			r.metrics.RegisterMaintenanceJobFailure(req.Name)
+			r.metrics.RegisterRepoMaintenanceFailure(req.Name)
 			if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
 				duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
-				r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
+				r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
 			}
 		}
 
@@ -532,10 +532,10 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
 		// Record success metric
 		if r.metrics != nil {
-			r.metrics.RegisterMaintenanceJobSuccess(req.Name)
+			r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
 			if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
 				duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
-				r.metrics.ObserveMaintenanceJobDuration(req.Name, duration)
+				r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
 			}
 		}
 
@@ -1761,7 +1761,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
 	})
 }
 
-func TestMaintenanceJobMetricsRecording(t *testing.T) {
+func TestRepoMaintenanceMetricsRecording(t *testing.T) {
 	now := time.Now().Round(time.Second)
 
 	tests := []struct {
@@ -1862,8 +1862,8 @@ func TestMaintenanceJobMetricsRecording(t *testing.T) {
 			_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
 
 			// Verify metrics were recorded
-			successCount := getMaintenanceMetricValue(t, m, "maintenance_job_success_total", test.repo.Name)
-			failureCount := getMaintenanceMetricValue(t, m, "maintenance_job_failure_total", test.repo.Name)
+			successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
+			failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
 			durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
 
 			if test.expectSuccess {
@@ -1922,7 +1922,7 @@ func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoNam
 	t.Helper()
 
 	metricMap := m.Metrics()
-	collector, ok := metricMap["maintenance_job_duration_seconds"]
+	collector, ok := metricMap["repo_maintenance_duration_seconds"]
 	if !ok {
 		return 0
 	}
@@ -80,10 +80,10 @@ const (
 	DataDownloadFailureTotal = "data_download_failure_total"
 	DataDownloadCancelTotal  = "data_download_cancel_total"
 
-	// maintenance job metrics
-	maintenanceJobSuccessTotal    = "maintenance_job_success_total"
-	maintenanceJobFailureTotal    = "maintenance_job_failure_total"
-	maintenanceJobDurationSeconds = "maintenance_job_duration_seconds"
+	// repo maintenance metrics
+	repoMaintenanceSuccessTotal    = "repo_maintenance_success_total"
+	repoMaintenanceFailureTotal    = "repo_maintenance_failure_total"
+	repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"
 
 	// Labels
 	nodeMetricLabel = "node"
@@ -344,27 +344,27 @@ func NewServerMetrics() *ServerMetrics {
 			},
 			[]string{scheduleLabel, backupNameLabel},
 		),
-		maintenanceJobSuccessTotal: prometheus.NewCounterVec(
+		repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
 				Namespace: metricNamespace,
-				Name:      maintenanceJobSuccessTotal,
-				Help:      "Total number of successful maintenance jobs",
+				Name:      repoMaintenanceSuccessTotal,
+				Help:      "Total number of successful repo maintenance jobs",
 			},
 			[]string{repositoryNameLabel},
 		),
-		maintenanceJobFailureTotal: prometheus.NewCounterVec(
+		repoMaintenanceFailureTotal: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
 				Namespace: metricNamespace,
-				Name:      maintenanceJobFailureTotal,
-				Help:      "Total number of failed maintenance jobs",
+				Name:      repoMaintenanceFailureTotal,
+				Help:      "Total number of failed repo maintenance jobs",
 			},
 			[]string{repositoryNameLabel},
 		),
-		maintenanceJobDurationSeconds: prometheus.NewHistogramVec(
+		repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
 			prometheus.HistogramOpts{
 				Namespace: metricNamespace,
-				Name:      maintenanceJobDurationSeconds,
-				Help:      "Time taken to complete maintenance jobs, in seconds",
+				Name:      repoMaintenanceDurationSeconds,
+				Help:      "Time taken to complete repo maintenance jobs, in seconds",
 				Buckets: []float64{
 					toSeconds(1 * time.Minute),
 					toSeconds(5 * time.Minute),
@@ -959,23 +959,23 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
 	}
 }
 
-// RegisterMaintenanceJobSuccess records a successful maintenance job.
-func (m *ServerMetrics) RegisterMaintenanceJobSuccess(repositoryName string) {
-	if c, ok := m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec); ok {
+// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
+func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
+	if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(repositoryName).Inc()
 	}
 }
 
-// RegisterMaintenanceJobFailure records a failed maintenance job.
-func (m *ServerMetrics) RegisterMaintenanceJobFailure(repositoryName string) {
-	if c, ok := m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec); ok {
+// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
+func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
+	if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(repositoryName).Inc()
 	}
 }
 
-// ObserveMaintenanceJobDuration records the number of seconds a maintenance job took.
-func (m *ServerMetrics) ObserveMaintenanceJobDuration(repositoryName string, seconds float64) {
-	if h, ok := m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec); ok {
+// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
+func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
+	if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
 		h.WithLabelValues(repositoryName).Observe(seconds)
 	}
 }
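As a rough illustration of what the renamed series look like on the metrics endpoint, the sketch below assumes Velero's usual velero metric namespace and a repository_name label string; only the constant names metricNamespace and repositoryNameLabel appear in this diff, not their values:

package main

import "fmt"

func main() {
	const metricNamespace = "velero" // assumption: the value of metricNamespace is not shown in this diff
	for _, name := range []string{
		"repo_maintenance_success_total",
		"repo_maintenance_failure_total",
		"repo_maintenance_duration_seconds",
	} {
		// Prometheus joins the CounterOpts/HistogramOpts Namespace and Name with an underscore.
		// The histogram additionally expands into _bucket, _sum, and _count series.
		fmt.Printf("%s_%s{repository_name=%q}\n", metricNamespace, name, "example-backup-repo")
	}
}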
@@ -373,8 +373,8 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
 	return 0
 }
 
-// TestMaintenanceJobMetrics verifies that maintenance job metrics are properly recorded.
-func TestMaintenanceJobMetrics(t *testing.T) {
+// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
+func TestRepoMaintenanceMetrics(t *testing.T) {
 	tests := []struct {
 		name           string
 		repositoryName string
@@ -396,61 +396,61 @@ func TestMaintenanceJobMetrics(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			m := NewServerMetrics()
 
-			// Test maintenance job success metric
-			t.Run("RegisterMaintenanceJobSuccess", func(t *testing.T) {
-				m.RegisterMaintenanceJobSuccess(tc.repositoryName)
+			// Test repo maintenance success metric
+			t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
+				m.RegisterRepoMaintenanceSuccess(tc.repositoryName)
 
-				metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
+				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
 				assert.Equal(t, float64(1), metric, tc.description)
 			})
 
-			// Test maintenance job failure metric
-			t.Run("RegisterMaintenanceJobFailure", func(t *testing.T) {
-				m.RegisterMaintenanceJobFailure(tc.repositoryName)
+			// Test repo maintenance failure metric
+			t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
+				m.RegisterRepoMaintenanceFailure(tc.repositoryName)
 
-				metric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
+				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
 				assert.Equal(t, float64(1), metric, tc.description)
 			})
 
-			// Test maintenance job duration metric
-			t.Run("ObserveMaintenanceJobDuration", func(t *testing.T) {
-				m.ObserveMaintenanceJobDuration(tc.repositoryName, 300.5)
+			// Test repo maintenance duration metric
+			t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
+				m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)
 
 				// For histogram, we check the count
-				metric := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
+				metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
 				assert.Equal(t, uint64(1), metric, tc.description)
 			})
 		})
 	}
 }
 
-// TestMultipleMaintenanceJobsAccumulate verifies that multiple maintenance jobs
+// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
 // accumulate metrics under the same repository label.
-func TestMultipleMaintenanceJobsAccumulate(t *testing.T) {
+func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
 	m := NewServerMetrics()
 	repoName := "default-restic-test"
 
-	// Simulate multiple maintenance job executions
-	m.RegisterMaintenanceJobSuccess(repoName)
-	m.RegisterMaintenanceJobSuccess(repoName)
-	m.RegisterMaintenanceJobSuccess(repoName)
-	m.RegisterMaintenanceJobFailure(repoName)
-	m.RegisterMaintenanceJobFailure(repoName)
+	// Simulate multiple repo maintenance job executions
+	m.RegisterRepoMaintenanceSuccess(repoName)
+	m.RegisterRepoMaintenanceSuccess(repoName)
+	m.RegisterRepoMaintenanceSuccess(repoName)
+	m.RegisterRepoMaintenanceFailure(repoName)
+	m.RegisterRepoMaintenanceFailure(repoName)
 
 	// Record multiple durations
-	m.ObserveMaintenanceJobDuration(repoName, 120.5)
-	m.ObserveMaintenanceJobDuration(repoName, 180.3)
-	m.ObserveMaintenanceJobDuration(repoName, 90.7)
+	m.ObserveRepoMaintenanceDuration(repoName, 120.5)
+	m.ObserveRepoMaintenanceDuration(repoName, 180.3)
+	m.ObserveRepoMaintenanceDuration(repoName, 90.7)
 
 	// Verify accumulated metrics
-	successMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobSuccessTotal].(*prometheus.CounterVec), repoName)
-	assert.Equal(t, float64(3), successMetric, "All maintenance job successes should be counted")
+	successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
+	assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")
 
-	failureMetric := getMaintenanceMetricValue(t, m.metrics[maintenanceJobFailureTotal].(*prometheus.CounterVec), repoName)
-	assert.Equal(t, float64(2), failureMetric, "All maintenance job failures should be counted")
+	failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
+	assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")
 
-	durationCount := getMaintenanceHistogramCount(t, m.metrics[maintenanceJobDurationSeconds].(*prometheus.HistogramVec), repoName)
-	assert.Equal(t, uint64(3), durationCount, "All maintenance job durations should be observed")
+	durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
+	assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
 }
 
 // Helper function to get metric value from a CounterVec with repository_name label
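The test hunks above call helpers such as getMaintenanceMetricValue and getMaintenanceHistogramCount whose bodies are not part of this diff. A hypothetical sketch of how such helpers could be written with client_golang, for readers following along; the names and approach here are illustrative, not the PR's actual implementation:

package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	dto "github.com/prometheus/client_model/go"
)

// getCounterValue reads the current value of one labeled child of a CounterVec.
func getCounterValue(t *testing.T, vec *prometheus.CounterVec, repoName string) float64 {
	t.Helper()
	// WithLabelValues returns a single child collector, which is what testutil.ToFloat64 expects.
	return testutil.ToFloat64(vec.WithLabelValues(repoName))
}

// getHistogramSampleCount reads the observation count of one labeled child of a HistogramVec.
func getHistogramSampleCount(t *testing.T, vec *prometheus.HistogramVec, repoName string) uint64 {
	t.Helper()
	obs, err := vec.GetMetricWithLabelValues(repoName)
	if err != nil {
		t.Fatalf("getting histogram child: %v", err)
	}
	// The child returned for a HistogramVec also implements prometheus.Metric,
	// so its state can be dumped into a dto.Metric to read the sample count.
	pb := &dto.Metric{}
	if err := obs.(prometheus.Metric).Write(pb); err != nil {
		t.Fatalf("writing metric: %v", err)
	}
	return pb.GetHistogram().GetSampleCount()
}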