Mirror of https://github.com/vmware-tanzu/velero.git (synced 2026-01-03 11:45:20 +00:00)
Merge pull request #9414 from shubham-pampattiwar/add-maintenance-job-metrics
Some checks failed
Run the E2E test on kind / get-go-version (push) Failing after 1m8s
Run the E2E test on kind / build (push) Has been skipped
Run the E2E test on kind / setup-test-matrix (push) Successful in 5s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / get-go-version (push) Successful in 14s
Main CI / Build (push) Failing after 37s
Close stale issues and PRs / stale (push) Successful in 15s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m43s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 58s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 1m8s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 58s
Add Prometheus metrics for maintenance jobs
changelogs/unreleased/9414-shubham-pampattiwar (new file, 1 line)

@@ -0,0 +1 @@
Add Prometheus metrics for maintenance jobs
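
The change introduces three server metrics, each labeled by repository name: velero_repo_maintenance_success_total, velero_repo_maintenance_failure_total, and velero_repo_maintenance_duration_seconds. As a rough, standalone sketch of what these series look like when exposed (not Velero's actual wiring, which is what the diff below adds), the same client_golang primitives can be registered and served like this; the registry, port, and sample repository name are assumptions for illustration only.

package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Counters and a histogram shaped like the ones this PR adds,
	// all keyed by a repository_name label.
	success := prometheus.NewCounterVec(
		prometheus.CounterOpts{Namespace: "velero", Name: "repo_maintenance_success_total",
			Help: "Total number of successful repo maintenance jobs"},
		[]string{"repository_name"},
	)
	failure := prometheus.NewCounterVec(
		prometheus.CounterOpts{Namespace: "velero", Name: "repo_maintenance_failure_total",
			Help: "Total number of failed repo maintenance jobs"},
		[]string{"repository_name"},
	)
	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{Namespace: "velero", Name: "repo_maintenance_duration_seconds",
			Help:    "Time taken to complete repo maintenance jobs, in seconds",
			Buckets: []float64{60, 300, 600, 900, 1800, 3600, 7200, 10800, 14400}},
		[]string{"repository_name"},
	)

	reg := prometheus.NewRegistry()
	reg.MustRegister(success, failure, duration)

	// Hypothetical sample values, just to show the exposition format.
	success.WithLabelValues("example-repo").Inc()
	duration.WithLabelValues("example-repo").Observe((3 * time.Minute).Seconds())

	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":2112", nil))
}

Scraping such an endpoint yields lines like velero_repo_maintenance_success_total{repository_name="example-repo"} 1, which is the shape operators should expect from Velero's own /metrics endpoint once this change is running.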
@@ -758,6 +758,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
			s.config.RepoMaintenanceJobConfig,
			s.logLevel,
			s.config.LogFormat,
			s.metrics,
		).SetupWithManager(s.mgr); err != nil {
			s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupRepo)
		}
@@ -42,6 +42,7 @@ import (
	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	"github.com/vmware-tanzu/velero/pkg/constant"
	"github.com/vmware-tanzu/velero/pkg/label"
	"github.com/vmware-tanzu/velero/pkg/metrics"
	repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config"
	"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
	repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"

@@ -66,6 +67,7 @@ type BackupRepoReconciler struct {
	repoMaintenanceConfig string
	logLevel              logrus.Level
	logFormat             *logging.FormatFlag
	metrics               *metrics.ServerMetrics
}

func NewBackupRepoReconciler(

@@ -78,6 +80,7 @@ func NewBackupRepoReconciler(
	repoMaintenanceConfig string,
	logLevel logrus.Level,
	logFormat *logging.FormatFlag,
	metrics *metrics.ServerMetrics,
) *BackupRepoReconciler {
	c := &BackupRepoReconciler{
		client,

@@ -90,6 +93,7 @@ func NewBackupRepoReconciler(
		repoMaintenanceConfig,
		logLevel,
		logFormat,
		metrics,
	}

	return c

@@ -491,6 +495,12 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
	job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.logLevel, r.logFormat, log)
	if err != nil {
		log.WithError(err).Warn("Starting repo maintenance failed")

		// Record failure metric when job fails to start
		if r.metrics != nil {
			r.metrics.RegisterRepoMaintenanceFailure(req.Name)
		}

		return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, &metav1.Time{Time: startTime}, nil, fmt.Sprintf("Failed to start maintenance job, err: %v", err))
		})

@@ -505,11 +515,30 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel

	if status.Result == velerov1api.BackupRepositoryMaintenanceFailed {
		log.WithError(err).Warn("Pruning repository failed")

		// Record failure metric
		if r.metrics != nil {
			r.metrics.RegisterRepoMaintenanceFailure(req.Name)
			if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
				duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
				r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
			}
		}

		return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp, status.CompleteTimestamp, status.Message)
		})
	}

	// Record success metric
	if r.metrics != nil {
		r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
		if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
			duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
			r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
		}
	}

	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
		rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time}
		updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp, status.CompleteTimestamp, status.Message)
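
Two details of the controller change are worth calling out: every recording call is guarded by r.metrics != nil, and a duration is only observed when both the start and complete timestamps are set, so a job that never starts produces a failure count but no duration sample. The following is a minimal, self-contained sketch of that guard pattern, assuming a hypothetical recorder interface; the names are illustrative and not Velero's.

package main

import (
	"fmt"
	"time"
)

// recorder is a hypothetical stand-in for the metrics sink; the interface and
// names are illustrative assumptions, not Velero's types.
type recorder interface {
	Success(repo string)
	Failure(repo string)
	Duration(repo string, seconds float64)
}

// recordOutcome mirrors the guard pattern in the hunks above: skip recording
// entirely when no metrics sink is configured, and only observe a duration
// when both timestamps are present.
func recordOutcome(m recorder, repo string, succeeded bool, start, complete *time.Time) {
	if m == nil {
		return // metrics are optional; nothing to record
	}
	if succeeded {
		m.Success(repo)
	} else {
		m.Failure(repo)
	}
	if start != nil && complete != nil {
		m.Duration(repo, complete.Sub(*start).Seconds())
	}
}

// printRecorder just prints what would be recorded.
type printRecorder struct{}

func (printRecorder) Success(repo string)                   { fmt.Println("success:", repo) }
func (printRecorder) Failure(repo string)                   { fmt.Println("failure:", repo) }
func (printRecorder) Duration(repo string, seconds float64) { fmt.Println("duration:", repo, seconds) }

func main() {
	start := time.Now().Add(-90 * time.Second)
	complete := time.Now()
	recordOutcome(printRecorder{}, "example-repo", true, &start, &complete)
}

Keeping the nil guard at the call sites is what lets a reconciler be constructed without metrics at all, which is exactly how the updated test constructors below pass nil for the new parameter.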
@@ -19,6 +19,8 @@ import (
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"

@@ -32,6 +34,7 @@ import (

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	"github.com/vmware-tanzu/velero/pkg/builder"
	"github.com/vmware-tanzu/velero/pkg/metrics"
	"github.com/vmware-tanzu/velero/pkg/repository"
	"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
	repomaintenance "github.com/vmware-tanzu/velero/pkg/repository/maintenance"

@@ -65,6 +68,7 @@ func mockBackupRepoReconciler(t *testing.T, mockOn string, arg any, ret ...any)
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)
}

@@ -584,6 +588,7 @@ func TestGetRepositoryMaintenanceFrequency(t *testing.T) {
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)

	freq := reconciler.getRepositoryMaintenanceFrequency(test.repo)

@@ -716,6 +721,7 @@ func TestNeedInvalidBackupRepo(t *testing.T) {
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)

	need := reconciler.needInvalidBackupRepo(test.oldBSL, test.newBSL)

@@ -1581,6 +1587,7 @@ func TestDeleteOldMaintenanceJobWithConfigMap(t *testing.T) {
		repoMaintenanceConfigName,
		logrus.InfoLevel,
		nil,
		nil,
	)

	_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: test.repo.Namespace, Name: "repo"}})

@@ -1638,6 +1645,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)

	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)

@@ -1689,6 +1697,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)

	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)

@@ -1739,6 +1748,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
		"",
		logrus.InfoLevel,
		nil,
		nil,
	)

	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)

@@ -1750,3 +1760,189 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
		assert.Equal(t, velerov1api.BackupRepositoryPhaseReady, rr.Status.Phase)
	})
}

func TestRepoMaintenanceMetricsRecording(t *testing.T) {
	now := time.Now().Round(time.Second)

	tests := []struct {
		name           string
		repo           *velerov1api.BackupRepository
		startJobFunc   func(client.Client, context.Context, *velerov1api.BackupRepository, string, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error)
		waitJobFunc    func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error)
		expectSuccess  bool
		expectFailure  bool
		expectDuration bool
	}{
		{
			name: "metrics recorded on successful maintenance",
			repo: &velerov1api.BackupRepository{
				ObjectMeta: metav1.ObjectMeta{
					Namespace: velerov1api.DefaultNamespace,
					Name:      "test-repo-success",
				},
				Spec: velerov1api.BackupRepositorySpec{
					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
				},
				Status: velerov1api.BackupRepositoryStatus{
					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
				},
			},
			startJobFunc:   startMaintenanceJobSucceed,
			waitJobFunc:    waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceSucceeded, ""),
			expectSuccess:  true,
			expectFailure:  false,
			expectDuration: true,
		},
		{
			name: "metrics recorded on failed maintenance",
			repo: &velerov1api.BackupRepository{
				ObjectMeta: metav1.ObjectMeta{
					Namespace: velerov1api.DefaultNamespace,
					Name:      "test-repo-failure",
				},
				Spec: velerov1api.BackupRepositorySpec{
					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
				},
				Status: velerov1api.BackupRepositoryStatus{
					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
				},
			},
			startJobFunc: startMaintenanceJobSucceed,
			waitJobFunc: func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
				return velerov1api.BackupRepositoryMaintenanceStatus{
					StartTimestamp:    &metav1.Time{Time: now},
					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Minute)}, // Job ran for 1 minute then failed
					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
					Message:           "test error",
				}, nil
			},
			expectSuccess:  false,
			expectFailure:  true,
			expectDuration: true,
		},
		{
			name: "metrics recorded on job start failure",
			repo: &velerov1api.BackupRepository{
				ObjectMeta: metav1.ObjectMeta{
					Namespace: velerov1api.DefaultNamespace,
					Name:      "test-repo-start-fail",
				},
				Spec: velerov1api.BackupRepositorySpec{
					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
				},
				Status: velerov1api.BackupRepositoryStatus{
					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
				},
			},
			startJobFunc:   startMaintenanceJobFail,
			expectSuccess:  false,
			expectFailure:  true,
			expectDuration: false, // No duration when job fails to start
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Create metrics instance
			m := metrics.NewServerMetrics()

			// Create reconciler with metrics
			reconciler := mockBackupRepoReconciler(t, "", test.repo, nil)
			reconciler.metrics = m
			reconciler.clock = &fakeClock{now}

			err := reconciler.Client.Create(t.Context(), test.repo)
			require.NoError(t, err)

			// Set up job functions
			funcStartMaintenanceJob = test.startJobFunc
			funcWaitMaintenanceJobComplete = test.waitJobFunc

			// Run maintenance
			_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())

			// Verify metrics were recorded
			successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
			failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
			durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)

			if test.expectSuccess {
				assert.Equal(t, float64(1), successCount, "Success metric should be recorded")
			} else {
				assert.Equal(t, float64(0), successCount, "Success metric should not be recorded")
			}

			if test.expectFailure {
				assert.Equal(t, float64(1), failureCount, "Failure metric should be recorded")
			} else {
				assert.Equal(t, float64(0), failureCount, "Failure metric should not be recorded")
			}

			if test.expectDuration {
				assert.Equal(t, uint64(1), durationCount, "Duration metric should be recorded")
			} else {
				assert.Equal(t, uint64(0), durationCount, "Duration metric should not be recorded")
			}
		})
	}
}

// Helper to get maintenance metric value from ServerMetrics
func getMaintenanceMetricValue(t *testing.T, m *metrics.ServerMetrics, metricName, repoName string) float64 {
	t.Helper()

	metricMap := m.Metrics()
	collector, ok := metricMap[metricName]
	if !ok {
		return 0
	}

	ch := make(chan prometheus.Metric, 1)
	collector.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repoName {
				if dto.Counter != nil {
					return *dto.Counter.Value
				}
			}
		}
	}
	return 0
}

// Helper to get maintenance duration histogram count
func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoName string) uint64 {
	t.Helper()

	metricMap := m.Metrics()
	collector, ok := metricMap["repo_maintenance_duration_seconds"]
	if !ok {
		return 0
	}

	ch := make(chan prometheus.Metric, 1)
	collector.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repoName {
				if dto.Histogram != nil {
					return *dto.Histogram.SampleCount
				}
			}
		}
	}
	return 0
}
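
The test helpers above read counter and histogram values by collecting into a one-element buffered channel and decoding the resulting dto.Metric, which works because each test creates at most one labeled series per collector. An alternative for counters, sketched below as an assumption rather than part of the PR, is the upstream prometheus/testutil package, which avoids the manual channel plumbing; the package name and test are illustrative.

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestCounterWithTestutil shows how a labeled counter's value can be read with
// prometheus/testutil instead of collecting dto.Metric by hand. The metric and
// label names match the ones added in this PR; the test itself is illustrative.
func TestCounterWithTestutil(t *testing.T) {
	success := prometheus.NewCounterVec(
		prometheus.CounterOpts{Namespace: "velero", Name: "repo_maintenance_success_total",
			Help: "Total number of successful repo maintenance jobs"},
		[]string{"repository_name"},
	)
	success.WithLabelValues("repo-a").Inc()
	success.WithLabelValues("repo-a").Inc()

	// ToFloat64 reads the single sample exposed by this labeled child.
	if got := testutil.ToFloat64(success.WithLabelValues("repo-a")); got != 2 {
		t.Fatalf("expected 2, got %v", got)
	}
	// CollectAndCount reports how many series the collector currently exposes.
	if n := testutil.CollectAndCount(success); n != 1 {
		t.Fatalf("expected 1 series, got %d", n)
	}
}

testutil.ToFloat64 only handles collectors exposing a single counter, gauge, or untyped sample, so reading the histogram's sample count still needs a dto-based helper like the one above.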
@@ -27,6 +27,11 @@ type ServerMetrics struct {
	metrics map[string]prometheus.Collector
}

// Metrics returns the metrics map for testing purposes.
func (m *ServerMetrics) Metrics() map[string]prometheus.Collector {
	return m.metrics
}

const (
	metricNamespace           = "velero"
	podVolumeMetricsNamespace = "podVolume"

@@ -75,6 +80,14 @@ const (
	DataDownloadFailureTotal = "data_download_failure_total"
	DataDownloadCancelTotal  = "data_download_cancel_total"

	// repo maintenance metrics
	repoMaintenanceSuccessTotal = "repo_maintenance_success_total"
	repoMaintenanceFailureTotal = "repo_maintenance_failure_total"
	// repoMaintenanceDurationSeconds tracks the distribution of maintenance job durations.
	// Each completed job's duration is recorded in the appropriate bucket, allowing
	// analysis of individual job performance and trending over time.
	repoMaintenanceDurationSeconds = "repo_maintenance_duration_seconds"

	// Labels
	nodeMetricLabel         = "node"
	podVolumeOperationLabel = "operation"

@@ -82,6 +95,7 @@ const (
	pvbNameLabel        = "pod_volume_backup"
	scheduleLabel       = "schedule"
	backupNameLabel     = "backupName"
	repositoryNameLabel = "repository_name"

	// metrics values
	BackupLastStatusSucc int64 = 1

@@ -333,6 +347,41 @@ func NewServerMetrics() *ServerMetrics {
			},
			[]string{scheduleLabel, backupNameLabel},
		),
		repoMaintenanceSuccessTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: metricNamespace,
				Name:      repoMaintenanceSuccessTotal,
				Help:      "Total number of successful repo maintenance jobs",
			},
			[]string{repositoryNameLabel},
		),
		repoMaintenanceFailureTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: metricNamespace,
				Name:      repoMaintenanceFailureTotal,
				Help:      "Total number of failed repo maintenance jobs",
			},
			[]string{repositoryNameLabel},
		),
		repoMaintenanceDurationSeconds: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: metricNamespace,
				Name:      repoMaintenanceDurationSeconds,
				Help:      "Time taken to complete repo maintenance jobs, in seconds",
				Buckets: []float64{
					toSeconds(1 * time.Minute),
					toSeconds(5 * time.Minute),
					toSeconds(10 * time.Minute),
					toSeconds(15 * time.Minute),
					toSeconds(30 * time.Minute),
					toSeconds(1 * time.Hour),
					toSeconds(2 * time.Hour),
					toSeconds(3 * time.Hour),
					toSeconds(4 * time.Hour),
				},
			},
			[]string{repositoryNameLabel},
		),
	},
	}
}

@@ -912,3 +961,24 @@ func (m *ServerMetrics) RegisterBackupLocationUnavailable(backupLocationName str
		g.WithLabelValues(backupLocationName).Set(float64(0))
	}
}

// RegisterRepoMaintenanceSuccess records a successful repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceSuccess(repositoryName string) {
	if c, ok := m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec); ok {
		c.WithLabelValues(repositoryName).Inc()
	}
}

// RegisterRepoMaintenanceFailure records a failed repo maintenance job.
func (m *ServerMetrics) RegisterRepoMaintenanceFailure(repositoryName string) {
	if c, ok := m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec); ok {
		c.WithLabelValues(repositoryName).Inc()
	}
}

// ObserveRepoMaintenanceDuration records the number of seconds a repo maintenance job took.
func (m *ServerMetrics) ObserveRepoMaintenanceDuration(repositoryName string, seconds float64) {
	if h, ok := m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec); ok {
		h.WithLabelValues(repositoryName).Observe(seconds)
	}
}
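
The three exported helpers deliberately no-op when the collector is missing from the map or has an unexpected type, so callers only need the nil check on the ServerMetrics pointer itself. The histogram buckets are the listed durations converted to seconds; the sketch below assumes toSeconds is a plain Duration-to-seconds conversion (its body is not shown in this diff) and prints the resulting upper bounds.

package main

import (
	"fmt"
	"time"
)

// toSeconds here is assumed to behave like Velero's helper of the same name:
// a plain conversion of a time.Duration to float64 seconds.
func toSeconds(d time.Duration) float64 {
	return d.Seconds()
}

func main() {
	buckets := []time.Duration{
		1 * time.Minute, 5 * time.Minute, 10 * time.Minute, 15 * time.Minute,
		30 * time.Minute, 1 * time.Hour, 2 * time.Hour, 3 * time.Hour, 4 * time.Hour,
	}
	for _, d := range buckets {
		// Prints 60, 300, 600, 900, 1800, 3600, 7200, 10800 and 14400:
		// the upper bounds of velero_repo_maintenance_duration_seconds.
		fmt.Printf("%v -> %v seconds\n", d, toSeconds(d))
	}
}

With these bounds, a typical maintenance run of a few minutes lands in the low buckets, while anything beyond four hours only shows up in the implicit +Inf bucket.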
@@ -372,3 +372,148 @@ func getHistogramCount(t *testing.T, vec *prometheus.HistogramVec, scheduleLabel
	t.Fatalf("Histogram with schedule label '%s' not found", scheduleLabel)
	return 0
}

// TestRepoMaintenanceMetrics verifies that repo maintenance metrics are properly recorded.
func TestRepoMaintenanceMetrics(t *testing.T) {
	tests := []struct {
		name           string
		repositoryName string
		description    string
	}{
		{
			name:           "maintenance job metrics for repository",
			repositoryName: "default-restic-abcd",
			description:    "Metrics should be recorded with the repository name label",
		},
		{
			name:           "maintenance job metrics for different repository",
			repositoryName: "velero-backup-repo-xyz",
			description:    "Metrics should be recorded with different repository name",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			m := NewServerMetrics()

			// Test repo maintenance success metric
			t.Run("RegisterRepoMaintenanceSuccess", func(t *testing.T) {
				m.RegisterRepoMaintenanceSuccess(tc.repositoryName)

				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), tc.repositoryName)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test repo maintenance failure metric
			t.Run("RegisterRepoMaintenanceFailure", func(t *testing.T) {
				m.RegisterRepoMaintenanceFailure(tc.repositoryName)

				metric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), tc.repositoryName)
				assert.Equal(t, float64(1), metric, tc.description)
			})

			// Test repo maintenance duration metric
			t.Run("ObserveRepoMaintenanceDuration", func(t *testing.T) {
				m.ObserveRepoMaintenanceDuration(tc.repositoryName, 300.5)

				// For histogram, we check the count
				metric := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), tc.repositoryName)
				assert.Equal(t, uint64(1), metric, tc.description)
			})
		})
	}
}

// TestMultipleRepoMaintenanceJobsAccumulate verifies that multiple repo maintenance jobs
// accumulate metrics under the same repository label.
func TestMultipleRepoMaintenanceJobsAccumulate(t *testing.T) {
	m := NewServerMetrics()
	repoName := "default-restic-test"

	// Simulate multiple repo maintenance job executions
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceSuccess(repoName)
	m.RegisterRepoMaintenanceFailure(repoName)
	m.RegisterRepoMaintenanceFailure(repoName)

	// Record multiple durations
	m.ObserveRepoMaintenanceDuration(repoName, 120.5)
	m.ObserveRepoMaintenanceDuration(repoName, 180.3)
	m.ObserveRepoMaintenanceDuration(repoName, 90.7)

	// Verify accumulated metrics
	successMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceSuccessTotal].(*prometheus.CounterVec), repoName)
	assert.Equal(t, float64(3), successMetric, "All repo maintenance successes should be counted")

	failureMetric := getMaintenanceMetricValue(t, m.metrics[repoMaintenanceFailureTotal].(*prometheus.CounterVec), repoName)
	assert.Equal(t, float64(2), failureMetric, "All repo maintenance failures should be counted")

	durationCount := getMaintenanceHistogramCount(t, m.metrics[repoMaintenanceDurationSeconds].(*prometheus.HistogramVec), repoName)
	assert.Equal(t, uint64(3), durationCount, "All repo maintenance durations should be observed")
}

// Helper function to get metric value from a CounterVec with repository_name label
func getMaintenanceMetricValue(t *testing.T, vec prometheus.Collector, repositoryName string) float64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected repository_name label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repositoryName {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel {
			if dto.Counter != nil {
				return *dto.Counter.Value
			}
			if dto.Gauge != nil {
				return *dto.Gauge.Value
			}
		}
	}

	t.Fatalf("Metric with repository_name label '%s' not found", repositoryName)
	return 0
}

// Helper function to get histogram count with repository_name label
func getMaintenanceHistogramCount(t *testing.T, vec *prometheus.HistogramVec, repositoryName string) uint64 {
	t.Helper()
	ch := make(chan prometheus.Metric, 1)
	vec.Collect(ch)
	close(ch)

	for metric := range ch {
		dto := &dto.Metric{}
		err := metric.Write(dto)
		require.NoError(t, err)

		// Check if this metric has the expected repository_name label
		hasCorrectLabel := false
		for _, label := range dto.Label {
			if *label.Name == "repository_name" && *label.Value == repositoryName {
				hasCorrectLabel = true
				break
			}
		}

		if hasCorrectLabel && dto.Histogram != nil {
			return *dto.Histogram.SampleCount
		}
	}

	t.Fatalf("Histogram with repository_name label '%s' not found", repositoryName)
	return 0
}
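
As with the controller tests, these helpers drain each collector through a channel with a buffer of one, which is sufficient here because every test registers a single repository label per vec. When a collector may expose several labeled children, collecting from a goroutine avoids blocking the Collect call; a minimal illustrative helper (an assumption, not part of the PR) could look like this:

package example

import "github.com/prometheus/client_golang/prometheus"

// collectAll drains every metric a collector currently exposes without
// assuming how many labeled children it has. Illustrative helper only;
// the test helpers above use a one-element buffered channel instead, which
// works because each test creates a single labeled series per vec.
func collectAll(c prometheus.Collector) []prometheus.Metric {
	ch := make(chan prometheus.Metric)
	go func() {
		c.Collect(ch)
		close(ch)
	}()
	var out []prometheus.Metric
	for m := range ch {
		out = append(out, m)
	}
	return out
}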