mirror of
https://github.com/vmware-tanzu/velero.git
synced 2026-01-06 21:36:30 +00:00
Merge pull request #9414 from shubham-pampattiwar/add-maintenance-job-metrics
Some checks failed
Run the E2E test on kind / get-go-version (push) Failing after 1m8s
Run the E2E test on kind / build (push) Has been skipped
Run the E2E test on kind / setup-test-matrix (push) Successful in 5s
Run the E2E test on kind / run-e2e-test (push) Has been skipped
Main CI / get-go-version (push) Successful in 14s
Main CI / Build (push) Failing after 37s
Close stale issues and PRs / stale (push) Successful in 15s
Trivy Nightly Scan / Trivy nightly scan (velero, main) (push) Failing after 1m43s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-aws, main) (push) Failing after 58s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-gcp, main) (push) Failing after 1m8s
Trivy Nightly Scan / Trivy nightly scan (velero-plugin-for-microsoft-azure, main) (push) Failing after 58s
Add Prometheus metrics for maintenance jobs
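The reconciler diff below calls three methods on metrics.ServerMetrics — RegisterRepoMaintenanceSuccess, RegisterRepoMaintenanceFailure, and ObserveRepoMaintenanceDuration — whose definitions in pkg/metrics are not part of this excerpt. The following is a minimal, self-contained sketch of collectors that would fit those calls, reusing only the metric and label names that appear in the test helpers at the bottom of this page (repo_maintenance_success_total, repo_maintenance_failure_total, repo_maintenance_duration_seconds, repository_name); the bucket choice and everything else are assumptions, not Velero's actual pkg/metrics code:

	// maintenance_metrics_sketch.go — hypothetical sketch, not the real pkg/metrics.
	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
		dto "github.com/prometheus/client_model/go"
	)

	const repoLabel = "repository_name" // label name taken from the tests below

	var (
		maintenanceSuccess = prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "repo_maintenance_success_total",
				Help: "Successful repository maintenance runs, per repository.",
			},
			[]string{repoLabel},
		)
		maintenanceFailure = prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "repo_maintenance_failure_total",
				Help: "Failed repository maintenance runs, per repository.",
			},
			[]string{repoLabel},
		)
		maintenanceDuration = prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Name:    "repo_maintenance_duration_seconds",
				Help:    "Duration of repository maintenance runs, in seconds.",
				Buckets: prometheus.DefBuckets, // assumption; actual buckets unknown
			},
			[]string{repoLabel},
		)
	)

	func main() {
		// Mirror what the reconciler does: count an outcome, observe a duration.
		maintenanceSuccess.WithLabelValues("default-repo").Inc()
		maintenanceDuration.WithLabelValues("default-repo").Observe(61.0)

		// Read the counter back the same way the test helpers below do.
		m := &dto.Metric{}
		if err := maintenanceSuccess.WithLabelValues("default-repo").Write(m); err == nil {
			fmt.Println("success count:", m.Counter.GetValue()) // prints 1
		}
	}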
@@ -42,6 +42,7 @@ import (
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	"github.com/vmware-tanzu/velero/pkg/constant"
 	"github.com/vmware-tanzu/velero/pkg/label"
+	"github.com/vmware-tanzu/velero/pkg/metrics"
 	repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config"
 	"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
 	repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
@@ -66,6 +67,7 @@ type BackupRepoReconciler struct {
 	repoMaintenanceConfig string
 	logLevel              logrus.Level
 	logFormat             *logging.FormatFlag
+	metrics               *metrics.ServerMetrics
 }
 
 func NewBackupRepoReconciler(
@@ -78,6 +80,7 @@ func NewBackupRepoReconciler(
 	repoMaintenanceConfig string,
 	logLevel logrus.Level,
 	logFormat *logging.FormatFlag,
+	metrics *metrics.ServerMetrics,
 ) *BackupRepoReconciler {
 	c := &BackupRepoReconciler{
 		client,
@@ -90,6 +93,7 @@ func NewBackupRepoReconciler(
 		repoMaintenanceConfig,
 		logLevel,
 		logFormat,
+		metrics,
 	}
 
 	return c
@@ -491,6 +495,12 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 	job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.logLevel, r.logFormat, log)
 	if err != nil {
 		log.WithError(err).Warn("Starting repo maintenance failed")
+
+		// Record failure metric when job fails to start
+		if r.metrics != nil {
+			r.metrics.RegisterRepoMaintenanceFailure(req.Name)
+		}
+
 		return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
 			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, &metav1.Time{Time: startTime}, nil, fmt.Sprintf("Failed to start maintenance job, err: %v", err))
 		})
@@ -505,11 +515,30 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 
 	if status.Result == velerov1api.BackupRepositoryMaintenanceFailed {
 		log.WithError(err).Warn("Pruning repository failed")
+
+		// Record failure metric
+		if r.metrics != nil {
+			r.metrics.RegisterRepoMaintenanceFailure(req.Name)
+			if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
+				duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
+				r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
+			}
+		}
+
 		return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
 			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp, status.CompleteTimestamp, status.Message)
 		})
 	}
 
+	// Record success metric
+	if r.metrics != nil {
+		r.metrics.RegisterRepoMaintenanceSuccess(req.Name)
+		if status.StartTimestamp != nil && status.CompleteTimestamp != nil {
+			duration := status.CompleteTimestamp.Sub(status.StartTimestamp.Time).Seconds()
+			r.metrics.ObserveRepoMaintenanceDuration(req.Name, duration)
+		}
+	}
+
 	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
 		rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time}
 		updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp, status.CompleteTimestamp, status.Message)
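The new counters only matter once they are scraped. As a usage note, here is a minimal, hypothetical sketch of exposing a ServerMetrics instance over HTTP; the Metrics() accessor returning a map of prometheus.Collector is inferred from the test helpers below, while the registry, handler, and port wiring are assumptions rather than Velero's actual server setup:

	// expose_sketch.go — hypothetical wiring, not Velero's real server code.
	package main

	import (
		"log"
		"net/http"

		"github.com/prometheus/client_golang/prometheus"
		"github.com/prometheus/client_golang/prometheus/promhttp"

		"github.com/vmware-tanzu/velero/pkg/metrics"
	)

	func main() {
		// The same instance would be handed to NewBackupRepoReconciler above.
		serverMetrics := metrics.NewServerMetrics()

		// Register every collector on a registry, then serve it; Metrics()
		// returning map[string]prometheus.Collector mirrors the test helpers below.
		reg := prometheus.NewRegistry()
		for _, c := range serverMetrics.Metrics() {
			reg.MustRegister(c)
		}

		http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
		log.Fatal(http.ListenAndServe(":8085", nil)) // port is an assumption
	}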
@@ -19,6 +19,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
@@ -32,6 +34,7 @@ import (
 
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	"github.com/vmware-tanzu/velero/pkg/builder"
+	"github.com/vmware-tanzu/velero/pkg/metrics"
 	"github.com/vmware-tanzu/velero/pkg/repository"
 	"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
 	repomaintenance "github.com/vmware-tanzu/velero/pkg/repository/maintenance"
@@ -65,6 +68,7 @@ func mockBackupRepoReconciler(t *testing.T, mockOn string, arg any, ret ...any)
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 }
 
@@ -584,6 +588,7 @@ func TestGetRepositoryMaintenanceFrequency(t *testing.T) {
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	freq := reconciler.getRepositoryMaintenanceFrequency(test.repo)
@@ -716,6 +721,7 @@ func TestNeedInvalidBackupRepo(t *testing.T) {
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	need := reconciler.needInvalidBackupRepo(test.oldBSL, test.newBSL)
@@ -1581,6 +1587,7 @@ func TestDeleteOldMaintenanceJobWithConfigMap(t *testing.T) {
 		repoMaintenanceConfigName,
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: test.repo.Namespace, Name: "repo"}})
@@ -1638,6 +1645,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1689,6 +1697,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1739,6 +1748,7 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
 		"",
 		logrus.InfoLevel,
 		nil,
+		nil,
 	)
 
 	err := reconciler.initializeRepo(t.Context(), rr, location, reconciler.logger)
@@ -1750,3 +1760,189 @@ func TestInitializeRepoWithRepositoryTypes(t *testing.T) {
 		assert.Equal(t, velerov1api.BackupRepositoryPhaseReady, rr.Status.Phase)
 	})
 }
+
+func TestRepoMaintenanceMetricsRecording(t *testing.T) {
+	now := time.Now().Round(time.Second)
+
+	tests := []struct {
+		name           string
+		repo           *velerov1api.BackupRepository
+		startJobFunc   func(client.Client, context.Context, *velerov1api.BackupRepository, string, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error)
+		waitJobFunc    func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error)
+		expectSuccess  bool
+		expectFailure  bool
+		expectDuration bool
+	}{
+		{
+			name: "metrics recorded on successful maintenance",
+			repo: &velerov1api.BackupRepository{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: velerov1api.DefaultNamespace,
+					Name:      "test-repo-success",
+				},
+				Spec: velerov1api.BackupRepositorySpec{
+					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
+				},
+				Status: velerov1api.BackupRepositoryStatus{
+					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
+				},
+			},
+			startJobFunc:   startMaintenanceJobSucceed,
+			waitJobFunc:    waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceSucceeded, ""),
+			expectSuccess:  true,
+			expectFailure:  false,
+			expectDuration: true,
+		},
+		{
+			name: "metrics recorded on failed maintenance",
+			repo: &velerov1api.BackupRepository{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: velerov1api.DefaultNamespace,
+					Name:      "test-repo-failure",
+				},
+				Spec: velerov1api.BackupRepositorySpec{
+					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
+				},
+				Status: velerov1api.BackupRepositoryStatus{
+					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
+				},
+			},
+			startJobFunc: startMaintenanceJobSucceed,
+			waitJobFunc: func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
+				return velerov1api.BackupRepositoryMaintenanceStatus{
+					StartTimestamp:    &metav1.Time{Time: now},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Minute)}, // Job ran for 1 minute then failed
+					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
+					Message:           "test error",
+				}, nil
+			},
+			expectSuccess:  false,
+			expectFailure:  true,
+			expectDuration: true,
+		},
+		{
+			name: "metrics recorded on job start failure",
+			repo: &velerov1api.BackupRepository{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: velerov1api.DefaultNamespace,
+					Name:      "test-repo-start-fail",
+				},
+				Spec: velerov1api.BackupRepositorySpec{
+					MaintenanceFrequency: metav1.Duration{Duration: time.Hour},
+				},
+				Status: velerov1api.BackupRepositoryStatus{
+					LastMaintenanceTime: &metav1.Time{Time: now.Add(-2 * time.Hour)},
+				},
+			},
+			startJobFunc:   startMaintenanceJobFail,
+			expectSuccess:  false,
+			expectFailure:  true,
+			expectDuration: false, // No duration when job fails to start
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Create metrics instance
+			m := metrics.NewServerMetrics()
+
+			// Create reconciler with metrics
+			reconciler := mockBackupRepoReconciler(t, "", test.repo, nil)
+			reconciler.metrics = m
+			reconciler.clock = &fakeClock{now}
+
+			err := reconciler.Client.Create(t.Context(), test.repo)
+			require.NoError(t, err)
+
+			// Set up job functions
+			funcStartMaintenanceJob = test.startJobFunc
+			funcWaitMaintenanceJobComplete = test.waitJobFunc
+
+			// Run maintenance
+			_ = reconciler.runMaintenanceIfDue(t.Context(), test.repo, velerotest.NewLogger())
+
+			// Verify metrics were recorded
+			successCount := getMaintenanceMetricValue(t, m, "repo_maintenance_success_total", test.repo.Name)
+			failureCount := getMaintenanceMetricValue(t, m, "repo_maintenance_failure_total", test.repo.Name)
+			durationCount := getMaintenanceDurationCount(t, m, test.repo.Name)
+
+			if test.expectSuccess {
+				assert.Equal(t, float64(1), successCount, "Success metric should be recorded")
+			} else {
+				assert.Equal(t, float64(0), successCount, "Success metric should not be recorded")
+			}
+
+			if test.expectFailure {
+				assert.Equal(t, float64(1), failureCount, "Failure metric should be recorded")
+			} else {
+				assert.Equal(t, float64(0), failureCount, "Failure metric should not be recorded")
+			}
+
+			if test.expectDuration {
+				assert.Equal(t, uint64(1), durationCount, "Duration metric should be recorded")
+			} else {
+				assert.Equal(t, uint64(0), durationCount, "Duration metric should not be recorded")
+			}
+		})
+	}
+}
+
+// Helper to get maintenance metric value from ServerMetrics
+func getMaintenanceMetricValue(t *testing.T, m *metrics.ServerMetrics, metricName, repoName string) float64 {
+	t.Helper()
+
+	metricMap := m.Metrics()
+	collector, ok := metricMap[metricName]
+	if !ok {
+		return 0
+	}
+
+	ch := make(chan prometheus.Metric, 1)
+	collector.Collect(ch)
+	close(ch)
+
+	for metric := range ch {
+		dto := &dto.Metric{}
+		err := metric.Write(dto)
+		require.NoError(t, err)
+
+		for _, label := range dto.Label {
+			if *label.Name == "repository_name" && *label.Value == repoName {
+				if dto.Counter != nil {
+					return *dto.Counter.Value
+				}
+			}
+		}
+	}
+	return 0
+}
+
+// Helper to get maintenance duration histogram count
+func getMaintenanceDurationCount(t *testing.T, m *metrics.ServerMetrics, repoName string) uint64 {
+	t.Helper()
+
+	metricMap := m.Metrics()
+	collector, ok := metricMap["repo_maintenance_duration_seconds"]
+	if !ok {
+		return 0
+	}
+
+	ch := make(chan prometheus.Metric, 1)
+	collector.Collect(ch)
+	close(ch)
+
+	for metric := range ch {
+		dto := &dto.Metric{}
+		err := metric.Write(dto)
+		require.NoError(t, err)
+
+		for _, label := range dto.Label {
+			if *label.Name == "repository_name" && *label.Value == repoName {
+				if dto.Histogram != nil {
+					return *dto.Histogram.SampleCount
+				}
+			}
+		}
+	}
+	return 0
+}