diff --git a/changelogs/unreleased/1280-fabito b/changelogs/unreleased/1280-fabito new file mode 100644 index 000000000..5b381339e --- /dev/null +++ b/changelogs/unreleased/1280-fabito @@ -0,0 +1 @@ +Collect 3 new metrics: backup_deletion_{attempt|failure|success}_total \ No newline at end of file diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index 72706e1a8..ca27bfba1 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -614,6 +614,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string s.sharedInformerFactory.Velero().V1().BackupStorageLocations(), s.sharedInformerFactory.Velero().V1().VolumeSnapshotLocations(), newPluginManager, + s.metrics, ) wg.Add(1) go func() { diff --git a/pkg/controller/backup_deletion_controller.go b/pkg/controller/backup_deletion_controller.go index a55532d37..329c19eff 100644 --- a/pkg/controller/backup_deletion_controller.go +++ b/pkg/controller/backup_deletion_controller.go @@ -37,6 +37,7 @@ import ( velerov1client "github.com/heptio/velero/pkg/generated/clientset/versioned/typed/velero/v1" informers "github.com/heptio/velero/pkg/generated/informers/externalversions/velero/v1" listers "github.com/heptio/velero/pkg/generated/listers/velero/v1" + "github.com/heptio/velero/pkg/metrics" "github.com/heptio/velero/pkg/persistence" "github.com/heptio/velero/pkg/plugin/clientmgmt" "github.com/heptio/velero/pkg/plugin/velero" @@ -63,6 +64,7 @@ type backupDeletionController struct { clock clock.Clock newPluginManager func(logrus.FieldLogger) clientmgmt.Manager newBackupStore func(*v1.BackupStorageLocation, persistence.ObjectStoreGetter, logrus.FieldLogger) (persistence.BackupStore, error) + metrics *metrics.ServerMetrics } // NewBackupDeletionController creates a new backup deletion controller. 
@@ -79,6 +81,7 @@ func NewBackupDeletionController( backupLocationInformer informers.BackupStorageLocationInformer, snapshotLocationInformer informers.VolumeSnapshotLocationInformer, newPluginManager func(logrus.FieldLogger) clientmgmt.Manager, + metrics *metrics.ServerMetrics, ) Interface { c := &backupDeletionController{ genericController: newGenericController("backup-deletion", logger), @@ -92,7 +95,7 @@ func NewBackupDeletionController( podvolumeBackupLister: podvolumeBackupInformer.Lister(), backupLocationLister: backupLocationInformer.Lister(), snapshotLocationLister: snapshotLocationInformer.Lister(), - + metrics: metrics, // use variables to refer to these functions so they can be // replaced with fakes for testing. newPluginManager: newPluginManager, @@ -234,6 +237,9 @@ func (c *backupDeletionController) processRequest(req *v1.DeleteBackupRequest) e return err } + backupScheduleName := backup.GetLabels()[v1.ScheduleNameLabel] + c.metrics.RegisterBackupDeletionAttempt(backupScheduleName) + var errs []string pluginManager := c.newPluginManager(log) @@ -339,6 +345,12 @@ func (c *backupDeletionController) processRequest(req *v1.DeleteBackupRequest) e } } + if len(errs) == 0 { + c.metrics.RegisterBackupDeletionSuccess(backupScheduleName) + } else { + c.metrics.RegisterBackupDeletionFailed(backupScheduleName) + } + // Update status to processed and record errors req, err = c.patchDeleteBackupRequest(req, func(r *v1.DeleteBackupRequest) { r.Status.Phase = v1.DeleteBackupRequestPhaseProcessed diff --git a/pkg/controller/backup_deletion_controller_test.go b/pkg/controller/backup_deletion_controller_test.go index 308de9c45..1b77b44bd 100644 --- a/pkg/controller/backup_deletion_controller_test.go +++ b/pkg/controller/backup_deletion_controller_test.go @@ -36,6 +36,7 @@ import ( pkgbackup "github.com/heptio/velero/pkg/backup" "github.com/heptio/velero/pkg/generated/clientset/versioned/fake" informers "github.com/heptio/velero/pkg/generated/informers/externalversions" 
+ "github.com/heptio/velero/pkg/metrics" "github.com/heptio/velero/pkg/persistence" persistencemocks "github.com/heptio/velero/pkg/persistence/mocks" "github.com/heptio/velero/pkg/plugin/clientmgmt" @@ -61,6 +62,7 @@ func TestBackupDeletionControllerProcessQueueItem(t *testing.T) { sharedInformers.Velero().V1().BackupStorageLocations(), sharedInformers.Velero().V1().VolumeSnapshotLocations(), nil, // new plugin manager func + metrics.NewServerMetrics(), ).(*backupDeletionController) // Error splitting key @@ -147,6 +149,7 @@ func setupBackupDeletionControllerTest(objects ...runtime.Object) *backupDeletio sharedInformers.Velero().V1().BackupStorageLocations(), sharedInformers.Velero().V1().VolumeSnapshotLocations(), func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }, + metrics.NewServerMetrics(), ).(*backupDeletionController), req: req, @@ -723,6 +726,7 @@ func TestBackupDeletionControllerDeleteExpiredRequests(t *testing.T) { sharedInformers.Velero().V1().BackupStorageLocations(), sharedInformers.Velero().V1().VolumeSnapshotLocations(), nil, // new plugin manager func + metrics.NewServerMetrics(), ).(*backupDeletionController) fakeClock := &clock.FakeClock{} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 77be33af1..502d89fc1 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -34,6 +34,9 @@ const ( backupSuccessTotal = "backup_success_total" backupFailureTotal = "backup_failure_total" backupDurationSeconds = "backup_duration_seconds" + backupDeletionAttemptTotal = "backup_deletion_attempt_total" + backupDeletionSuccessTotal = "backup_deletion_success_total" + backupDeletionFailureTotal = "backup_deletion_failure_total" restoreAttemptTotal = "restore_attempt_total" restoreValidationFailedTotal = "restore_validation_failed_total" restoreSuccessTotal = "restore_success_total" @@ -105,6 +108,30 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel}, ), + backupDeletionAttemptTotal: 
prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: backupDeletionAttemptTotal, + Help: "Total number of attempted backup deletions", + }, + []string{scheduleLabel}, + ), + backupDeletionSuccessTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: backupDeletionSuccessTotal, + Help: "Total number of successful backup deletions", + }, + []string{scheduleLabel}, + ), + backupDeletionFailureTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: backupDeletionFailureTotal, + Help: "Total number of failed backup deletions", + }, + []string{scheduleLabel}, + ), backupDurationSeconds: prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metricNamespace, @@ -314,6 +341,15 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) { if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Set(0) } + if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } + if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } + if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Set(0) } @@ -442,6 +478,27 @@ func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds fl // ------------------------------------------------------------------- } +// RegisterBackupDeletionAttempt adds one, under the given schedule label, to the number of attempted backup deletions +func (m *ServerMetrics) RegisterBackupDeletionAttempt(backupSchedule string) { + if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterBackupDeletionFailed adds one, under the given schedule label, to
the number of failed backup deletions +func (m *ServerMetrics) RegisterBackupDeletionFailed(backupSchedule string) { + if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterBackupDeletionSuccess adds one, under the given schedule label, to the number of successful backup deletions +func (m *ServerMetrics) RegisterBackupDeletionSuccess(backupSchedule string) { + if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + // toSeconds translates a time.Duration value into a float64 // representing the number of seconds in that duration. func toSeconds(d time.Duration) float64 {