Merge pull request #985 from shubheksha/fix/977-add-prom-metrics-volume-snapshots

Add Prometheus metrics for attempted, successful, and failed volume snapshots per backup
This commit is contained in:
Steve Kriss
2018-10-30 13:25:00 -06:00
committed by GitHub
2 changed files with 78 additions and 18 deletions

View File

@@ -470,6 +470,9 @@ func recordBackupMetrics(backup *api.Backup, backupFile *os.File, serverMetrics
backupDuration := backup.Status.CompletionTimestamp.Time.Sub(backup.Status.StartTimestamp.Time)
backupDurationSeconds := float64(backupDuration / time.Second)
serverMetrics.RegisterBackupDuration(backupScheduleName, backupDurationSeconds)
serverMetrics.RegisterVolumeSnapshotAttempts(backupScheduleName, backup.Status.VolumeSnapshotsAttempted)
serverMetrics.RegisterVolumeSnapshotSuccesses(backupScheduleName, backup.Status.VolumeSnapshotsCompleted)
serverMetrics.RegisterVolumeSnapshotFailures(backupScheduleName, backup.Status.VolumeSnapshotsAttempted-backup.Status.VolumeSnapshotsCompleted)
return err
}

View File

@@ -28,17 +28,19 @@ type ServerMetrics struct {
}
const (
metricNamespace = "ark"
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
// TODO: Rename the Count variables to match their strings
backupAttemptCount = "backup_attempt_total"
backupSuccessCount = "backup_success_total"
backupFailureCount = "backup_failure_total"
metricNamespace = "ark"
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
backupAttemptTotal = "backup_attempt_total"
backupSuccessTotal = "backup_success_total"
backupFailureTotal = "backup_failure_total"
backupDurationSeconds = "backup_duration_seconds"
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
restoreSuccessTotal = "restore_success_total"
restoreFailedTotal = "restore_failed_total"
volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
volumeSnapshotFailureTotal = "volume_snapshot_failure_total"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
@@ -58,26 +60,26 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
backupAttemptCount: prometheus.NewCounterVec(
backupAttemptTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupAttemptCount,
Name: backupAttemptTotal,
Help: "Total number of attempted backups",
},
[]string{scheduleLabel},
),
backupSuccessCount: prometheus.NewCounterVec(
backupSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupSuccessCount,
Name: backupSuccessTotal,
Help: "Total number of successful backups",
},
[]string{scheduleLabel},
),
backupFailureCount: prometheus.NewCounterVec(
backupFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupFailureCount,
Name: backupFailureTotal,
Help: "Total number of failed backups",
},
[]string{scheduleLabel},
@@ -133,6 +135,30 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
volumeSnapshotAttemptTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: volumeSnapshotAttemptTotal,
Help: "Total number of attempted volume snapshots",
},
[]string{scheduleLabel},
),
volumeSnapshotSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: volumeSnapshotSuccessTotal,
Help: "Total number of successful volume snapshots",
},
[]string{scheduleLabel},
),
volumeSnapshotFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: volumeSnapshotFailureTotal,
Help: "Total number of failed volume snapshots",
},
[]string{scheduleLabel},
),
},
}
}
@@ -144,14 +170,15 @@ func (m *ServerMetrics) RegisterAllMetrics() {
}
}
// InitSchedule initializes counter metrics of a schedule.
func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupAttemptCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupSuccessCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
@@ -166,6 +193,15 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
}
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
@@ -177,21 +213,21 @@ func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, si
// RegisterBackupAttempt records a backup attempt by incrementing the
// backup-attempt counter for the given schedule label. It does nothing when
// the looked-up metric is absent or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterBackupAttempt(backupSchedule string) {
if c, ok := m.metrics[backupAttemptCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterBackupSuccess records a successful completion of a backup by
// incrementing the backup-success counter for the given schedule label. It
// does nothing when the looked-up metric is absent or is not a
// *prometheus.CounterVec.
func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
if c, ok := m.metrics[backupSuccessCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterBackupFailed records a failed backup by incrementing the
// backup-failure counter for the given schedule label. It does nothing when
// the looked-up metric is absent or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok {
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
@@ -236,3 +272,24 @@ func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterVolumeSnapshotAttempts adds volumeSnapshotsAttempted to the
// attempted-volume-snapshot counter for the given backup schedule label.
//
// A negative count is ignored: Prometheus counters may only increase, and
// Counter.Add panics when handed a negative value. A count of zero is still
// applied so the labelled series gets created. The call is a no-op when the
// metric is absent or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterVolumeSnapshotAttempts(backupSchedule string, volumeSnapshotsAttempted int) {
	if volumeSnapshotsAttempted < 0 {
		return
	}
	if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
		c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsAttempted))
	}
}
// RegisterVolumeSnapshotSuccesses adds volumeSnapshotsCompleted to the
// successful-volume-snapshot counter for the given backup schedule label.
//
// A negative count is ignored: Prometheus counters may only increase, and
// Counter.Add panics when handed a negative value. A count of zero is still
// applied so the labelled series gets created. The call is a no-op when the
// metric is absent or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterVolumeSnapshotSuccesses(backupSchedule string, volumeSnapshotsCompleted int) {
	if volumeSnapshotsCompleted < 0 {
		return
	}
	if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
		c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsCompleted))
	}
}
// RegisterVolumeSnapshotFailures adds volumeSnapshotsFailed to the
// failed-volume-snapshot counter for the given backup schedule label.
//
// A negative count is ignored: Prometheus counters may only increase, and
// Counter.Add panics when handed a negative value. This matters when a caller
// derives the failure count by subtraction (attempted minus completed) and
// the two inputs are inconsistent. A count of zero is still applied so the
// labelled series gets created. The call is a no-op when the metric is absent
// or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterVolumeSnapshotFailures(backupSchedule string, volumeSnapshotsFailed int) {
	if volumeSnapshotsFailed < 0 {
		return
	}
	if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
		c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsFailed))
	}
}