diff --git a/pkg/apis/velero/v1/restore.go b/pkg/apis/velero/v1/restore.go
index 3bad9329c..a046017a1 100644
--- a/pkg/apis/velero/v1/restore.go
+++ b/pkg/apis/velero/v1/restore.go
@@ -82,10 +82,14 @@ const (
 	// RestorePhaseInProgress means the restore is currently executing.
 	RestorePhaseInProgress RestorePhase = "InProgress"
 
-	// RestorePhaseCompleted means the restore has finished executing.
-	// Any relevant warnings or errors will be captured in the Status.
+	// RestorePhaseCompleted means the restore has run successfully
+	// without errors.
 	RestorePhaseCompleted RestorePhase = "Completed"
 
+	// RestorePhasePartiallyFailed means the restore has run to completion
+	// but encountered 1+ errors restoring individual items.
+	RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
+
 	// RestorePhaseFailed means the restore was unable to execute.
 	// The failing error is recorded in status.FailureReason.
 	RestorePhaseFailed RestorePhase = "Failed"
diff --git a/pkg/controller/restore_controller.go b/pkg/controller/restore_controller.go
index a0eb3a2c3..ef663f0e7 100644
--- a/pkg/controller/restore_controller.go
+++ b/pkg/controller/restore_controller.go
@@ -259,6 +259,10 @@ func (c *restoreController) processRestore(restore *api.Restore) error {
 		restore.Status.Phase = api.RestorePhaseFailed
 		restore.Status.FailureReason = err.Error()
 		c.metrics.RegisterRestoreFailed(backupScheduleName)
+	} else if restore.Status.Errors > 0 {
+		c.logger.Debug("Restore partially failed")
+		restore.Status.Phase = api.RestorePhasePartiallyFailed
+		c.metrics.RegisterRestorePartialFailure(backupScheduleName)
 	} else {
 		c.logger.Debug("Restore completed")
 		restore.Status.Phase = api.RestorePhaseCompleted
diff --git a/pkg/controller/restore_controller_test.go b/pkg/controller/restore_controller_test.go
index e7f052d69..01783251b 100644
--- a/pkg/controller/restore_controller_test.go
+++ b/pkg/controller/restore_controller_test.go
@@ -300,6 +300,7 @@ func TestProcessQueueItem(t *testing.T) {
 			restorerError: errors.New("blarg"),
 			expectedErr: false,
 			expectedPhase: string(api.RestorePhaseInProgress),
+			expectedFinalPhase: string(api.RestorePhasePartiallyFailed),
 			expectedRestoreErrors: 1,
 			expectedRestorerCall: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseInProgress).Restore,
 		},
@@ -595,7 +596,7 @@ func TestProcessQueueItem(t *testing.T) {
 			if test.expectedFinalPhase != "" {
 				expected = Patch{
 					Status: StatusPatch{
-						Phase: api.RestorePhaseCompleted,
+						Phase: api.RestorePhase(test.expectedFinalPhase),
 						Errors: test.expectedRestoreErrors,
 					},
 				}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index c3e6173b6..3b4565e8a 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -42,6 +42,7 @@ const (
 	restoreAttemptTotal = "restore_attempt_total"
 	restoreValidationFailedTotal = "restore_validation_failed_total"
 	restoreSuccessTotal = "restore_success_total"
+	restorePartialFailureTotal = "restore_partial_failure_total"
 	restoreFailedTotal = "restore_failed_total"
 	volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
 	volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
@@ -162,6 +163,14 @@ func NewServerMetrics() *ServerMetrics {
 				},
 				[]string{scheduleLabel},
 			),
+			restorePartialFailureTotal: prometheus.NewCounterVec(
+				prometheus.CounterOpts{
+					Namespace: metricNamespace,
+					Name: restorePartialFailureTotal,
+					Help: "Total number of partially failed restores",
+				},
+				[]string{scheduleLabel},
+			),
 			restoreFailedTotal: prometheus.NewCounterVec(
 				prometheus.CounterOpts{
 					Namespace: metricNamespace,
@@ -236,6 +245,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
 	if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(scheduleName).Set(0)
 	}
+	if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+		c.WithLabelValues(scheduleName).Set(0)
+	}
 	if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(scheduleName).Set(0)
 	}
@@ -346,6 +358,13 @@ func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
 	}
 }
 
+// RegisterRestorePartialFailure records a restore that partially failed.
+func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
+	if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+		c.WithLabelValues(backupSchedule).Inc()
+	}
+}
+
 // RegisterRestoreFailed records a restore that failed.
 func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
 	if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {