diff --git a/pkg/apis/velero/v1/restore.go b/pkg/apis/velero/v1/restore.go
index 3bad9329c..a046017a1 100644
--- a/pkg/apis/velero/v1/restore.go
+++ b/pkg/apis/velero/v1/restore.go
@@ -82,10 +82,14 @@ const (
 	// RestorePhaseInProgress means the restore is currently executing.
 	RestorePhaseInProgress RestorePhase = "InProgress"
 
-	// RestorePhaseCompleted means the restore has finished executing.
-	// Any relevant warnings or errors will be captured in the Status.
+	// RestorePhaseCompleted means the restore has run successfully
+	// without errors.
 	RestorePhaseCompleted RestorePhase = "Completed"
 
+	// RestorePhasePartiallyFailed means the restore has run to completion
+	// but encountered 1+ errors restoring individual items.
+	RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
+
 	// RestorePhaseFailed means the restore was unable to execute.
 	// The failing error is recorded in status.FailureReason.
 	RestorePhaseFailed RestorePhase = "Failed"
diff --git a/pkg/cmd/cli/restore/logs.go b/pkg/cmd/cli/restore/logs.go
index ae40440cc..2b6f6a14e 100644
--- a/pkg/cmd/cli/restore/logs.go
+++ b/pkg/cmd/cli/restore/logs.go
@@ -50,7 +50,10 @@ func NewLogsCommand(f client.Factory) *cobra.Command {
 				cmd.Exit("Error checking for restore %q: %v", restoreName, err)
 			}
 
-			if restore.Status.Phase != v1.RestorePhaseCompleted && restore.Status.Phase != v1.RestorePhaseFailed {
+			switch restore.Status.Phase {
+			case v1.RestorePhaseCompleted, v1.RestorePhaseFailed, v1.RestorePhasePartiallyFailed:
+				// terminal phases, don't exit.
+			default:
 				cmd.Exit("Logs for restore %q are not available until it's finished processing. Please wait "+
 					"until the restore has a phase of Completed or Failed and try again.", restoreName)
 			}
diff --git a/pkg/cmd/util/output/restore_describer.go b/pkg/cmd/util/output/restore_describer.go
index cbeed3fe7..dada2aca4 100644
--- a/pkg/cmd/util/output/restore_describer.go
+++ b/pkg/cmd/util/output/restore_describer.go
@@ -19,6 +19,7 @@ package output
 import (
 	"bytes"
 	"encoding/json"
+	"fmt"
 	"sort"
 	"strings"
 
@@ -35,7 +36,17 @@ func DescribeRestore(restore *v1.Restore, podVolumeRestores []v1.PodVolumeRestor
 		d.DescribeMetadata(restore.ObjectMeta)
 		d.Println()
 
-		d.Printf("Phase:\t%s\n", restore.Status.Phase)
+		phase := restore.Status.Phase
+		if phase == "" {
+			phase = v1.RestorePhaseNew
+		}
+
+		resultsNote := ""
+		if phase == v1.RestorePhaseFailed || phase == v1.RestorePhasePartiallyFailed {
+			resultsNote = fmt.Sprintf(" (run 'velero restore logs %s' for more information)", restore.Name)
+		}
+
+		d.Printf("Phase:\t%s%s\n", phase, resultsNote)
 
 		if len(restore.Status.ValidationErrors) > 0 {
 			d.Println()
diff --git a/pkg/controller/restore_controller.go b/pkg/controller/restore_controller.go
index a0eb3a2c3..ef663f0e7 100644
--- a/pkg/controller/restore_controller.go
+++ b/pkg/controller/restore_controller.go
@@ -259,6 +259,10 @@ func (c *restoreController) processRestore(restore *api.Restore) error {
 		restore.Status.Phase = api.RestorePhaseFailed
 		restore.Status.FailureReason = err.Error()
 		c.metrics.RegisterRestoreFailed(backupScheduleName)
+	} else if restore.Status.Errors > 0 {
+		c.logger.Debug("Restore partially failed")
+		restore.Status.Phase = api.RestorePhasePartiallyFailed
+		c.metrics.RegisterRestorePartialFailure(backupScheduleName)
 	} else {
 		c.logger.Debug("Restore completed")
 		restore.Status.Phase = api.RestorePhaseCompleted
diff --git a/pkg/controller/restore_controller_test.go b/pkg/controller/restore_controller_test.go
index e7f052d69..01783251b 100644
--- a/pkg/controller/restore_controller_test.go
+++ b/pkg/controller/restore_controller_test.go
@@ -300,6 +300,7 @@ func TestProcessQueueItem(t *testing.T) {
 			restorerError:         errors.New("blarg"),
 			expectedErr:           false,
 			expectedPhase:         string(api.RestorePhaseInProgress),
+			expectedFinalPhase:    string(api.RestorePhasePartiallyFailed),
 			expectedRestoreErrors: 1,
 			expectedRestorerCall:  NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseInProgress).Restore,
 		},
@@ -595,7 +596,7 @@ func TestProcessQueueItem(t *testing.T) {
 			if test.expectedFinalPhase != "" {
 				expected = Patch{
 					Status: StatusPatch{
-						Phase:  api.RestorePhaseCompleted,
+						Phase:  api.RestorePhase(test.expectedFinalPhase),
 						Errors: test.expectedRestoreErrors,
 					},
 				}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index c3e6173b6..3b4565e8a 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -42,6 +42,7 @@ const (
 	restoreAttemptTotal          = "restore_attempt_total"
 	restoreValidationFailedTotal = "restore_validation_failed_total"
 	restoreSuccessTotal          = "restore_success_total"
+	restorePartialFailureTotal   = "restore_partial_failure_total"
 	restoreFailedTotal           = "restore_failed_total"
 	volumeSnapshotAttemptTotal   = "volume_snapshot_attempt_total"
 	volumeSnapshotSuccessTotal   = "volume_snapshot_success_total"
@@ -162,6 +163,14 @@ func NewServerMetrics() *ServerMetrics {
 				},
 				[]string{scheduleLabel},
 			),
+			restorePartialFailureTotal: prometheus.NewCounterVec(
+				prometheus.CounterOpts{
+					Namespace: metricNamespace,
+					Name:      restorePartialFailureTotal,
+					Help:      "Total number of partially failed restores",
+				},
+				[]string{scheduleLabel},
+			),
 			restoreFailedTotal: prometheus.NewCounterVec(
 				prometheus.CounterOpts{
 					Namespace: metricNamespace,
@@ -236,6 +245,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
 	if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(scheduleName).Set(0)
 	}
+	if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+		c.WithLabelValues(scheduleName).Set(0)
+	}
 	if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(scheduleName).Set(0)
 	}
@@ -346,6 +358,13 @@ func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
 	}
 }
 
+// RegisterRestorePartialFailure records a restore that partially failed.
+func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
+	if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+		c.WithLabelValues(backupSchedule).Inc()
+	}
+}
+
// RegisterRestoreFailed records a restore that failed.
 func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
 	if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
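
Not part of the patch above: a minimal, self-contained Go sketch of how a caller might treat the new PartiallyFailed phase as terminal alongside Completed and Failed, mirroring the switch added in pkg/cmd/cli/restore/logs.go. The RestorePhase type and constants are redeclared locally for illustration rather than imported from the velero API package, and isTerminal is a hypothetical helper, not part of this change.

package main

import "fmt"

// RestorePhase mirrors the string-typed phase defined in pkg/apis/velero/v1/restore.go.
type RestorePhase string

const (
	RestorePhaseNew             RestorePhase = "New"
	RestorePhaseInProgress      RestorePhase = "InProgress"
	RestorePhaseCompleted       RestorePhase = "Completed"
	RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
	RestorePhaseFailed          RestorePhase = "Failed"
)

// isTerminal reports whether a restore has finished processing; these are the
// same phases the updated `velero restore logs` command accepts.
func isTerminal(phase RestorePhase) bool {
	switch phase {
	case RestorePhaseCompleted, RestorePhaseFailed, RestorePhasePartiallyFailed:
		return true
	default:
		return false
	}
}

func main() {
	for _, p := range []RestorePhase{RestorePhaseInProgress, RestorePhasePartiallyFailed} {
		fmt.Printf("%s terminal: %v\n", p, isTerminal(p))
	}
}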