From 3a3527553a6c8015c456378b87125fc80570684c Mon Sep 17 00:00:00 2001 From: allenxu404 Date: Thu, 26 Oct 2023 16:52:00 +0800 Subject: [PATCH] Fix inconsistent behavior of Backup and Restore hook execution Signed-off-by: allenxu404 --- changelogs/unreleased/7022-allenxu404 | 1 + internal/hook/wait_exec_hook_handler.go | 19 +++++++++++-------- internal/hook/wait_exec_hook_handler_test.go | 14 +++++++------- pkg/apis/velero/v1/backup_types.go | 8 ++++---- pkg/restore/restore.go | 15 ++++++++------- 5 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 changelogs/unreleased/7022-allenxu404 diff --git a/changelogs/unreleased/7022-allenxu404 b/changelogs/unreleased/7022-allenxu404 new file mode 100644 index 000000000..139a5e842 --- /dev/null +++ b/changelogs/unreleased/7022-allenxu404 @@ -0,0 +1 @@ +Fix inconsistent behavior of Backup and Restore hook execution \ No newline at end of file diff --git a/internal/hook/wait_exec_hook_handler.go b/internal/hook/wait_exec_hook_handler.go index aa6b9b188..04ad967a7 100644 --- a/internal/hook/wait_exec_hook_handler.go +++ b/internal/hook/wait_exec_hook_handler.go @@ -50,6 +50,11 @@ type DefaultListWatchFactory struct { PodsGetter cache.Getter } +type HookErrInfo struct { + Namespace string + Err error +} + func (d *DefaultListWatchFactory) NewListWatch(namespace string, selector fields.Selector) cache.ListerWatcher { return cache.NewListWatchFromClient(d.PodsGetter, "pods", namespace, selector) } @@ -158,8 +163,8 @@ func (e *DefaultWaitExecHookHandler) HandleHooks( if hook.Hook.WaitTimeout.Duration != 0 && time.Since(waitStart) > hook.Hook.WaitTimeout.Duration { err := fmt.Errorf("hook %s in container %s expired before executing", hook.HookName, hook.Hook.Container) hookLog.Error(err) + errors = append(errors, err) if hook.Hook.OnError == velerov1api.HookErrorModeFail { - errors = append(errors, err) cancel() return } @@ -172,8 +177,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks( } if err := e.PodCommandExecutor.ExecutePodCommand(hookLog, podMap, pod.Namespace, pod.Name, hook.HookName, eh); err != nil { hookLog.WithError(err).Error("Error executing hook") + err = fmt.Errorf("hook %s in container %s failed to execute, err: %v", hook.HookName, hook.Hook.Container, err) + errors = append(errors, err) if hook.Hook.OnError == velerov1api.HookErrorModeFail { - errors = append(errors, err) cancel() return } @@ -204,10 +210,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks( podWatcher.Run(ctx.Done()) // There are some cases where this function could return with unexecuted hooks: the pod may - // be deleted, a hook with OnError mode Fail could fail, or it may timeout waiting for + // be deleted, a hook could fail, or it may timeout waiting for // containers to become ready. - // Each unexecuted hook is logged as an error but only hooks with OnError mode Fail return - // an error from this function. + // Each unexecuted hook is logged as an error and this error will be returned from this function. for _, hooks := range byContainer { for _, hook := range hooks { if hook.executed { @@ -222,9 +227,7 @@ func (e *DefaultWaitExecHookHandler) HandleHooks( }, ) hookLog.Error(err) - if hook.Hook.OnError == velerov1api.HookErrorModeFail { - errors = append(errors, err) - } + errors = append(errors, err) } } diff --git a/internal/hook/wait_exec_hook_handler_test.go b/internal/hook/wait_exec_hook_handler_test.go index 2702e9c45..3e809ccfa 100644 --- a/internal/hook/wait_exec_hook_handler_test.go +++ b/internal/hook/wait_exec_hook_handler_test.go @@ -209,10 +209,10 @@ func TestWaitExecHandleHooks(t *testing.T) { Result(), }, }, - expectedErrors: []error{errors.New("pod hook error")}, + expectedErrors: []error{errors.New("hook in container container1 failed to execute, err: pod hook error")}, }, { - name: "should return no error when hook from annotation fails with on error mode continue", + name: "should return error when hook from annotation fails with on error mode continue", initialPod: builder.ForPod("default", "my-pod"). ObjectMeta(builder.WithAnnotations( podRestoreHookCommandAnnotationKey, "/usr/bin/foo", @@ -278,7 +278,7 @@ func TestWaitExecHandleHooks(t *testing.T) { Result(), }, }, - expectedErrors: nil, + expectedErrors: []error{errors.New("hook in container container1 failed to execute, err: pod hook error")}, }, { name: "should return no error when hook from annotation executes after 10ms wait for container to start", @@ -422,7 +422,7 @@ func TestWaitExecHandleHooks(t *testing.T) { }, }, { - name: "should return no error when spec hook with wait timeout expires with OnError mode Continue", + name: "should return error when spec hook with wait timeout expires with OnError mode Continue", groupResource: "pods", initialPod: builder.ForPod("default", "my-pod"). Containers(&v1.Container{ @@ -435,7 +435,7 @@ func TestWaitExecHandleHooks(t *testing.T) { }, }). Result(), - expectedErrors: nil, + expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")}, byContainer: map[string][]PodExecRestoreHook{ "container1": { { @@ -515,8 +515,8 @@ func TestWaitExecHandleHooks(t *testing.T) { sharedHooksContextTimeout: time.Millisecond, }, { - name: "should return no error when shared hooks context is canceled before spec hook with OnError mode Continue executes", - expectedErrors: nil, + name: "should return error when shared hooks context is canceled before spec hook with OnError mode Continue executes", + expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")}, groupResource: "pods", initialPod: builder.ForPod("default", "my-pod"). Containers(&v1.Container{ diff --git a/pkg/apis/velero/v1/backup_types.go b/pkg/apis/velero/v1/backup_types.go index e25e1463d..ed1518010 100644 --- a/pkg/apis/velero/v1/backup_types.go +++ b/pkg/apis/velero/v1/backup_types.go @@ -261,12 +261,12 @@ type ExecHook struct { type HookErrorMode string const ( - // HookErrorModeContinue means that an error from a hook is acceptable, and the backup can - // proceed. + // HookErrorModeContinue means that an error from a hook is acceptable and the backup/restore can + // proceed with the rest of hooks' execution. This backup/restore should be in `PartiallyFailed` status. HookErrorModeContinue HookErrorMode = "Continue" - // HookErrorModeFail means that an error from a hook is problematic, and the backup should be in - // error. + // HookErrorModeFail means that an error from a hook is problematic and Velero should stop executing following hooks. + // This backup/restore should be in `PartiallyFailed` status. HookErrorModeFail HookErrorMode = "Fail" ) diff --git a/pkg/restore/restore.go b/pkg/restore/restore.go index bc7a97a08..24a33cea6 100644 --- a/pkg/restore/restore.go +++ b/pkg/restore/restore.go @@ -311,7 +311,7 @@ func (kr *kubernetesRestorer) RestoreWithResolvers( discoveryHelper: kr.discoveryHelper, resourcePriorities: kr.resourcePriorities, resourceRestoreHooks: resourceRestoreHooks, - hooksErrs: make(chan error), + hooksErrs: make(chan hook.HookErrInfo), waitExecHookHandler: waitExecHookHandler, hooksContext: hooksCtx, hooksCancelFunc: hooksCancelFunc, @@ -360,7 +360,7 @@ type restoreContext struct { discoveryHelper discovery.Helper resourcePriorities Priorities hooksWaitGroup sync.WaitGroup - hooksErrs chan error + hooksErrs chan hook.HookErrInfo resourceRestoreHooks []hook.ResourceRestoreHook waitExecHookHandler hook.WaitExecHookHandler hooksContext go_context.Context @@ -655,8 +655,8 @@ func (ctx *restoreContext) execute() (results.Result, results.Result) { ctx.hooksWaitGroup.Wait() close(ctx.hooksErrs) }() - for err := range ctx.hooksErrs { - errs.Velero = append(errs.Velero, err.Error()) + for errInfo := range ctx.hooksErrs { + errs.Add(errInfo.Namespace, errInfo.Err) } ctx.log.Info("Done waiting for all post-restore exec hooks to complete") @@ -1899,10 +1899,11 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) { // on the ctx.podVolumeErrs channel. defer ctx.hooksWaitGroup.Done() + podNs := createdObj.GetNamespace() pod := new(v1.Pod) if err := runtime.DefaultUnstructuredConverter.FromUnstructured(createdObj.UnstructuredContent(), &pod); err != nil { ctx.log.WithError(err).Error("error converting unstructured pod") - ctx.hooksErrs <- err + ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err} return } execHooksByContainer, err := hook.GroupRestoreExecHooks( @@ -1912,7 +1913,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) { ) if err != nil { ctx.log.WithError(err).Errorf("error getting exec hooks for pod %s/%s", pod.Namespace, pod.Name) - ctx.hooksErrs <- err + ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err} return } @@ -1922,7 +1923,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) { for _, err := range errs { // Errors are already logged in the HandleHooks method. - ctx.hooksErrs <- err + ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err} } } }()