From 3b80e00d620e2e5b4fe4050a1f7dde9977276c7b Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Wed, 15 Jan 2020 10:27:21 -0700
Subject: [PATCH 1/3] fix race condition in waiting for restic restores to
 complete

Signed-off-by: Steve Kriss
---
 pkg/restore/restore.go       | 45 ++++++++++++++++-------
 pkg/util/sync/error_group.go | 71 ------------------------------------
 2 files changed, 32 insertions(+), 84 deletions(-)
 delete mode 100644 pkg/util/sync/error_group.go

diff --git a/pkg/restore/restore.go b/pkg/restore/restore.go
index d005d15f3..05ea63eb1 100644
--- a/pkg/restore/restore.go
+++ b/pkg/restore/restore.go
@@ -25,6 +25,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strings"
+	"sync"
 	"time"
 
 	uuid "github.com/gofrs/uuid"
@@ -56,7 +57,6 @@ import (
 	"github.com/vmware-tanzu/velero/pkg/util/collections"
 	"github.com/vmware-tanzu/velero/pkg/util/filesystem"
 	"github.com/vmware-tanzu/velero/pkg/util/kube"
-	velerosync "github.com/vmware-tanzu/velero/pkg/util/sync"
 	"github.com/vmware-tanzu/velero/pkg/volume"
 )
 
@@ -274,6 +274,7 @@ func (kr *kubernetesRestorer) Restore(
 		actions:                 resolvedActions,
 		volumeSnapshotterGetter: volumeSnapshotterGetter,
 		resticRestorer:          resticRestorer,
+		resticErrs:              make(chan error),
 		pvsToProvision:          sets.NewString(),
 		pvRestorer:              pvRestorer,
 		volumeSnapshots:         req.VolumeSnapshots,
@@ -365,7 +366,8 @@ type context struct {
 	actions                 []resolvedAction
 	volumeSnapshotterGetter VolumeSnapshotterGetter
 	resticRestorer          restic.Restorer
-	globalWaitGroup         velerosync.ErrorGroup
+	resticWaitGroup         sync.WaitGroup
+	resticErrs              chan error
 	pvsToProvision          sets.String
 	pvRestorer              PVRestorer
 	volumeSnapshots         []*volume.Snapshot
@@ -454,17 +456,27 @@ func (ctx *context) execute() (Result, Result) {
 		}
 	}
 
-	// TODO timeout?
-	ctx.log.Debug("Waiting on global wait group")
-	waitErrs := ctx.globalWaitGroup.Wait()
-	ctx.log.Debug("Done waiting on global wait group")
+	ctx.log.Info("Waiting for all restic restores to complete")
 
-	for _, err := range waitErrs {
+	// wait for all of the restic restore goroutines to be done, which is
+	// only possible once all of their errors have been received by the loop
+	// below, then close the resticErrs channel so the loop terminates.
+	go func() {
+		// TODO timeout?
+		ctx.resticWaitGroup.Wait()
+		close(ctx.resticErrs)
+	}()
+
+	// this loop will only terminate when the ctx.resticErrs channel is closed
+	// in the above goroutine, *after* all errors from the goroutines have been
+	// received by this loop.
+	for err := range ctx.resticErrs {
 		// TODO not ideal to be adding these to Velero-level errors
 		// rather than a specific namespace, but don't have a way
 		// to track the namespace right now.
 		errs.Velero = append(errs.Velero, err.Error())
 	}
+	ctx.log.Info("Done waiting for all restic restores to complete")
 
 	return warnings, errs
 }
@@ -1148,11 +1160,17 @@ func restorePodVolumeBackups(ctx *context, createdObj *unstructured.Unstructured
 	if ctx.resticRestorer == nil {
 		ctx.log.Warn("No restic restorer, not restoring pod's volumes")
 	} else {
-		ctx.globalWaitGroup.GoErrorSlice(func() []error {
+		ctx.resticWaitGroup.Add(1)
+		go func() {
+			// Done() will only be called after all errors have been successfully sent
+			// on the ctx.resticErrs channel
+			defer ctx.resticWaitGroup.Done()
+
 			pod := new(v1.Pod)
 			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(createdObj.UnstructuredContent(), &pod); err != nil {
 				ctx.log.WithError(err).Error("error converting unstructured pod")
-				return []error{err}
+				ctx.resticErrs <- err
+				return
 			}
 
 			data := restic.RestoreData{
@@ -1164,11 +1182,12 @@ func restorePodVolumeBackups(ctx *context, createdObj *unstructured.Unstructured
 			}
 			if errs := ctx.resticRestorer.RestorePodVolumes(data); errs != nil {
 				ctx.log.WithError(kubeerrs.NewAggregate(errs)).Error("unable to successfully complete restic restores of pod's volumes")
-				return errs
-			}
-
-			return nil
-		})
+				for _, err := range errs {
+					ctx.resticErrs <- err
+				}
+			}
+		}()
 	}
 }
diff --git a/pkg/util/sync/error_group.go b/pkg/util/sync/error_group.go
deleted file mode 100644
index ead625a9a..000000000
--- a/pkg/util/sync/error_group.go
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-Copyright 2018 the Velero contributors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package sync
-
-import "sync"
-
-// An ErrorGroup waits for a collection of goroutines that return errors to finish.
-// The main goroutine calls Go one or more times to execute a function that returns
-// an error in a goroutine. Then it calls Wait to wait for all goroutines to finish
-// and collect the results of each.
-type ErrorGroup struct {
-	wg      sync.WaitGroup
-	errChan chan error
-}
-
-// Go runs the specified function in a goroutine.
-func (eg *ErrorGroup) Go(action func() error) {
-	if eg.errChan == nil {
-		eg.errChan = make(chan error)
-	}
-
-	eg.wg.Add(1)
-	go func() {
-		eg.errChan <- action()
-		eg.wg.Done()
-	}()
-}
-
-// GoErrorSlice runs a function that returns a slice of errors
-// in a goroutine.
-func (eg *ErrorGroup) GoErrorSlice(action func() []error) {
-	if eg.errChan == nil {
-		eg.errChan = make(chan error)
-	}
-
-	eg.wg.Add(1)
-	go func() {
-		for _, err := range action() {
-			eg.errChan <- err
-		}
-		eg.wg.Done()
-	}()
-}
-
-// Wait waits for all functions run via Go to finish,
-// and returns all of their errors.
-func (eg *ErrorGroup) Wait() []error {
-	var errs []error
-	go func() {
-		for {
-			errs = append(errs, <-eg.errChan)
-		}
-	}()
-
-	eg.wg.Wait()
-	return errs
-}

From 65e970f59d5db8a754bf3bd924f721542190259e Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Wed, 15 Jan 2020 15:21:31 -0700
Subject: [PATCH 2/3] changelog

Signed-off-by: Steve Kriss
---
 changelogs/unreleased/2201-skriss | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelogs/unreleased/2201-skriss

diff --git a/changelogs/unreleased/2201-skriss b/changelogs/unreleased/2201-skriss
new file mode 100644
index 000000000..82c120e61
--- /dev/null
+++ b/changelogs/unreleased/2201-skriss
@@ -0,0 +1 @@
+bug fix: fix race condition resulting in restores sometimes succeeding despite restic restore failures

From ae316193ea0669c49015c2870718e8f28fd3dfab Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Tue, 21 Jan 2020 12:36:46 -0700
Subject: [PATCH 3/3] move log statement

Signed-off-by: Steve Kriss
---
 pkg/restore/restore.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/restore/restore.go b/pkg/restore/restore.go
index 05ea63eb1..1ec6c3ec8 100644
--- a/pkg/restore/restore.go
+++ b/pkg/restore/restore.go
@@ -456,12 +456,12 @@ func (ctx *context) execute() (Result, Result) {
 		}
 	}
 
-	ctx.log.Info("Waiting for all restic restores to complete")
-
 	// wait for all of the restic restore goroutines to be done, which is
 	// only possible once all of their errors have been received by the loop
 	// below, then close the resticErrs channel so the loop terminates.
 	go func() {
+		ctx.log.Info("Waiting for all restic restores to complete")
+
 		// TODO timeout?
 		ctx.resticWaitGroup.Wait()
 		close(ctx.resticErrs)
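
Context on the race being fixed: the deleted ErrorGroup.Wait() returned its errs slice as soon as wg.Wait() unblocked, but errors were appended to that slice by a separate goroutine as they arrived on errChan. An error sent just before the final Done() could therefore be appended after Wait() had already returned (or never observed at all), and the slice was read and written concurrently without synchronization — hence restores sometimes reporting success despite restic failures. The pattern these patches land on closes that window: each worker sends every error on an unbuffered channel before its deferred Done() runs, a dedicated goroutine closes the channel only after wg.Wait() returns, and a single loop in the main goroutine drains the channel until it is closed.

The sketch below is a minimal, self-contained Go illustration of that pattern; the task loop, the failure condition, and names such as errCh are invented for the example and are not Velero code.

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	errCh := make(chan error) // unbuffered, like ctx.resticErrs in the patch

	// One goroutine per task. Each goroutine sends all of its errors on
	// errCh *before* its deferred Done() runs, so wg.Wait() returning
	// guarantees every error has been handed off to the loop below.
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			if n%2 == 0 { // pretend the even-numbered tasks fail
				errCh <- fmt.Errorf("task %d failed", n)
			}
		}(i)
	}

	// Close errCh only once every sender is done; closing is what lets
	// the range loop below terminate.
	go func() {
		wg.Wait()
		close(errCh)
	}()

	// Receive until errCh is closed. All appends happen in this one
	// goroutine, so there is no concurrent access to the slice.
	var collected []error
	for err := range errCh {
		collected = append(collected, err)
	}
	fmt.Printf("collected %d error(s)\n", len(collected)) // collected 2 error(s)
}

Because the channel is unbuffered, a send only completes once the collection loop has received it; every Done() (and hence wg.Wait() returning) therefore implies the corresponding errors are no longer in flight when the channel is closed.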