From 3b80e00d620e2e5b4fe4050a1f7dde9977276c7b Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Wed, 15 Jan 2020 10:27:21 -0700
Subject: [PATCH 1/3] fix race condition in waiting for restic restores to
 complete

Signed-off-by: Steve Kriss
---
 pkg/restore/restore.go       | 45 ++++++++++++++++-------
 pkg/util/sync/error_group.go | 71 ------------------------------------
 2 files changed, 32 insertions(+), 84 deletions(-)
 delete mode 100644 pkg/util/sync/error_group.go

diff --git a/pkg/restore/restore.go b/pkg/restore/restore.go
index d005d15f3..05ea63eb1 100644
--- a/pkg/restore/restore.go
+++ b/pkg/restore/restore.go
@@ -25,6 +25,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strings"
+	"sync"
 	"time"
 
 	uuid "github.com/gofrs/uuid"
@@ -56,7 +57,6 @@ import (
 	"github.com/vmware-tanzu/velero/pkg/util/collections"
 	"github.com/vmware-tanzu/velero/pkg/util/filesystem"
 	"github.com/vmware-tanzu/velero/pkg/util/kube"
-	velerosync "github.com/vmware-tanzu/velero/pkg/util/sync"
 	"github.com/vmware-tanzu/velero/pkg/volume"
 )
 
@@ -274,6 +274,7 @@ func (kr *kubernetesRestorer) Restore(
 		actions:                 resolvedActions,
 		volumeSnapshotterGetter: volumeSnapshotterGetter,
 		resticRestorer:          resticRestorer,
+		resticErrs:              make(chan error),
 		pvsToProvision:          sets.NewString(),
 		pvRestorer:              pvRestorer,
 		volumeSnapshots:         req.VolumeSnapshots,
@@ -365,7 +366,8 @@ type context struct {
 	actions                 []resolvedAction
 	volumeSnapshotterGetter VolumeSnapshotterGetter
 	resticRestorer          restic.Restorer
-	globalWaitGroup         velerosync.ErrorGroup
+	resticWaitGroup         sync.WaitGroup
+	resticErrs              chan error
 	pvsToProvision          sets.String
 	pvRestorer              PVRestorer
 	volumeSnapshots         []*volume.Snapshot
@@ -454,17 +456,27 @@ func (ctx *context) execute() (Result, Result) {
 		}
 	}
 
-	// TODO timeout?
-	ctx.log.Debug("Waiting on global wait group")
-	waitErrs := ctx.globalWaitGroup.Wait()
-	ctx.log.Debug("Done waiting on global wait group")
+	ctx.log.Info("Waiting for all restic restores to complete")
 
-	for _, err := range waitErrs {
+	// wait for all of the restic restore goroutines to be done, which is
+	// only possible once all of their errors have been received by the loop
+	// below, then close the resticErrs channel so the loop terminates.
+	go func() {
+		// TODO timeout?
+		ctx.resticWaitGroup.Wait()
+		close(ctx.resticErrs)
+	}()
+
+	// this loop will only terminate when the ctx.resticErrs channel is closed
+	// in the above goroutine, *after* all errors from the goroutines have been
+	// received by this loop.
+	for err := range ctx.resticErrs {
 		// TODO not ideal to be adding these to Velero-level errors
 		// rather than a specific namespace, but don't have a way
 		// to track the namespace right now.
 		errs.Velero = append(errs.Velero, err.Error())
 	}
+	ctx.log.Info("Done waiting for all restic restores to complete")
 
 	return warnings, errs
 }
@@ -1148,11 +1160,17 @@ func restorePodVolumeBackups(ctx *context, createdObj *unstructured.Unstructured
 	if ctx.resticRestorer == nil {
 		ctx.log.Warn("No restic restorer, not restoring pod's volumes")
 	} else {
-		ctx.globalWaitGroup.GoErrorSlice(func() []error {
+		ctx.resticWaitGroup.Add(1)
+		go func() {
+			// Done() will only be called after all errors have been successfully sent
+			// on the ctx.resticErrs channel
+			defer ctx.resticWaitGroup.Done()
+
 			pod := new(v1.Pod)
 			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(createdObj.UnstructuredContent(), &pod); err != nil {
 				ctx.log.WithError(err).Error("error converting unstructured pod")
-				return []error{err}
+				ctx.resticErrs <- err
+				return
 			}
 
 			data := restic.RestoreData{
@@ -1164,11 +1182,12 @@ func restorePodVolumeBackups(ctx *context, createdObj *unstructured.Unstructured
 			}
 			if errs := ctx.resticRestorer.RestorePodVolumes(data); errs != nil {
 				ctx.log.WithError(kubeerrs.NewAggregate(errs)).Error("unable to successfully complete restic restores of pod's volumes")
-				return errs
-			}
-
-			return nil
-		})
+				for _, err := range errs {
+					ctx.resticErrs <- err
+				}
+			}
+		}()
 	}
 }
diff --git a/pkg/util/sync/error_group.go b/pkg/util/sync/error_group.go
deleted file mode 100644
index ead625a9a..000000000
--- a/pkg/util/sync/error_group.go
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-Copyright 2018 the Velero contributors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package sync
-
-import "sync"
-
-// An ErrorGroup waits for a collection of goroutines that return errors to finish.
-// The main goroutine calls Go one or more times to execute a function that returns
-// an error in a goroutine. Then it calls Wait to wait for all goroutines to finish
-// and collect the results of each.
-type ErrorGroup struct {
-	wg      sync.WaitGroup
-	errChan chan error
-}
-
-// Go runs the specified function in a goroutine.
-func (eg *ErrorGroup) Go(action func() error) {
-	if eg.errChan == nil {
-		eg.errChan = make(chan error)
-	}
-
-	eg.wg.Add(1)
-	go func() {
-		eg.errChan <- action()
-		eg.wg.Done()
-	}()
-}
-
-// GoErrorSlice runs a function that returns a slice of errors
-// in a goroutine.
-func (eg *ErrorGroup) GoErrorSlice(action func() []error) {
-	if eg.errChan == nil {
-		eg.errChan = make(chan error)
-	}
-
-	eg.wg.Add(1)
-	go func() {
-		for _, err := range action() {
-			eg.errChan <- err
-		}
-		eg.wg.Done()
-	}()
-}
-
-// Wait waits for all functions run via Go to finish,
-// and returns all of their errors.
-func (eg *ErrorGroup) Wait() []error {
-	var errs []error
-	go func() {
-		for {
-			errs = append(errs, <-eg.errChan)
-		}
-	}()
-
-	eg.wg.Wait()
-	return errs
-}

From 65e970f59d5db8a754bf3bd924f721542190259e Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Wed, 15 Jan 2020 15:21:31 -0700
Subject: [PATCH 2/3] changelog

Signed-off-by: Steve Kriss
---
 changelogs/unreleased/2201-skriss | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelogs/unreleased/2201-skriss

diff --git a/changelogs/unreleased/2201-skriss b/changelogs/unreleased/2201-skriss
new file mode 100644
index 000000000..82c120e61
--- /dev/null
+++ b/changelogs/unreleased/2201-skriss
@@ -0,0 +1 @@
+bug fix: fix race condition resulting in restores sometimes succeeding despite restic restore failures

From ae316193ea0669c49015c2870718e8f28fd3dfab Mon Sep 17 00:00:00 2001
From: Steve Kriss
Date: Tue, 21 Jan 2020 12:36:46 -0700
Subject: [PATCH 3/3] move log statement

Signed-off-by: Steve Kriss
---
 pkg/restore/restore.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/restore/restore.go b/pkg/restore/restore.go
index 05ea63eb1..1ec6c3ec8 100644
--- a/pkg/restore/restore.go
+++ b/pkg/restore/restore.go
@@ -456,12 +456,12 @@ func (ctx *context) execute() (Result, Result) {
 		}
 	}
 
-	ctx.log.Info("Waiting for all restic restores to complete")
-
 	// wait for all of the restic restore goroutines to be done, which is
 	// only possible once all of their errors have been received by the loop
 	// below, then close the resticErrs channel so the loop terminates.
 	go func() {
+		ctx.log.Info("Waiting for all restic restores to complete")
+
 		// TODO timeout?
 		ctx.resticWaitGroup.Wait()
 		close(ctx.resticErrs)
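
Context on the race being fixed: the deleted ErrorGroup.Wait() returned its errs slice as soon as wg.Wait() unblocked, but errors were appended to that slice by a separate goroutine as they arrived on errChan. An error sent just before the final Done() could therefore be appended after Wait() had already returned (or never observed at all), and the slice was read and written concurrently without synchronization — hence restores sometimes reporting success despite restic failures. The pattern these patches land on closes that window: each worker sends every error on an unbuffered channel before its deferred Done() runs, a dedicated goroutine closes the channel only after wg.Wait() returns, and a single loop in the main goroutine drains the channel until it is closed.

The sketch below is a minimal, self-contained Go illustration of that pattern; the task loop, the failure condition, and names such as errCh are invented for the example and are not Velero code.

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	errCh := make(chan error) // unbuffered, like ctx.resticErrs in the patch

	// One goroutine per task. Each goroutine sends all of its errors on
	// errCh *before* its deferred Done() runs, so wg.Wait() returning
	// guarantees every error has been handed off to the loop below.
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			if n%2 == 0 { // pretend the even-numbered tasks fail
				errCh <- fmt.Errorf("task %d failed", n)
			}
		}(i)
	}

	// Close errCh only once every sender is done; closing is what lets
	// the range loop below terminate.
	go func() {
		wg.Wait()
		close(errCh)
	}()

	// Receive until errCh is closed. All appends happen in this one
	// goroutine, so there is no concurrent access to the slice.
	var collected []error
	for err := range errCh {
		collected = append(collected, err)
	}
	fmt.Printf("collected %d error(s)\n", len(collected)) // collected 2 error(s)
}

Because the channel is unbuffered, a send only completes once the collection loop has received it; every Done() (and hence wg.Wait() returning) therefore implies the corresponding errors are no longer in flight when the channel is closed.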