From fdcf1df4fd4f18fcc7018ab1a881f94487f4ff54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wenkai=20Yin=28=E5=B0=B9=E6=96=87=E5=BC=80=29?= Date: Wed, 12 Feb 2025 13:12:58 +0800 Subject: [PATCH] Fix WaitGroup panic issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure WaitGroup.Add() is called before WaitGroup.Done() to avoid WaitGroup panic issue Fixes #8657 Signed-off-by: Wenkai Yin(尹文开) --- .github/workflows/pr-codespell.yml | 2 +- .golangci.yaml | 2 +- changelogs/unreleased/8680-ywk253100 | 1 + pkg/backup/backup.go | 2 +- pkg/backup/backup_test.go | 4 +-- pkg/podvolume/backupper.go | 45 ++++++++++++++++++++-------- pkg/podvolume/backupper_test.go | 27 ++++++++++------- 7 files changed, 55 insertions(+), 28 deletions(-) create mode 100644 changelogs/unreleased/8680-ywk253100 diff --git a/.github/workflows/pr-codespell.yml b/.github/workflows/pr-codespell.yml index 0d3138e40..900371ca2 100644 --- a/.github/workflows/pr-codespell.yml +++ b/.github/workflows/pr-codespell.yml @@ -13,7 +13,7 @@ jobs: - name: Codespell uses: codespell-project/actions-codespell@master with: - # ignore the config/.../crd.go file as it's generated binary data that is edited elswhere. + # ignore the config/.../crd.go file as it's generated binary data that is edited elsewhere. skip: .git,*.png,*.jpg,*.woff,*.ttf,*.gif,*.ico,./config/crd/v1beta1/crds/crds.go,./config/crd/v1/crds/crds.go,./config/crd/v2alpha1/crds/crds.go,./go.sum,./LICENSE ignore_words_list: iam,aks,ist,bridget,ue,shouldnot,atleast,notin,sme,optin check_filenames: true diff --git a/.golangci.yaml b/.golangci.yaml index 9cd608150..e437a348c 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -277,7 +277,7 @@ linters-settings: force-case-trailing-whitespace: 0 # Force cuddling of err checks with err var assignment force-err-cuddling: false - # Allow leading comments to be separated with empty liens + # Allow leading comments to be separated with empty lines allow-separated-leading-comment: false linters: diff --git a/changelogs/unreleased/8680-ywk253100 b/changelogs/unreleased/8680-ywk253100 new file mode 100644 index 000000000..b326ffb34 --- /dev/null +++ b/changelogs/unreleased/8680-ywk253100 @@ -0,0 +1 @@ +Fix #8657: WaitGroup panic issue \ No newline at end of file diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index 3b5592851..fb4fe8f49 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -762,7 +762,7 @@ func (kb *kubernetesBackupper) waitUntilPVBsProcessed(ctx context.Context, log l if processed { continue } - updatedPVB, err := itemBlock.itemBackupper.podVolumeBackupper.GetPodVolumeBackup(pvb.Namespace, pvb.Name) + updatedPVB, err := itemBlock.itemBackupper.podVolumeBackupper.GetPodVolumeBackupByPodAndVolume(pvb.Spec.Pod.Namespace, pvb.Spec.Pod.Name, pvb.Spec.Volume) if err != nil { allProcessed = false log.Infof("failed to get PVB: %v", err) diff --git a/pkg/backup/backup_test.go b/pkg/backup/backup_test.go index ad4ec6496..fef91d4b0 100644 --- a/pkg/backup/backup_test.go +++ b/pkg/backup/backup_test.go @@ -3946,9 +3946,9 @@ func (b *fakePodVolumeBackupper) WaitAllPodVolumesProcessed(log logrus.FieldLogg return b.pvbs } -func (b *fakePodVolumeBackupper) GetPodVolumeBackup(namespace, name string) (*velerov1.PodVolumeBackup, error) { +func (b *fakePodVolumeBackupper) GetPodVolumeBackupByPodAndVolume(podNamespace, podName, volume string) (*velerov1.PodVolumeBackup, error) { for _, pvb := range b.pvbs { - if pvb.Namespace == namespace && pvb.Name == name { + if pvb.Spec.Pod.Namespace == podNamespace && pvb.Spec.Pod.Name == podName && pvb.Spec.Volume == volume { return pvb, nil } } diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index 009edc5cc..471cdb1a0 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -43,12 +43,17 @@ import ( "github.com/vmware-tanzu/velero/pkg/util/kube" ) +const ( + indexNamePod = "POD" + pvbKeyPattern = "%s+%s+%s" +) + // Backupper can execute pod volume backups of volumes in a pod. type Backupper interface { // BackupPodVolumes backs up all specified volumes in a pod. BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api.Pod, volumesToBackup []string, resPolicies *resourcepolicies.Policies, log logrus.FieldLogger) ([]*velerov1api.PodVolumeBackup, *PVCBackupSummary, []error) WaitAllPodVolumesProcessed(log logrus.FieldLogger) []*velerov1api.PodVolumeBackup - GetPodVolumeBackup(namespace, name string) (*velerov1api.PodVolumeBackup, error) + GetPodVolumeBackupByPodAndVolume(podNamespace, podName, volume string) (*velerov1api.PodVolumeBackup, error) ListPodVolumeBackupsByPod(podNamespace, podName string) ([]*velerov1api.PodVolumeBackup, error) } @@ -106,9 +111,7 @@ func (pbs *PVCBackupSummary) addSkipped(volumeName string, reason string) { } } -const indexNamePod = "POD" - -func podIndexFunc(obj interface{}) ([]string, error) { +func podIndexFunc(obj any) ([]string, error) { pvb, ok := obj.(*velerov1api.PodVolumeBackup) if !ok { return nil, errors.Errorf("expected PodVolumeBackup, but got %T", obj) @@ -119,6 +122,16 @@ func podIndexFunc(obj interface{}) ([]string, error) { return []string{cache.NewObjectName(pvb.Spec.Pod.Namespace, pvb.Spec.Pod.Name).String()}, nil } +// the PVB's name is auto-generated when creating the PVB, we cannot get the name or uid before creating it. +// So we cannot use namespace&name or uid as the key because we need to insert PVB into the indexer before creating it in API server +func podVolumeBackupKey(obj any) (string, error) { + pvb, ok := obj.(*velerov1api.PodVolumeBackup) + if !ok { + return "", fmt.Errorf("expected PodVolumeBackup, but got %T", obj) + } + return fmt.Sprintf(pvbKeyPattern, pvb.Spec.Pod.Namespace, pvb.Spec.Pod.Name, pvb.Spec.Volume), nil +} + func newBackupper( ctx context.Context, log logrus.FieldLogger, @@ -137,7 +150,7 @@ func newBackupper( uploaderType: uploaderType, pvbInformer: pvbInformer, wg: sync.WaitGroup{}, - pvbIndexer: cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + pvbIndexer: cache.NewIndexer(podVolumeBackupKey, cache.Indexers{ indexNamePod: podIndexFunc, }), } @@ -335,16 +348,22 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api. } volumeBackup := newPodVolumeBackup(backup, pod, volume, repoIdentifier, b.uploaderType, pvc) - if err := veleroclient.CreateRetryGenerateName(b.crClient, b.ctx, volumeBackup); err != nil { - errs = append(errs, err) - continue - } - b.wg.Add(1) - + // the PVB must be added into the indexer before creating it in API server otherwise unexpected behavior may happen: + // the PVB may be handled very quickly by the controller and the informer handler will insert the PVB before "b.pvbIndexer.Add(volumeBackup)" runs, + // this causes the PVB inserted by "b.pvbIndexer.Add(volumeBackup)" overrides the PVB in the indexer while the PVB inserted by "b.pvbIndexer.Add(volumeBackup)" + // contains empty "Status" if err := b.pvbIndexer.Add(volumeBackup); err != nil { errs = append(errs, errors.Wrapf(err, "failed to add PodVolumeBackup %s/%s to indexer", volumeBackup.Namespace, volumeBackup.Name)) continue } + // similar with above: the PVB may be handled very quickly by the controller and the informer handler will call "b.wg.Done()" before "b.wg.Add(1)" runs which causes panic + // see https://github.com/vmware-tanzu/velero/issues/8657 + b.wg.Add(1) + if err := veleroclient.CreateRetryGenerateName(b.crClient, b.ctx, volumeBackup); err != nil { + b.wg.Done() + errs = append(errs, err) + continue + } podVolumeBackups = append(podVolumeBackups, volumeBackup) pvcSummary.addBackedup(volumeName) @@ -386,8 +405,8 @@ func (b *backupper) WaitAllPodVolumesProcessed(log logrus.FieldLogger) []*velero return podVolumeBackups } -func (b *backupper) GetPodVolumeBackup(namespace, name string) (*velerov1api.PodVolumeBackup, error) { - obj, exist, err := b.pvbIndexer.GetByKey(cache.NewObjectName(namespace, name).String()) +func (b *backupper) GetPodVolumeBackupByPodAndVolume(podNamespace, podName, volume string) (*velerov1api.PodVolumeBackup, error) { + obj, exist, err := b.pvbIndexer.GetByKey(fmt.Sprintf(pvbKeyPattern, podNamespace, podName, volume)) if err != nil { return nil, err } diff --git a/pkg/podvolume/backupper_test.go b/pkg/podvolume/backupper_test.go index a824daa70..eec5a441e 100644 --- a/pkg/podvolume/backupper_test.go +++ b/pkg/podvolume/backupper_test.go @@ -582,37 +582,44 @@ func TestBackupPodVolumes(t *testing.T) { } } -func TestGetPodVolumeBackup(t *testing.T) { +func TestGetPodVolumeBackupByPodAndVolume(t *testing.T) { backupper := &backupper{ - pvbIndexer: cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + pvbIndexer: cache.NewIndexer(podVolumeBackupKey, cache.Indexers{ indexNamePod: podIndexFunc, }), } obj := &velerov1api.PodVolumeBackup{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "velero", - Name: "pvb", - }, Spec: velerov1api.PodVolumeBackupSpec{ Pod: corev1api.ObjectReference{ Kind: "Pod", Namespace: "default", Name: "pod", }, + Volume: "volume", }, } err := backupper.pvbIndexer.Add(obj) require.NoError(t, err) - // not exist PVB - pvb, err := backupper.GetPodVolumeBackup("invalid-namespace", "invalid-name") + // incorrect pod namespace + pvb, err := backupper.GetPodVolumeBackupByPodAndVolume("invalid-namespace", "pod", "volume") require.NoError(t, err) assert.Nil(t, pvb) - // exist PVB - pvb, err = backupper.GetPodVolumeBackup("velero", "pvb") + // incorrect pod name + pvb, err = backupper.GetPodVolumeBackupByPodAndVolume("default", "invalid-pod", "volume") + require.NoError(t, err) + assert.Nil(t, pvb) + + // incorrect volume + pvb, err = backupper.GetPodVolumeBackupByPodAndVolume("default", "pod", "invalid-volume") + require.NoError(t, err) + assert.Nil(t, pvb) + + // correct pod namespace, name and volume + pvb, err = backupper.GetPodVolumeBackupByPodAndVolume("default", "pod", "volume") require.NoError(t, err) assert.NotNil(t, pvb) }