diff --git a/.github/workflows/nightly-trivy-scan.yml b/.github/workflows/nightly-trivy-scan.yml index 0f5fe0a68..85ce3cdc5 100644 --- a/.github/workflows/nightly-trivy-scan.yml +++ b/.github/workflows/nightly-trivy-scan.yml @@ -22,7 +22,7 @@ jobs: uses: actions/checkout@v6 - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master + uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 with: image-ref: 'docker.io/velero/${{ matrix.images }}:${{ matrix.versions }}' severity: 'CRITICAL,HIGH,MEDIUM' diff --git a/changelogs/unreleased/9403-GabriFedi97 b/changelogs/unreleased/9403-GabriFedi97 new file mode 100644 index 000000000..515549220 --- /dev/null +++ b/changelogs/unreleased/9403-GabriFedi97 @@ -0,0 +1 @@ +Include InitContainer configured as Sidecars when validating the existence of the target containers configured for the Backup Hooks diff --git a/changelogs/unreleased/9614-blackpiglet b/changelogs/unreleased/9614-blackpiglet new file mode 100644 index 000000000..6d15e4f2e --- /dev/null +++ b/changelogs/unreleased/9614-blackpiglet @@ -0,0 +1 @@ +Add check for file extraction from tarball. \ No newline at end of file diff --git a/changelogs/unreleased/9628-priyansh17 b/changelogs/unreleased/9628-priyansh17 new file mode 100644 index 000000000..f65b55e16 --- /dev/null +++ b/changelogs/unreleased/9628-priyansh17 @@ -0,0 +1 @@ +Implement original VolumeSnapshotContent deletion for legacy backups \ No newline at end of file diff --git a/changelogs/unreleased/9638-adam-jian-zhang b/changelogs/unreleased/9638-adam-jian-zhang new file mode 100644 index 000000000..3fd99e5e7 --- /dev/null +++ b/changelogs/unreleased/9638-adam-jian-zhang @@ -0,0 +1 @@ +Fix issue #9636, fix configmap lookup in non-default namespaces diff --git a/changelogs/unreleased/9643-priyansh17 b/changelogs/unreleased/9643-priyansh17 new file mode 100644 index 000000000..10870c871 --- /dev/null +++ b/changelogs/unreleased/9643-priyansh17 @@ -0,0 +1 @@ +Fix issue #9641, Remove redundant ReadyToUse polling in CSI VolumeSnapshotContent delete plugin \ No newline at end of file diff --git a/hack/build-image/Dockerfile b/hack/build-image/Dockerfile index 65378b22e..0a60e6a16 100644 --- a/hack/build-image/Dockerfile +++ b/hack/build-image/Dockerfile @@ -22,7 +22,7 @@ ENV GOPROXY=${GOPROXY} # kubebuilder test bundle is separated from kubebuilder. Need to setup it for CI test. # Using setup-envtest to download envtest binaries -RUN go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest && \ +RUN go install sigs.k8s.io/controller-runtime/tools/setup-envtest@v0.0.0-20260305094418-8122a6266696 && \ mkdir -p /usr/local/kubebuilder/bin && \ ENVTEST_ASSETS_DIR=$(setup-envtest use 1.33.0 --bin-dir /usr/local/kubebuilder/bin -p path) && \ cp -r ${ENVTEST_ASSETS_DIR}/* /usr/local/kubebuilder/bin/ diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action.go b/internal/delete/actions/csi/volumesnapshotcontent_action.go index 7a6724df1..98e0fc03b 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action.go @@ -18,7 +18,6 @@ package csi import ( "context" - "time" "github.com/google/uuid" snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" @@ -27,14 +26,11 @@ import ( corev1api "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/wait" crclient "sigs.k8s.io/controller-runtime/pkg/client" - velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/client" plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common" "github.com/vmware-tanzu/velero/pkg/plugin/velero" - "github.com/vmware-tanzu/velero/pkg/util/boolptr" kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube" ) @@ -71,7 +67,7 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( // So skip deleting VolumeSnapshotContent not have the backup name // in its labels. if !kubeutil.HasBackupLabel(&snapCont.ObjectMeta, input.Backup.Name) { - p.log.Info( + p.log.Infof( "VolumeSnapshotContent %s was not taken by backup %s, skipping deletion", snapCont.Name, input.Backup.Name, @@ -81,6 +77,17 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( p.log.Infof("Deleting VolumeSnapshotContent %s", snapCont.Name) + // Try to delete the original VSC from the cluster first. + // This handles legacy (pre-1.15) backups where the original VSC + // with DeletionPolicy=Retain still exists in the cluster. + originalVSCName := snapCont.Name + if cleaned := p.tryDeleteOriginalVSC(context.TODO(), originalVSCName); cleaned { + p.log.Infof("Successfully deleted original VolumeSnapshotContent %s from cluster, skipping temp VSC creation", originalVSCName) + return nil + } + + // create a temp VSC to trigger cloud snapshot deletion + // (for backups where the original VSC no longer exists in cluster) uuid, err := uuid.NewRandom() if err != nil { p.log.WithError(err).Errorf("Fail to generate the UUID to create VSC %s", snapCont.Name) @@ -114,71 +121,62 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute( if err := p.crClient.Create(context.TODO(), &snapCont); err != nil { return errors.Wrapf(err, "fail to create VolumeSnapshotContent %s", snapCont.Name) } + p.log.Infof("Created temp VolumeSnapshotContent %s with DeletionPolicy=Delete to trigger cloud snapshot cleanup", snapCont.Name) - // Read resource timeout from backup annotation, if not set, use default value. - timeout, err := time.ParseDuration( - input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation]) - if err != nil { - p.log.Warnf("fail to parse resource timeout annotation %s: %s", - input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation], err.Error()) - timeout = 10 * time.Minute - } - p.log.Debugf("resource timeout is set to %s", timeout.String()) - - interval := 5 * time.Second - - // Wait until VSC created and ReadyToUse is true. - if err := wait.PollUntilContextTimeout( - context.Background(), - interval, - timeout, - true, - func(ctx context.Context) (bool, error) { - return checkVSCReadiness(ctx, &snapCont, p.crClient) - }, - ); err != nil { - // Clean up the VSC we created since it can't become ready - if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) { - p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name) - } - return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name) - } - + // Delete the temp VSC immediately to trigger cloud snapshot removal. + // The CSI driver will handle the actual cloud snapshot deletion. if err := p.crClient.Delete( context.TODO(), &snapCont, ); err != nil && !apierrors.IsNotFound(err) { - p.log.Infof("VolumeSnapshotContent %s not found", snapCont.Name) + p.log.WithError(err).Errorf("Failed to delete temp VolumeSnapshotContent %s", snapCont.Name) return err } + p.log.Infof("Successfully triggered deletion of VolumeSnapshotContent %s and its cloud snapshot", snapCont.Name) return nil } -var checkVSCReadiness = func( +// tryDeleteOriginalVSC attempts to find and delete the original VSC from +// the cluster (legacy pre-1.15 backups). It patches the DeletionPolicy to +// Delete so the CSI driver also removes the cloud snapshot, then deletes +// the VSC object itself. +// Returns true if the original VSC was found and deletion was initiated. +func (p *volumeSnapshotContentDeleteItemAction) tryDeleteOriginalVSC( ctx context.Context, - vsc *snapshotv1api.VolumeSnapshotContent, - client crclient.Client, -) (bool, error) { - tmpVSC := new(snapshotv1api.VolumeSnapshotContent) - if err := client.Get(ctx, crclient.ObjectKeyFromObject(vsc), tmpVSC); err != nil { - return false, errors.Wrapf( - err, "failed to get VolumeSnapshotContent %s", vsc.Name, - ) + vscName string, +) bool { + existing := new(snapshotv1api.VolumeSnapshotContent) + if err := p.crClient.Get(ctx, crclient.ObjectKey{Name: vscName}, existing); err != nil { + if apierrors.IsNotFound(err) { + p.log.Debugf("Original VolumeSnapshotContent %s not found in cluster, will use temp VSC flow", vscName) + } else { + p.log.WithError(err).Warnf("Error looking up original VolumeSnapshotContent %s, will use temp VSC flow", vscName) + } + return false } - if tmpVSC.Status != nil && boolptr.IsSetToTrue(tmpVSC.Status.ReadyToUse) { - return true, nil + p.log.Debugf("Found original VolumeSnapshotContent %s in cluster (legacy backup), cleaning up directly", vscName) + + // Patch DeletionPolicy to Delete so the CSI driver removes the cloud snapshot + if existing.Spec.DeletionPolicy != snapshotv1api.VolumeSnapshotContentDelete { + original := existing.DeepCopy() + existing.Spec.DeletionPolicy = snapshotv1api.VolumeSnapshotContentDelete + if err := p.crClient.Patch(ctx, existing, crclient.MergeFrom(original)); err != nil { + p.log.WithError(err).Warnf("Failed to patch DeletionPolicy on original VSC %s, will use temp VSC flow", vscName) + return false + } + p.log.Debugf("Patched DeletionPolicy to Delete on original VolumeSnapshotContent %s", vscName) } - // Fail fast on permanent CSI driver errors (e.g., InvalidSnapshot.NotFound) - if tmpVSC.Status != nil && tmpVSC.Status.Error != nil && tmpVSC.Status.Error.Message != nil { - return false, errors.Errorf( - "VolumeSnapshotContent %s has error: %s", vsc.Name, *tmpVSC.Status.Error.Message, - ) + // Delete the original VSC — the CSI driver will clean up the cloud snapshot + if err := p.crClient.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + p.log.WithError(err).Warnf("Failed to delete original VolumeSnapshotContent %s, will use temp VSC flow", vscName) + return false } - return false, nil + p.log.Infof("Deleted original VolumeSnapshotContent %s with DeletionPolicy=Delete, CSI driver will remove cloud snapshot", vscName) + return true } func NewVolumeSnapshotContentDeleteItemAction( diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go index 7dbd6d7ff..114bd752f 100644 --- a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go +++ b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go @@ -22,9 +22,10 @@ import ( "testing" snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" - "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" + corev1api "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" @@ -37,19 +38,44 @@ import ( velerotest "github.com/vmware-tanzu/velero/pkg/test" ) +// fakeClientWithErrors wraps a real client and injects errors for specific operations. +type fakeClientWithErrors struct { + crclient.Client + getError error + patchError error + deleteError error +} + +func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error { + if c.getError != nil { + return c.getError + } + return c.Client.Get(ctx, key, obj, opts...) +} + +func (c *fakeClientWithErrors) Patch(ctx context.Context, obj crclient.Object, patch crclient.Patch, opts ...crclient.PatchOption) error { + if c.patchError != nil { + return c.patchError + } + return c.Client.Patch(ctx, obj, patch, opts...) +} + +func (c *fakeClientWithErrors) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error { + if c.deleteError != nil { + return c.deleteError + } + return c.Client.Delete(ctx, obj, opts...) +} + func TestVSCExecute(t *testing.T) { snapshotHandleStr := "test" tests := []struct { - name string - item runtime.Unstructured - vsc *snapshotv1api.VolumeSnapshotContent - backup *velerov1api.Backup - function func( - ctx context.Context, - vsc *snapshotv1api.VolumeSnapshotContent, - client crclient.Client, - ) (bool, error) - expectErr bool + name string + item runtime.Unstructured + vsc *snapshotv1api.VolumeSnapshotContent + backup *velerov1api.Backup + preExistingVSC *snapshotv1api.VolumeSnapshotContent + expectErr bool }{ { name: "VolumeSnapshotContent doesn't have backup label", @@ -71,40 +97,22 @@ func TestVSCExecute(t *testing.T) { { name: "Normal case, VolumeSnapshot should be deleted", vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).VolumeSnapshotClassName("volumesnapshotclass").Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), - backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), + backup: builder.ForBackup("velero", "backup").Result(), expectErr: false, - function: func( - ctx context.Context, - vsc *snapshotv1api.VolumeSnapshotContent, - client crclient.Client, - ) (bool, error) { - return true, nil - }, }, { - name: "Error case, deletion fails", + name: "Original VSC exists in cluster, cleaned up directly", vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), - backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), - expectErr: true, - function: func( - ctx context.Context, - vsc *snapshotv1api.VolumeSnapshotContent, - client crclient.Client, - ) (bool, error) { - return false, errors.Errorf("test error case") - }, - }, - { - name: "Error case with CSI error, dangling VSC should be cleaned up", - vsc: builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(), - backup: builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(), - expectErr: true, - function: func( - ctx context.Context, - vsc *snapshotv1api.VolumeSnapshotContent, - client crclient.Client, - ) (bool, error) { - return false, errors.Errorf("VolumeSnapshotContent %s has error: InvalidSnapshot.NotFound", vsc.Name) + backup: builder.ForBackup("velero", "backup").Result(), + expectErr: false, + preExistingVSC: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: snapshotv1api.VolumeSnapshotContentSpec{ + DeletionPolicy: snapshotv1api.VolumeSnapshotContentRetain, + Driver: "disk.csi.azure.com", + Source: snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: stringPtr("snap-123")}, + VolumeSnapshotRef: corev1api.ObjectReference{Name: "vs-1", Namespace: "default"}, + }, }, }, } @@ -113,7 +121,10 @@ func TestVSCExecute(t *testing.T) { t.Run(test.name, func(t *testing.T) { crClient := velerotest.NewFakeControllerRuntimeClient(t) logger := logrus.StandardLogger() - checkVSCReadiness = test.function + + if test.preExistingVSC != nil { + require.NoError(t, crClient.Create(t.Context(), test.preExistingVSC)) + } p := volumeSnapshotContentDeleteItemAction{log: logger, crClient: crClient} @@ -171,72 +182,147 @@ func TestNewVolumeSnapshotContentDeleteItemAction(t *testing.T) { require.NoError(t, err1) } -func TestCheckVSCReadiness(t *testing.T) { +func TestTryDeleteOriginalVSC(t *testing.T) { tests := []struct { name string - vsc *snapshotv1api.VolumeSnapshotContent - createVSC bool - expectErr bool - ready bool + vscName string + existing *snapshotv1api.VolumeSnapshotContent + createIt bool + expectRet bool }{ { - name: "VSC not exist", - vsc: &snapshotv1api.VolumeSnapshotContent{ - ObjectMeta: metav1.ObjectMeta{ - Name: "vsc-1", - Namespace: "velero", - }, - }, - createVSC: false, - expectErr: true, - ready: false, + name: "VSC not found in cluster, returns false", + vscName: "not-found", + expectRet: false, }, { - name: "VSC not ready", - vsc: &snapshotv1api.VolumeSnapshotContent{ - ObjectMeta: metav1.ObjectMeta{ - Name: "vsc-1", - Namespace: "velero", - }, - }, - createVSC: true, - expectErr: false, - ready: false, - }, - { - name: "VSC has error from CSI driver", - vsc: &snapshotv1api.VolumeSnapshotContent{ - ObjectMeta: metav1.ObjectMeta{ - Name: "vsc-1", - Namespace: "velero", - }, - Status: &snapshotv1api.VolumeSnapshotContentStatus{ - ReadyToUse: boolPtr(false), - Error: &snapshotv1api.VolumeSnapshotError{ - Message: stringPtr("InvalidSnapshot.NotFound: The snapshot 'snap-0abc123' does not exist."), + name: "VSC found with Retain policy, patches and deletes", + vscName: "legacy-vsc", + existing: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "legacy-vsc"}, + Spec: snapshotv1api.VolumeSnapshotContentSpec{ + DeletionPolicy: snapshotv1api.VolumeSnapshotContentRetain, + Driver: "disk.csi.azure.com", + Source: snapshotv1api.VolumeSnapshotContentSource{ + SnapshotHandle: stringPtr("snap-123"), + }, + VolumeSnapshotRef: corev1api.ObjectReference{ + Name: "vs-1", + Namespace: "default", }, }, }, - createVSC: true, - expectErr: true, - ready: false, + createIt: true, + expectRet: true, + }, + { + name: "VSC found with Delete policy already, just deletes", + vscName: "already-delete-vsc", + existing: &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "already-delete-vsc"}, + Spec: snapshotv1api.VolumeSnapshotContentSpec{ + DeletionPolicy: snapshotv1api.VolumeSnapshotContentDelete, + Driver: "disk.csi.azure.com", + Source: snapshotv1api.VolumeSnapshotContentSource{ + SnapshotHandle: stringPtr("snap-456"), + }, + VolumeSnapshotRef: corev1api.ObjectReference{ + Name: "vs-2", + Namespace: "default", + }, + }, + }, + createIt: true, + expectRet: true, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { crClient := velerotest.NewFakeControllerRuntimeClient(t) - if test.createVSC { - require.NoError(t, crClient.Create(t.Context(), test.vsc)) + logger := logrus.StandardLogger() + p := &volumeSnapshotContentDeleteItemAction{ + log: logger, + crClient: crClient, } - ready, err := checkVSCReadiness(t.Context(), test.vsc, crClient) - require.Equal(t, test.ready, ready) - if test.expectErr { - require.Error(t, err) + if test.createIt && test.existing != nil { + require.NoError(t, crClient.Create(t.Context(), test.existing)) + } + + result := p.tryDeleteOriginalVSC(t.Context(), test.vscName) + require.Equal(t, test.expectRet, result) + + // If cleanup succeeded, verify the VSC is gone + if test.expectRet { + err := crClient.Get(t.Context(), crclient.ObjectKey{Name: test.vscName}, + &snapshotv1api.VolumeSnapshotContent{}) + require.True(t, apierrors.IsNotFound(err), + "VSC should have been deleted from cluster") } }) } + + // Error injection tests for tryDeleteOriginalVSC + t.Run("Get returns non-NotFound error, returns false", func(t *testing.T) { + errClient := &fakeClientWithErrors{ + Client: velerotest.NewFakeControllerRuntimeClient(t), + getError: fmt.Errorf("connection refused"), + } + p := &volumeSnapshotContentDeleteItemAction{ + log: logrus.StandardLogger(), + crClient: errClient, + } + require.False(t, p.tryDeleteOriginalVSC(t.Context(), "some-vsc")) + }) + + t.Run("Patch fails, returns false", func(t *testing.T) { + realClient := velerotest.NewFakeControllerRuntimeClient(t) + vsc := &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "patch-fail-vsc"}, + Spec: snapshotv1api.VolumeSnapshotContentSpec{ + DeletionPolicy: snapshotv1api.VolumeSnapshotContentRetain, + Driver: "disk.csi.azure.com", + Source: snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: stringPtr("snap-789")}, + VolumeSnapshotRef: corev1api.ObjectReference{Name: "vs-3", Namespace: "default"}, + }, + } + require.NoError(t, realClient.Create(t.Context(), vsc)) + + errClient := &fakeClientWithErrors{ + Client: realClient, + patchError: fmt.Errorf("patch forbidden"), + } + p := &volumeSnapshotContentDeleteItemAction{ + log: logrus.StandardLogger(), + crClient: errClient, + } + require.False(t, p.tryDeleteOriginalVSC(t.Context(), "patch-fail-vsc")) + }) + + t.Run("Delete fails, returns false", func(t *testing.T) { + realClient := velerotest.NewFakeControllerRuntimeClient(t) + vsc := &snapshotv1api.VolumeSnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "delete-fail-vsc"}, + Spec: snapshotv1api.VolumeSnapshotContentSpec{ + DeletionPolicy: snapshotv1api.VolumeSnapshotContentDelete, + Driver: "disk.csi.azure.com", + Source: snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: stringPtr("snap-999")}, + VolumeSnapshotRef: corev1api.ObjectReference{Name: "vs-4", Namespace: "default"}, + }, + } + require.NoError(t, realClient.Create(t.Context(), vsc)) + + errClient := &fakeClientWithErrors{ + Client: realClient, + deleteError: fmt.Errorf("delete forbidden"), + } + p := &volumeSnapshotContentDeleteItemAction{ + log: logrus.StandardLogger(), + crClient: errClient, + } + require.False(t, p.tryDeleteOriginalVSC(t.Context(), "delete-fail-vsc")) + }) } func boolPtr(b bool) *bool { diff --git a/pkg/archive/extractor.go b/pkg/archive/extractor.go index 87d9ab413..aae9a7a85 100644 --- a/pkg/archive/extractor.go +++ b/pkg/archive/extractor.go @@ -19,8 +19,10 @@ package archive import ( "archive/tar" "compress/gzip" + "fmt" "io" "path/filepath" + "strings" "github.com/sirupsen/logrus" @@ -66,6 +68,16 @@ func (e *Extractor) writeFile(target string, tarRdr *tar.Reader) error { return nil } +// sanitizeArchivePath sanitizes archive file path from "G305: Zip Slip vulnerability" +func sanitizeArchivePath(destDir, sourcePath string) (targetPath string, err error) { + targetPath = filepath.Join(destDir, sourcePath) + if strings.HasPrefix(targetPath, filepath.Clean(destDir)) { + return targetPath, nil + } + + return "", fmt.Errorf("invalid archive path %q: escapes target directory", sourcePath) +} + func (e *Extractor) readBackup(tarRdr *tar.Reader) (string, error) { dir, err := e.fs.TempDir("", "") if err != nil { @@ -84,7 +96,11 @@ func (e *Extractor) readBackup(tarRdr *tar.Reader) (string, error) { return "", err } - target := filepath.Join(dir, header.Name) //nolint:gosec // Internal usage. No need to check. + target, err := sanitizeArchivePath(dir, header.Name) + if err != nil { + e.log.Infof("error sanitizing archive path: %s", err.Error()) + return "", err + } switch header.Typeflag { case tar.TypeDir: diff --git a/pkg/archive/extractor_test.go b/pkg/archive/extractor_test.go index 47cea734c..a4daf02ca 100644 --- a/pkg/archive/extractor_test.go +++ b/pkg/archive/extractor_test.go @@ -18,6 +18,7 @@ package archive import ( "archive/tar" + "bytes" "compress/gzip" "io" "os" @@ -87,6 +88,31 @@ func TestUnzipAndExtractBackup(t *testing.T) { } } +func TestUnzipAndExtractBackupRejectsPathTraversal(t *testing.T) { + ext := NewExtractor(test.NewLogger(), test.NewFakeFileSystem()) + + var buf bytes.Buffer + gzw := gzip.NewWriter(&buf) + tw := tar.NewWriter(gzw) + + err := tw.WriteHeader(&tar.Header{ + Name: "../escape.txt", + Mode: 0600, + Typeflag: tar.TypeReg, + Size: int64(len("data")), + }) + require.NoError(t, err) + + _, err = tw.Write([]byte("data")) + require.NoError(t, err) + require.NoError(t, tw.Close()) + require.NoError(t, gzw.Close()) + + _, err = ext.UnzipAndExtractBackup(&buf) + require.Error(t, err) + require.Contains(t, err.Error(), "invalid archive path") +} + func createArchive(files []string, fs filesystem.Interface) (string, error) { outName := "output.tar.gz" out, err := fs.Create(outName) diff --git a/pkg/cmd/cli/install/install.go b/pkg/cmd/cli/install/install.go index 83b84ce2f..8115e1353 100644 --- a/pkg/cmd/cli/install/install.go +++ b/pkg/cmd/cli/install/install.go @@ -381,8 +381,8 @@ This is useful as a starting point for more customized installations. # velero install --provider azure --plugins velero/velero-plugin-for-microsoft-azure:v1.0.0 --bucket $BLOB_CONTAINER --secret-file ./credentials-velero --backup-location-config resourceGroup=$AZURE_BACKUP_RESOURCE_GROUP,storageAccount=$AZURE_STORAGE_ACCOUNT_ID[,subscriptionId=$AZURE_BACKUP_SUBSCRIPTION_ID] --snapshot-location-config apiTimeout=[,resourceGroup=$AZURE_BACKUP_RESOURCE_GROUP,subscriptionId=$AZURE_BACKUP_SUBSCRIPTION_ID]`, Run: func(c *cobra.Command, args []string) { - cmd.CheckError(o.Validate(c, args, f)) cmd.CheckError(o.Complete(args, f)) + cmd.CheckError(o.Validate(c, args, f)) cmd.CheckError(o.Run(c, f)) }, } diff --git a/pkg/cmd/cli/install/install_test.go b/pkg/cmd/cli/install/install_test.go index c5d147646..1e0e7a4cf 100644 --- a/pkg/cmd/cli/install/install_test.go +++ b/pkg/cmd/cli/install/install_test.go @@ -17,11 +17,18 @@ limitations under the License. package install import ( + "context" "testing" + "github.com/spf13/cobra" "github.com/spf13/pflag" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1api "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + factorymocks "github.com/vmware-tanzu/velero/pkg/client/mocks" + velerotest "github.com/vmware-tanzu/velero/pkg/test" ) func TestPriorityClassNameFlag(t *testing.T) { @@ -91,3 +98,168 @@ func TestPriorityClassNameFlag(t *testing.T) { }) } } + +// makeValidateCmd returns a minimal *cobra.Command that satisfies output.ValidateFlags. +func makeValidateCmd() *cobra.Command { + c := &cobra.Command{} + // output.ValidateFlags only inspects the "output" flag; add it so validation passes. + c.Flags().StringP("output", "o", "", "output format") + return c +} + +// configMapInNamespace builds a ConfigMap with a single JSON data entry in the given namespace. +func configMapInNamespace(namespace, name, jsonValue string) *corev1api.ConfigMap { + return &corev1api.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Data: map[string]string{ + "config": jsonValue, + }, + } +} + +// TestValidateConfigMapsUseFactoryNamespace verifies that Validate resolves the target +// namespace correctly for all three ConfigMap flags. +// +// The fix (Option B) calls Complete before Validate in NewCommand so that o.Namespace is +// populated from f.Namespace() before VerifyJSONConfigs runs. Tests mirror that order by +// calling Complete before Validate. +func TestValidateConfigMapsUseFactoryNamespace(t *testing.T) { + const targetNS = "tenant-b" + const defaultNS = "default" + + // Shared options that satisfy every other validation gate: + // - NoDefaultBackupLocation=true + UseVolumeSnapshots=false skips provider/bucket/plugins checks + // - NoSecret=true satisfies the secret-file check + baseOptions := func() *Options { + o := NewInstallOptions() + o.NoDefaultBackupLocation = true + o.UseVolumeSnapshots = false + o.NoSecret = true + return o + } + + tests := []struct { + name string + setupOpts func(o *Options, cmName string) + cmJSON string + wantErrMsg string // substring expected in error; empty means success + }{ + { + name: "NodeAgentConfigMap found in factory namespace", + setupOpts: func(o *Options, cmName string) { + o.NodeAgentConfigMap = cmName + }, + cmJSON: `{}`, + }, + { + name: "NodeAgentConfigMap not found when only in default namespace", + setupOpts: func(o *Options, cmName string) { + o.NodeAgentConfigMap = cmName + }, + cmJSON: `{}`, + wantErrMsg: "--node-agent-configmap specified ConfigMap", + }, + { + name: "RepoMaintenanceJobConfigMap found in factory namespace", + setupOpts: func(o *Options, cmName string) { + o.RepoMaintenanceJobConfigMap = cmName + }, + cmJSON: `{}`, + }, + { + name: "RepoMaintenanceJobConfigMap not found when only in default namespace", + setupOpts: func(o *Options, cmName string) { + o.RepoMaintenanceJobConfigMap = cmName + }, + cmJSON: `{}`, + wantErrMsg: "--repo-maintenance-job-configmap specified ConfigMap", + }, + { + name: "BackupRepoConfigMap found in factory namespace", + setupOpts: func(o *Options, cmName string) { + o.BackupRepoConfigMap = cmName + }, + cmJSON: `{}`, + }, + { + name: "BackupRepoConfigMap not found when only in default namespace", + setupOpts: func(o *Options, cmName string) { + o.BackupRepoConfigMap = cmName + }, + cmJSON: `{}`, + wantErrMsg: "--backup-repository-configmap specified ConfigMap", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + const cmName = "my-config" + + // Decide where to place the ConfigMap: + // "not found" cases put it in "default", so the factory namespace lookup misses it. + cmNamespace := targetNS + if tc.wantErrMsg != "" { + cmNamespace = defaultNS + } + + cm := configMapInNamespace(cmNamespace, cmName, tc.cmJSON) + kbClient := velerotest.NewFakeControllerRuntimeClient(t, cm) + + f := &factorymocks.Factory{} + f.On("Namespace").Return(targetNS) + f.On("KubebuilderClient").Return(kbClient, nil) + + o := baseOptions() + tc.setupOpts(o, cmName) + + // Mirror the NewCommand call order: Complete populates o.Namespace before Validate runs. + require.NoError(t, o.Complete([]string{}, f)) + + c := makeValidateCmd() + c.SetContext(context.Background()) + + err := o.Validate(c, []string{}, f) + + if tc.wantErrMsg == "" { + require.NoError(t, err) + } else { + require.Error(t, err) + assert.Contains(t, err.Error(), tc.wantErrMsg) + } + }) + } +} + +// TestNewCommandRunClosureOrder covers the Run closure in NewCommand (the lines that were +// reordered by the fix: Complete → Validate → Run). +// +// The closure uses CheckError which calls os.Exit on any error, so the only safe path is one +// where all three steps return nil. DryRun=true causes o.Run to return after PrintWithFormat +// (which is a no-op when no --output flag is set) without touching any cluster clients. +func TestNewCommandRunClosureOrder(t *testing.T) { + const targetNS = "tenant-b" + const cmName = "my-config" + + cm := configMapInNamespace(targetNS, cmName, `{}`) + kbClient := velerotest.NewFakeControllerRuntimeClient(t, cm) + + f := &factorymocks.Factory{} + f.On("Namespace").Return(targetNS) + f.On("KubebuilderClient").Return(kbClient, nil) + + c := NewCommand(f) + c.SetArgs([]string{ + "--no-default-backup-location", + "--use-volume-snapshots=false", + "--no-secret", + "--dry-run", + "--node-agent-configmap", cmName, + }) + + // Execute drives the full Run closure: Complete populates o.Namespace, Validate + // looks up the ConfigMap in targetNS (succeeds), Run returns early via DryRun. + require.NoError(t, c.Execute()) +} diff --git a/pkg/podexec/pod_command_executor.go b/pkg/podexec/pod_command_executor.go index b1d7fbf59..0dbc28e95 100644 --- a/pkg/podexec/pod_command_executor.go +++ b/pkg/podexec/pod_command_executor.go @@ -20,6 +20,7 @@ import ( "bytes" "context" "net/url" + "slices" "time" "github.com/pkg/errors" @@ -184,10 +185,22 @@ func (e *defaultPodCommandExecutor) ExecutePodCommand(log logrus.FieldLogger, it } func ensureContainerExists(pod *corev1api.Pod, container string) error { - for _, c := range pod.Spec.Containers { - if c.Name == container { - return nil - } + existsAsMainContainer := slices.ContainsFunc(pod.Spec.Containers, func(c corev1api.Container) bool { + return c.Name == container + }) + + if existsAsMainContainer { + return nil + } + + existsAsSidecar := slices.ContainsFunc(pod.Spec.InitContainers, func(c corev1api.Container) bool { + return c.RestartPolicy != nil && + *c.RestartPolicy == corev1api.ContainerRestartPolicyAlways && + c.Name == container + }) + + if existsAsSidecar { + return nil } return errors.Errorf("no such container: %q", container) diff --git a/pkg/podexec/pod_command_executor_test.go b/pkg/podexec/pod_command_executor_test.go index e28eb0458..3286ba42d 100644 --- a/pkg/podexec/pod_command_executor_test.go +++ b/pkg/podexec/pod_command_executor_test.go @@ -35,6 +35,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" + "k8s.io/utils/ptr" v1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" velerotest "github.com/vmware-tanzu/velero/pkg/test" @@ -253,6 +254,15 @@ func TestEnsureContainerExists(t *testing.T) { Name: "foo", }, }, + InitContainers: []corev1api.Container{ + { + Name: "baz", + }, + { + Name: "qux", + RestartPolicy: ptr.To(corev1api.ContainerRestartPolicyAlways), + }, + }, }, } @@ -260,7 +270,13 @@ func TestEnsureContainerExists(t *testing.T) { require.EqualError(t, err, `no such container: "bar"`) err = ensureContainerExists(pod, "foo") - assert.NoError(t, err) + require.NoError(t, err) + + err = ensureContainerExists(pod, "baz") + require.EqualError(t, err, `no such container: "baz"`) + + err = ensureContainerExists(pod, "qux") + require.NoError(t, err) } func TestPodCompeted(t *testing.T) { diff --git a/test/e2e/repomaintenance/repo_maintenance_config.go b/test/e2e/repomaintenance/repo_maintenance_config.go index 1c616ea60..c0092c4bf 100644 --- a/test/e2e/repomaintenance/repo_maintenance_config.go +++ b/test/e2e/repomaintenance/repo_maintenance_config.go @@ -55,10 +55,12 @@ var GlobalRepoMaintenanceTest func() = TestFunc(&RepoMaintenanceTestCase{ jobConfigs: velerotypes.JobConfigs{ KeepLatestMaintenanceJobs: &keepJobNum, PodResources: &velerokubeutil.PodResources{ - CPURequest: "100m", - MemoryRequest: "100Mi", - CPULimit: "200m", - MemoryLimit: "200Mi", + CPURequest: "100m", + MemoryRequest: "100Mi", + EphemeralStorageRequest: "5Gi", + CPULimit: "200m", + MemoryLimit: "200Mi", + EphemeralStorageLimit: "10Gi", }, PriorityClassName: test.PriorityClassNameForRepoMaintenance, },