diff --git a/changelogs/unreleased/9791-christian-schlichtherle b/changelogs/unreleased/9791-christian-schlichtherle new file mode 100644 index 000000000..069423559 --- /dev/null +++ b/changelogs/unreleased/9791-christian-schlichtherle @@ -0,0 +1 @@ +Fix DataUploadDeleteAction creating snapshot-info ConfigMaps labeled with the wrong backup name when a DataUpload CR from another backup is incidentally captured in the backup tarball, which caused Kopia snapshots to be leaked in object storage on expiry of the real owning backup. diff --git a/pkg/datamover/dataupload_delete_action.go b/pkg/datamover/dataupload_delete_action.go index 1c09a20a2..6b36e1068 100644 --- a/pkg/datamover/dataupload_delete_action.go +++ b/pkg/datamover/dataupload_delete_action.go @@ -14,6 +14,7 @@ import ( velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" velerov2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1" + "github.com/vmware-tanzu/velero/pkg/label" "github.com/vmware-tanzu/velero/pkg/plugin/velero" repotypes "github.com/vmware-tanzu/velero/pkg/repository/types" ) @@ -35,6 +36,44 @@ func (d *DataUploadDeleteAction) Execute(input *velero.DeleteItemActionExecuteIn if err := runtime.DefaultUnstructuredConverter.FromUnstructured(input.Item.UnstructuredContent(), &du); err != nil { return errors.WithStack(errors.Wrapf(err, "failed to convert input.Item from unstructured")) } + // Only create a snapshot-info ConfigMap when the DataUpload's owning + // backup (its velero.io/backup-name label) matches the backup currently + // being deleted. Two other cases reach this code path and must be + // skipped, because the resulting CM would be unmatchable and only adds + // etcd churn: + // + // 1. The label is missing. We have no verifiable owner, so a CM created + // with the executing backup's label is a guess that deleteMovedSnapshots + // cannot rely on. + // 2. The label names a different backup. Velero does not support + // self-protection, so this almost always means the velero namespace + // was captured in a backup tarball and the DataUpload CR belongs to + // an unrelated backup. Creating a CM labeled with the executing + // backup mislabels the snapshot and causes the real owning backup's + // deleteMovedSnapshots query to miss it, leaking the Kopia snapshot + // in the object store. + // + // Both cases warn so misconfigured installs surface in logs. + owner := du.Labels[velerov1.BackupNameLabel] + switch { + case owner == "": + d.logger.Warnf( + "DataUpload %q has no %q label, so its owning backup cannot be verified; "+ + "skipping snapshot-info ConfigMap creation because a CM without a verifiable owner "+ + "cannot be matched back to its snapshot at backup deletion time.", + du.Name, velerov1.BackupNameLabel, + ) + return nil + case owner != label.GetValidName(input.Backup.Name): + d.logger.Warnf( + "DataUpload %q belongs to backup %q but is being deleted under backup %q; "+ + "this almost always means the velero namespace was included in a backup tarball. "+ + "Velero does not support self-protection — exclude the velero namespace from your schedules. "+ + "Skipping snapshot-info ConfigMap creation to avoid mislabeling.", + du.Name, owner, input.Backup.Name, + ) + return nil + } cm := genConfigmap(input.Backup, *du) if cm == nil { // will not fail the backup deletion diff --git a/pkg/datamover/dataupload_delete_action_test.go b/pkg/datamover/dataupload_delete_action_test.go new file mode 100644 index 000000000..cdbf4b006 --- /dev/null +++ b/pkg/datamover/dataupload_delete_action_test.go @@ -0,0 +1,171 @@ +/* +Copyright the Velero contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package datamover + +import ( + "fmt" + "strings" + "testing" + + "github.com/sirupsen/logrus" + logrustest "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1api "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + crclient "sigs.k8s.io/controller-runtime/pkg/client" + + velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + velerov2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1" + "github.com/vmware-tanzu/velero/pkg/builder" + "github.com/vmware-tanzu/velero/pkg/plugin/velero" + velerotest "github.com/vmware-tanzu/velero/pkg/test" +) + +func toUnstructured(t *testing.T, du *velerov2alpha1.DataUpload) runtime.Unstructured { + t.Helper() + m, err := runtime.DefaultUnstructuredConverter.ToUnstructured(du) + require.NoError(t, err) + return &unstructured.Unstructured{Object: m} +} + +func newCompletedDataUpload(name, ownerBackup string) *velerov2alpha1.DataUpload { + du := &velerov2alpha1.DataUpload{ + TypeMeta: metav1.TypeMeta{ + APIVersion: velerov2alpha1.SchemeGroupVersion.String(), + Kind: "DataUpload", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "velero", + Name: name, + }, + Spec: velerov2alpha1.DataUploadSpec{ + SnapshotType: velerov2alpha1.SnapshotTypeCSI, + SourcePVC: "my-pvc", + SourceNamespace: "app", + BackupStorageLocation: "default", + DataMover: "velero", + }, + Status: velerov2alpha1.DataUploadStatus{ + Phase: velerov2alpha1.DataUploadPhaseCompleted, + SnapshotID: "kopia-snapshot-id", + }, + } + if ownerBackup != "" { + du.Labels = map[string]string{velerov1.BackupNameLabel: ownerBackup} + } + return du +} + +func TestDataUploadDeleteActionAppliesTo(t *testing.T) { + a := NewDataUploadDeleteAction(logrus.StandardLogger(), nil) + selector, err := a.AppliesTo() + require.NoError(t, err) + require.Equal(t, velero.ResourceSelector{IncludedResources: []string{"datauploads.velero.io"}}, selector) +} + +func TestDataUploadDeleteActionExecute(t *testing.T) { + tests := []struct { + name string + duName string + duOwnerBackup string // value placed in velero.io/backup-name label on the DataUpload + executingBackup string // name of the Backup being deleted (input.Backup.Name) + wantConfigMap bool + wantWarnContains string // substring expected in a warn-level log entry; empty means no warn expected + }{ + { + name: "DataUpload owned by the executing backup creates a snapshot-info ConfigMap", + duName: "daily-backup-abcde", + duOwnerBackup: "daily-backup", + executingBackup: "daily-backup", + wantConfigMap: true, + wantWarnContains: "", + }, + { + name: "DataUpload owned by a different backup is skipped and a warning is logged", + duName: "daily-backup-abcde", + duOwnerBackup: "daily-backup", + executingBackup: "hourly-backup", + wantConfigMap: false, + wantWarnContains: "velero namespace", + }, + { + name: "DataUpload with no backup-name label is skipped and a warning is logged", + duName: "unlabeled-du", + duOwnerBackup: "", + executingBackup: "some-backup", + wantConfigMap: false, + wantWarnContains: "cannot be verified", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + crClient := velerotest.NewFakeControllerRuntimeClient(t) + logger, hook := logrustest.NewNullLogger() + logger.SetLevel(logrus.DebugLevel) + action := NewDataUploadDeleteAction(logger, crClient) + + du := newCompletedDataUpload(tc.duName, tc.duOwnerBackup) + backup := builder.ForBackup("velero", tc.executingBackup).StorageLocation("default").Result() + + err := action.Execute(&velero.DeleteItemActionExecuteInput{ + Item: toUnstructured(t, du), + Backup: backup, + }) + require.NoError(t, err) + + cm := &corev1api.ConfigMap{} + getErr := crClient.Get(t.Context(), crclient.ObjectKey{ + Namespace: backup.Namespace, + Name: fmt.Sprintf("%s-info", du.Name), + }, cm) + + if tc.wantConfigMap { + require.NoError(t, getErr, "expected snapshot-info ConfigMap to be created") + assert.Equal(t, tc.executingBackup, cm.Labels[velerov1.BackupNameLabel]) + assert.Equal(t, "true", cm.Labels[velerov1.DataUploadSnapshotInfoLabel]) + } else { + require.Error(t, getErr) + assert.True(t, apierrors.IsNotFound(getErr), + "expected no ConfigMap to be created for foreign DataUpload, but got: %v", getErr) + } + + // The action must surface DataUploads it cannot generate a useful + // snapshot-info ConfigMap for as warnings, so operators who + // accidentally included the velero namespace in a backup (or + // otherwise produced DataUploads without a verifiable owner) can + // detect the misconfiguration from logs instead of having the + // case silently swallowed. + var sawWarn bool + for _, entry := range hook.AllEntries() { + if entry.Level == logrus.WarnLevel && + strings.Contains(entry.Message, tc.duName) && + (tc.wantWarnContains == "" || strings.Contains(entry.Message, tc.wantWarnContains)) { + sawWarn = true + break + } + } + assert.Equal(t, tc.wantWarnContains != "", sawWarn, + "unexpected warn log presence (wantContains=%q, got=%v); entries=%v", + tc.wantWarnContains, sawWarn, hook.AllEntries()) + }) + } +}