From a5d32f29da98c7b54358501c70f037758334346d Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Fri, 10 Oct 2025 12:21:29 -0700 Subject: [PATCH 01/19] Sanitize Azure HTTP responses in BSL status messages Azure storage errors include verbose HTTP response details and XML in error messages, making the BSL status.message field cluttered and hard to read. This change adds sanitization to extract only the error code and meaningful message. Before: BackupStorageLocation "test" is unavailable: rpc error: code = Unknown desc = GET https://... RESPONSE 404: 404 The specified container does not exist. ERROR CODE: ContainerNotFound After: BackupStorageLocation "test" is unavailable: rpc error: code = Unknown desc = ContainerNotFound: The specified container does not exist. AWS and GCP error messages are preserved as-is since they don't contain verbose HTTP responses. Fixes #8368 Signed-off-by: Shubham Pampattiwar --- .../backup_storage_location_controller.go | 94 ++++++++++++++++++- ...backup_storage_location_controller_test.go | 78 +++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/pkg/controller/backup_storage_location_controller.go b/pkg/controller/backup_storage_location_controller.go index ba765607d..a8edb565e 100644 --- a/pkg/controller/backup_storage_location_controller.go +++ b/pkg/controller/backup_storage_location_controller.go @@ -18,6 +18,7 @@ package controller import ( "context" + "regexp" "strings" "time" @@ -46,6 +47,97 @@ const ( bslValidationEnqueuePeriod = 10 * time.Second ) +// sanitizeStorageError cleans up verbose HTTP responses from cloud provider errors, +// particularly Azure which includes full HTTP response details and XML in error messages. +// It extracts the error code and message while removing HTTP headers and response bodies. 
+func sanitizeStorageError(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	errMsg := err.Error()
+
+	// Check if this looks like an Azure HTTP response error
+	// Azure errors contain patterns like "RESPONSE 404:" and "ERROR CODE:"
+	if !strings.Contains(errMsg, "RESPONSE") || !strings.Contains(errMsg, "ERROR CODE:") {
+		// Not an Azure-style error, return as-is
+		return errMsg
+	}
+
+	// Extract the error code (e.g., "ContainerNotFound", "BlobNotFound")
+	errorCodeRegex := regexp.MustCompile(`ERROR CODE:\s*(\w+)`)
+	errorCodeMatch := errorCodeRegex.FindStringSubmatch(errMsg)
+	var errorCode string
+	if len(errorCodeMatch) > 1 {
+		errorCode = errorCodeMatch[1]
+	}
+
+	// Extract the error message from the XML or plain text
+	// Look for message between <Message> tags or after "RESPONSE XXX:"
+	var errorMessage string
+
+	// Try to extract from XML first
+	messageRegex := regexp.MustCompile(`<Message>(.*?)</Message>`)
+	messageMatch := messageRegex.FindStringSubmatch(errMsg)
+	if len(messageMatch) > 1 {
+		errorMessage = messageMatch[1]
+		// Remove RequestId and Time from the message
+		if idx := strings.Index(errorMessage, "\nRequestId:"); idx != -1 {
+			errorMessage = errorMessage[:idx]
+		}
+	} else {
+		// Try to extract from plain text response (e.g., "RESPONSE 404: 404 The specified container does not exist.")
+		responseRegex := regexp.MustCompile(`RESPONSE\s+\d+:\s+\d+\s+([^\n]+)`)
+		responseMatch := responseRegex.FindStringSubmatch(errMsg)
+		if len(responseMatch) > 1 {
+			errorMessage = strings.TrimSpace(responseMatch[1])
+		}
+	}
+
+	// Build a clean error message
+	var cleanMsg string
+	if errorCode != "" && errorMessage != "" {
+		cleanMsg = errorCode + ": " + errorMessage
+	} else if errorCode != "" {
+		cleanMsg = errorCode
+	} else if errorMessage != "" {
+		cleanMsg = errorMessage
+	} else {
+		// Fallback: try to extract the desc part from gRPC error
+		descRegex := regexp.MustCompile(`desc\s*=\s*(.+)`)
+		descMatch := descRegex.FindStringSubmatch(errMsg)
+		if len(descMatch) > 1 {
+			// Take everything up to the first newline or "RESPONSE" marker
+			desc := descMatch[1]
+			if idx := strings.Index(desc, "\n"); idx != -1 {
+				desc = desc[:idx]
+			}
+			if idx := strings.Index(desc, "RESPONSE"); idx != -1 {
+				desc = strings.TrimSpace(desc[:idx])
+			}
+			cleanMsg = desc
+		} else {
+			// Last resort: return first line
+			if idx := strings.Index(errMsg, "\n"); idx != -1 {
+				cleanMsg = errMsg[:idx]
+			} else {
+				cleanMsg = errMsg
+			}
+		}
+	}
+
+	// Preserve the prefix part of the error (e.g., "rpc error: code = Unknown desc = ")
+	// but replace the verbose description with our clean message
+	if strings.Contains(errMsg, "desc = ") {
+		parts := strings.SplitN(errMsg, "desc = ", 2)
+		if len(parts) == 2 {
+			return parts[0] + "desc = " + cleanMsg
+		}
+	}
+
+	return cleanMsg
+}
+
 // BackupStorageLocationReconciler reconciles a BackupStorageLocation object
 type backupStorageLocationReconciler struct {
 	ctx context.Context
@@ -127,7 +219,7 @@ func (r *backupStorageLocationReconciler) Reconcile(ctx context.Context, req ctr
 			err = errors.Wrapf(err, "BackupStorageLocation %q is unavailable", location.Name)
 			unavailableErrors = append(unavailableErrors, err.Error())
 			location.Status.Phase = velerov1api.BackupStorageLocationPhaseUnavailable
-			location.Status.Message = err.Error()
+			location.Status.Message = sanitizeStorageError(err)
 		} else {
 			log.Info("BackupStorageLocations is valid, marking as available")
 			location.Status.Phase = velerov1api.BackupStorageLocationPhaseAvailable
diff --git a/pkg/controller/backup_storage_location_controller_test.go b/pkg/controller/backup_storage_location_controller_test.go
index b07c4a1fb..0ff3c3b00 100644
--- a/pkg/controller/backup_storage_location_controller_test.go
+++ b/pkg/controller/backup_storage_location_controller_test.go
@@ -303,3 +303,81 @@ func TestBSLReconcile(t *testing.T) {
 		})
 	}
 }
+
+func TestSanitizeStorageError(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    error
+		expected string
+	}{
+		{
+			name:     "Nil error",
+			input:    nil,
+			expected: "",
+		},
+		{
+			name:     "Simple error without Azure formatting",
+			input:    errors.New("simple error message"),
+			expected: "simple error message",
+		},
+		{
+			name:     "AWS style error",
+			input:    errors.New("NoSuchBucket: The specified bucket does not exist"),
+			expected: "NoSuchBucket: The specified bucket does not exist",
+		},
+		{
+			name: "Azure container not found error with full HTTP response",
+			input: errors.New(`rpc error: code = Unknown desc = GET https://oadp100711zl59k.blob.core.windows.net/oadp100711zl59k1
+--------------------------------------------------------------------------------
+RESPONSE 404: 404 The specified container does not exist.
+ERROR CODE: ContainerNotFound
+--------------------------------------------------------------------------------
+<?xml version="1.0" encoding="utf-8"?><Error><Code>ContainerNotFound</Code><Message>The specified container does not exist.
+RequestId:63cf34d8-801e-0078-09b4-2e4682000000
+Time:2024-11-04T12:23:04.5623627Z</Message></Error>
+--------------------------------------------------------------------------------
+`),
+			expected: "rpc error: code = Unknown desc = ContainerNotFound: The specified container does not exist.",
+		},
+		{
+			name: "Azure blob not found error",
+			input: errors.New(`rpc error: code = Unknown desc = GET https://storage.blob.core.windows.net/container/blob
+--------------------------------------------------------------------------------
+RESPONSE 404: 404 The specified blob does not exist.
+ERROR CODE: BlobNotFound
+--------------------------------------------------------------------------------
+<?xml version="1.0" encoding="utf-8"?><Error><Code>BlobNotFound</Code><Message>The specified blob does not exist.
+RequestId:12345678-1234-1234-1234-123456789012
+Time:2024-11-04T12:23:04.5623627Z</Message></Error>
+--------------------------------------------------------------------------------
+`),
+			expected: "rpc error: code = Unknown desc = BlobNotFound: The specified blob does not exist.",
+		},
+		{
+			name: "Azure error with plain text response (no XML)",
+			input: errors.New(`rpc error: code = Unknown desc = GET https://storage.blob.core.windows.net/container
+--------------------------------------------------------------------------------
+RESPONSE 404: 404 The specified container does not exist.
+ERROR CODE: ContainerNotFound +-------------------------------------------------------------------------------- +`), + expected: "rpc error: code = Unknown desc = ContainerNotFound: The specified container does not exist.", + }, + { + name: "Azure error without XML message but with error code", + input: errors.New(`rpc error: code = Unknown desc = operation failed +RESPONSE 403: 403 Forbidden +ERROR CODE: AuthorizationFailure +-------------------------------------------------------------------------------- +`), + expected: "rpc error: code = Unknown desc = AuthorizationFailure: Forbidden", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual := sanitizeStorageError(test.input) + assert.Equal(t, test.expected, actual) + }) + } +} From 60dd3dc832296bad7bcd517f77c33bcc8fb3a24a Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Fri, 10 Oct 2025 12:25:30 -0700 Subject: [PATCH 02/19] Add changelog for PR #9321 Signed-off-by: Shubham Pampattiwar --- changelogs/unreleased/9321-shubham-pampattiwar | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9321-shubham-pampattiwar diff --git a/changelogs/unreleased/9321-shubham-pampattiwar b/changelogs/unreleased/9321-shubham-pampattiwar new file mode 100644 index 000000000..7fd9cec10 --- /dev/null +++ b/changelogs/unreleased/9321-shubham-pampattiwar @@ -0,0 +1 @@ +Sanitize Azure HTTP responses in BSL status messages From 20af2c20c5b41b0245f6eb8d51853a4761ac3583 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Wed, 12 Nov 2025 15:39:38 -0800 Subject: [PATCH 03/19] Address PR review comments: sanitize errors and add SAS token scrubbing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses three review comments on PR #9321: 1. Keep sanitization in controller (response to @ywk253100) - Maintaining centralized error handling for easier extension - Azure-specific patterns detected and others passed through unchanged 2. Sanitize unavailableErrors array (@priyansh17) - Now using sanitizeStorageError() for both unavailableErrors array and location.Status.Message for consistency 3. Add SAS token scrubbing (@anshulahuja98) - Scrubs Azure SAS token parameters to prevent credential leakage - Redacts: sig, se, st, sp, spr, sv, sr, sip, srt, ss - Example: ?sig=secret becomes ?sig=***REDACTED*** Added comprehensive test coverage for SAS token scrubbing with 4 new test cases covering various scenarios. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Shubham Pampattiwar --- .../backup_storage_location_controller.go | 9 ++++- ...backup_storage_location_controller_test.go | 34 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pkg/controller/backup_storage_location_controller.go b/pkg/controller/backup_storage_location_controller.go index a8edb565e..2e43a6187 100644 --- a/pkg/controller/backup_storage_location_controller.go +++ b/pkg/controller/backup_storage_location_controller.go @@ -50,6 +50,7 @@ const ( // sanitizeStorageError cleans up verbose HTTP responses from cloud provider errors, // particularly Azure which includes full HTTP response details and XML in error messages. // It extracts the error code and message while removing HTTP headers and response bodies. +// It also scrubs sensitive information like SAS tokens from URLs. 
func sanitizeStorageError(err error) string { if err == nil { return "" @@ -57,6 +58,12 @@ func sanitizeStorageError(err error) string { errMsg := err.Error() + // Scrub sensitive information from URLs (SAS tokens, credentials, etc.) + // Azure SAS token parameters: sig, se, st, sp, spr, sv, sr, sip, srt, ss + // These appear as query parameters in URLs like: ?sig=value&se=value + sasParamsRegex := regexp.MustCompile(`([?&])(sig|se|st|sp|spr|sv|sr|sip|srt|ss)=([^&\s<>\n]+)`) + errMsg = sasParamsRegex.ReplaceAllString(errMsg, `${1}${2}=***REDACTED***`) + // Check if this looks like an Azure HTTP response error // Azure errors contain patterns like "RESPONSE 404:" and "ERROR CODE:" if !strings.Contains(errMsg, "RESPONSE") || !strings.Contains(errMsg, "ERROR CODE:") { @@ -217,7 +224,7 @@ func (r *backupStorageLocationReconciler) Reconcile(ctx context.Context, req ctr if err != nil { log.Info("BackupStorageLocation is invalid, marking as unavailable") err = errors.Wrapf(err, "BackupStorageLocation %q is unavailable", location.Name) - unavailableErrors = append(unavailableErrors, err.Error()) + unavailableErrors = append(unavailableErrors, sanitizeStorageError(err)) location.Status.Phase = velerov1api.BackupStorageLocationPhaseUnavailable location.Status.Message = sanitizeStorageError(err) } else { diff --git a/pkg/controller/backup_storage_location_controller_test.go b/pkg/controller/backup_storage_location_controller_test.go index 0ff3c3b00..8a5dc4a4a 100644 --- a/pkg/controller/backup_storage_location_controller_test.go +++ b/pkg/controller/backup_storage_location_controller_test.go @@ -372,6 +372,40 @@ ERROR CODE: AuthorizationFailure `), expected: "rpc error: code = Unknown desc = AuthorizationFailure: Forbidden", }, + { + name: "Error with Azure SAS token in URL", + input: errors.New(`rpc error: code = Unknown desc = GET https://storage.blob.core.windows.net/backup?sv=2020-08-04&sig=abc123secrettoken&se=2024-12-31T23:59:59Z&sp=rwdl +-------------------------------------------------------------------------------- +RESPONSE 404: 404 The specified container does not exist. +ERROR CODE: ContainerNotFound +-------------------------------------------------------------------------------- +`), + expected: "rpc error: code = Unknown desc = ContainerNotFound: The specified container does not exist.", + }, + { + name: "Error with multiple SAS parameters", + input: errors.New(`GET https://mystorageaccount.blob.core.windows.net/container?sv=2020-08-04&ss=b&srt=sco&sp=rwdlac&se=2024-12-31&st=2024-01-01&sip=168.1.5.60&spr=https&sig=SIGNATURE_HASH`), + expected: "GET https://mystorageaccount.blob.core.windows.net/container?sv=***REDACTED***&ss=***REDACTED***&srt=***REDACTED***&sp=***REDACTED***&se=***REDACTED***&st=***REDACTED***&sip=***REDACTED***&spr=***REDACTED***&sig=***REDACTED***", + }, + { + name: "Simple URL without SAS tokens unchanged", + input: errors.New("GET https://storage.blob.core.windows.net/container/blob"), + expected: "GET https://storage.blob.core.windows.net/container/blob", + }, + { + name: "Azure error with SAS token in full HTTP response", + input: errors.New(`rpc error: code = Unknown desc = GET https://oadp100711zl59k.blob.core.windows.net/backup?sig=secretsignature123&se=2024-12-31 +-------------------------------------------------------------------------------- +RESPONSE 404: 404 The specified container does not exist. 
+ERROR CODE: ContainerNotFound
+--------------------------------------------------------------------------------
+<?xml version="1.0" encoding="utf-8"?><Error><Code>ContainerNotFound</Code><Message>The specified container does not exist.
+RequestId:63cf34d8-801e-0078-09b4-2e4682000000
+Time:2024-11-04T12:23:04.5623627Z</Message></Error>
+--------------------------------------------------------------------------------
+`),
+			expected: "rpc error: code = Unknown desc = ContainerNotFound: The specified container does not exist.",
+		},
 	}
 
 	for _, test := range tests {

From 096436507e6d8fdd31a7c937fc57e50db238899c Mon Sep 17 00:00:00 2001
From: Xun Jiang
Date: Wed, 26 Nov 2025 14:08:01 +0800
Subject: [PATCH 04/19] Remove VolumeSnapshotClass from CSI restore and
 deletion process.

Remove VolumeSnapshotClass from backup sync process.

Signed-off-by: Xun Jiang
---
 changelogs/unreleased/9431-blackpiglet        |  1 +
 .../csi/volumesnapshotcontent_action.go       |  8 +++
 .../csi/volumesnapshotcontent_action_test.go  |  4 +-
 .../actions/csi/volumesnapshot_action.go      | 29 ++++++----
 .../csi/volumesnapshotcontent_action.go       |  4 ++
 .../csi/volumesnapshotcontent_action_test.go  |  2 +-
 pkg/controller/backup_sync_controller.go      | 57 -------------------
 pkg/controller/backup_sync_controller_test.go |  3 -
 pkg/persistence/object_store.go               | 13 ++---
 .../actions/csi/volumesnapshot_action.go      |  8 +++
 .../actions/csi/volumesnapshot_action_test.go | 22 ++++---
 .../csi/volumesnapshotcontent_action.go       |  8 +++
 .../csi/volumesnapshotcontent_action_test.go  | 34 ++++++-----
 13 files changed, 90 insertions(+), 103 deletions(-)
 create mode 100644 changelogs/unreleased/9431-blackpiglet

diff --git a/changelogs/unreleased/9431-blackpiglet b/changelogs/unreleased/9431-blackpiglet
new file mode 100644
index 000000000..f82eb459c
--- /dev/null
+++ b/changelogs/unreleased/9431-blackpiglet
@@ -0,0 +1 @@
+Remove VolumeSnapshotClass from CSI B/R process.
diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action.go b/internal/delete/actions/csi/volumesnapshotcontent_action.go
index 582041533..d12c7c43a 100644
--- a/internal/delete/actions/csi/volumesnapshotcontent_action.go
+++ b/internal/delete/actions/csi/volumesnapshotcontent_action.go
@@ -103,6 +103,14 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
 
 	snapCont.ResourceVersion = ""
 
+	if snapCont.Spec.VolumeSnapshotClassName != nil {
+		// Delete VolumeSnapshotClass from the VolumeSnapshotContent.
+		// This is necessary to make the deletion independent of the VolumeSnapshotClass.
+		snapCont.Spec.VolumeSnapshotClassName = nil
+		p.log.Debugf("Deleted VolumeSnapshotClassName from VolumeSnapshotContent %s to make deletion independent of VolumeSnapshotClass",
+			snapCont.Name)
+	}
+
 	if err := p.crClient.Create(context.TODO(), &snapCont); err != nil {
 		return errors.Wrapf(err, "fail to create VolumeSnapshotContent %s", snapCont.Name)
 	}
diff --git a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go
index 1c84cf7bf..24baccdb2 100644
--- a/internal/delete/actions/csi/volumesnapshotcontent_action_test.go
+++ b/internal/delete/actions/csi/volumesnapshotcontent_action_test.go
@@ -70,7 +70,7 @@ func TestVSCExecute(t *testing.T) {
 		},
 		{
 			name:      "Normal case, VolumeSnapshot should be deleted",
-			vsc:       builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
+			vsc:       builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).VolumeSnapshotClassName("volumesnapshotclass").Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
 			backup:    builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(),
 			expectErr: false,
 			function: func(
 			},
 		},
 		{
-			name:      "Normal case, VolumeSnapshot should be deleted",
+			name:      "Error case, deletion fails",
 			vsc:       builder.ForVolumeSnapshotContent("bar").ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).Result(),
 			backup:    builder.ForBackup("velero", "backup").ObjectMeta(builder.WithAnnotationsMap(map[string]string{velerov1api.ResourceTimeoutAnnotation: "5s"})).Result(),
 			expectErr: true,
diff --git a/pkg/backup/actions/csi/volumesnapshot_action.go b/pkg/backup/actions/csi/volumesnapshot_action.go
index 69f881181..b7283b628 100644
--- a/pkg/backup/actions/csi/volumesnapshot_action.go
+++ b/pkg/backup/actions/csi/volumesnapshot_action.go
@@ -84,17 +84,6 @@ func (p *volumeSnapshotBackupItemAction) Execute(
 		return nil, nil, "", nil, errors.WithStack(err)
 	}
 
-	additionalItems := make([]velero.ResourceIdentifier, 0)
-	if vs.Spec.VolumeSnapshotClassName != nil {
-		additionalItems = append(
-			additionalItems,
-			velero.ResourceIdentifier{
-				GroupResource: kuberesource.VolumeSnapshotClasses,
-				Name:          *vs.Spec.VolumeSnapshotClassName,
-			},
-		)
-	}
-
 	if backup.Status.Phase == velerov1api.BackupPhaseFinalizing ||
 		backup.Status.Phase == velerov1api.BackupPhaseFinalizingPartiallyFailed {
 		p.log.
@@ -105,6 +94,24 @@ func (p *volumeSnapshotBackupItemAction) Execute(
 		return item, nil, "", nil, nil
 	}
 
+	additionalItems := make([]velero.ResourceIdentifier, 0)
+
+	if vs.Spec.VolumeSnapshotClassName != nil {
+		// The VolumeSnapshotClass still needs to be added to the backup, because
+		// the secret referenced by the VolumeSnapshotClass is still relevant to the backup.
+		additionalItems = append(
+			additionalItems,
+			velero.ResourceIdentifier{
+				GroupResource: kuberesource.VolumeSnapshotClasses,
+				Name:          *vs.Spec.VolumeSnapshotClassName,
+			},
+		)
+
+		// The async operation updates the VolumeSnapshot during the finalizing phase,
+		// so the VolumeSnapshotClass reference cannot be reliably removed here. Skip it;
+		// clearing the VolumeSnapshotClass during restore and delete is enough.
+	}
+
 	p.log.Infof("Getting VolumesnapshotContent for Volumesnapshot %s/%s",
 		vs.Namespace, vs.Name)
diff --git a/pkg/backup/actions/csi/volumesnapshotcontent_action.go b/pkg/backup/actions/csi/volumesnapshotcontent_action.go
index c2ab334e0..d4cd6d46c 100644
--- a/pkg/backup/actions/csi/volumesnapshotcontent_action.go
+++ b/pkg/backup/actions/csi/volumesnapshotcontent_action.go
@@ -97,6 +97,10 @@ func (p *volumeSnapshotContentBackupItemAction) Execute(
 		})
 	}
 
+	// The async operation updates the VolumeSnapshotContent during the finalizing phase,
+	// so the VolumeSnapshotClass reference cannot be reliably removed here. Skip it;
+	// clearing the VolumeSnapshotClass during restore and delete is enough.
+
 	snapContMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&snapCont)
 	if err != nil {
 		return nil, nil, "", nil, errors.WithStack(err)
diff --git a/pkg/controller/backup_sync_controller.go b/pkg/controller/backup_sync_controller.go
index 69a6b2ac1..b84ae6f0b 100644
--- a/pkg/controller/backup_sync_controller.go
+++ b/pkg/controller/backup_sync_controller.go
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"time"
 
-	snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -36,7 +35,6 @@ import (
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	"github.com/vmware-tanzu/velero/pkg/constant"
-	"github.com/vmware-tanzu/velero/pkg/features"
 	"github.com/vmware-tanzu/velero/pkg/label"
 	"github.com/vmware-tanzu/velero/pkg/persistence"
 	"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
@@ -243,31 +241,6 @@ func (b *backupSyncReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 				log.Debug("Synced pod volume backup into cluster")
 			}
 		}
-
-		if features.IsEnabled(velerov1api.CSIFeatureFlag) {
-			// we are syncing these objects only to ensure that the storage snapshots are cleaned up
-			// on backup deletion or expiry.
- log.Info("Syncing CSI VolumeSnapshotClasses in backup") - vsClasses, err := backupStore.GetCSIVolumeSnapshotClasses(backupName) - if err != nil { - log.WithError(errors.WithStack(err)).Error("Error getting CSI VolumeSnapClasses for this backup from backup store") - continue - } - for _, vsClass := range vsClasses { - vsClass.ResourceVersion = "" - err := b.client.Create(ctx, vsClass, &client.CreateOptions{}) - switch { - case err != nil && apierrors.IsAlreadyExists(err): - log.Debugf("VolumeSnapshotClass %s already exists in cluster", vsClass.Name) - continue - case err != nil && !apierrors.IsAlreadyExists(err): - log.WithError(errors.WithStack(err)).Errorf("Error syncing VolumeSnapshotClass %s into cluster", vsClass.Name) - continue - default: - log.Infof("Created CSI VolumeSnapshotClass %s", vsClass.Name) - } - } - } } b.deleteOrphanedBackups(ctx, location.Name, backupStoreBackups, log) @@ -364,40 +337,10 @@ func (b *backupSyncReconciler) deleteOrphanedBackups(ctx context.Context, locati if err := b.client.Delete(ctx, &backupList.Items[i], &client.DeleteOptions{}); err != nil { log.WithError(errors.WithStack(err)).Error("Error deleting orphaned backup from cluster") - } else { - log.Debug("Deleted orphaned backup from cluster") - b.deleteCSISnapshotsByBackup(ctx, backup.Name, log) } } } -func (b *backupSyncReconciler) deleteCSISnapshotsByBackup(ctx context.Context, backupName string, log logrus.FieldLogger) { - if !features.IsEnabled(velerov1api.CSIFeatureFlag) { - return - } - m := client.MatchingLabels{velerov1api.BackupNameLabel: label.GetValidName(backupName)} - var vsList snapshotv1api.VolumeSnapshotList - listOptions := &client.ListOptions{ - LabelSelector: label.NewSelectorForBackup(label.GetValidName(backupName)), - } - if err := b.client.List(ctx, &vsList, listOptions); err != nil { - log.WithError(err).Warnf("Failed to list volumesnapshots for backup: %s, the deletion will be skipped", backupName) - } else { - for i, vs := range vsList.Items { - name := kube.NamespaceAndName(vs.GetObjectMeta()) - log.Debugf("Deleting volumesnapshot %s", name) - if err := b.client.Delete(context.TODO(), &vsList.Items[i]); err != nil { - log.WithError(err).Warnf("Failed to delete volumesnapshot %s", name) - } - } - } - vsc := &snapshotv1api.VolumeSnapshotContent{} - log.Debugf("Deleting volumesnapshotcontents for backup: %s", backupName) - if err := b.client.DeleteAllOf(context.TODO(), vsc, m); err != nil { - log.WithError(err).Warnf("Failed to delete volumesnapshotcontents for backup: %s", backupName) - } -} - // backupSyncSourceOrderFunc returns a new slice with the default backup location first (if it exists), // followed by the rest of the locations in no particular order. func backupSyncSourceOrderFunc(objList client.ObjectList) client.ObjectList { diff --git a/pkg/controller/backup_sync_controller_test.go b/pkg/controller/backup_sync_controller_test.go index 0aa7f2818..fe440ff09 100644 --- a/pkg/controller/backup_sync_controller_test.go +++ b/pkg/controller/backup_sync_controller_test.go @@ -24,7 +24,6 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" - snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" "github.com/sirupsen/logrus" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -451,8 +450,6 @@ var _ = Describe("Backup Sync Reconciler", func() { backupStore.On("GetBackupMetadata", backup.backup.Name).Return(backup.backup, nil) backupStore.On("GetPodVolumeBackups", backup.backup.Name).Return(backup.podVolumeBackups, nil) backupStore.On("BackupExists", "bucket-1", backup.backup.Name).Return(true, nil) - backupStore.On("GetCSIVolumeSnapshotClasses", backup.backup.Name).Return([]*snapshotv1api.VolumeSnapshotClass{}, nil) - backupStore.On("GetCSIVolumeSnapshotContents", backup.backup.Name).Return([]*snapshotv1api.VolumeSnapshotContent{}, nil) } backupStore.On("ListBackups").Return(backupNames, nil) } diff --git a/pkg/persistence/object_store.go b/pkg/persistence/object_store.go index 44ad96118..edcd2fa49 100644 --- a/pkg/persistence/object_store.go +++ b/pkg/persistence/object_store.go @@ -266,13 +266,12 @@ func (s *objectBackupStore) PutBackup(info BackupInfo) error { // Since the logic for all of these files is the exact same except for the name and the contents, // use a map literal to iterate through them and write them to the bucket. var backupObjs = map[string]io.Reader{ - s.layout.getPodVolumeBackupsKey(info.Name): info.PodVolumeBackups, - s.layout.getBackupVolumeSnapshotsKey(info.Name): info.VolumeSnapshots, - s.layout.getBackupItemOperationsKey(info.Name): info.BackupItemOperations, - s.layout.getBackupResourceListKey(info.Name): info.BackupResourceList, - s.layout.getCSIVolumeSnapshotClassesKey(info.Name): info.CSIVolumeSnapshotClasses, - s.layout.getBackupResultsKey(info.Name): info.BackupResults, - s.layout.getBackupVolumeInfoKey(info.Name): info.BackupVolumeInfo, + s.layout.getPodVolumeBackupsKey(info.Name): info.PodVolumeBackups, + s.layout.getBackupVolumeSnapshotsKey(info.Name): info.VolumeSnapshots, + s.layout.getBackupItemOperationsKey(info.Name): info.BackupItemOperations, + s.layout.getBackupResourceListKey(info.Name): info.BackupResourceList, + s.layout.getBackupResultsKey(info.Name): info.BackupResults, + s.layout.getBackupVolumeInfoKey(info.Name): info.BackupVolumeInfo, } for key, reader := range backupObjs { diff --git a/pkg/restore/actions/csi/volumesnapshot_action.go b/pkg/restore/actions/csi/volumesnapshot_action.go index 50fb3f0ed..6d4bc1eda 100644 --- a/pkg/restore/actions/csi/volumesnapshot_action.go +++ b/pkg/restore/actions/csi/volumesnapshot_action.go @@ -103,6 +103,14 @@ func (p *volumeSnapshotRestoreItemAction) Execute( // DeletionPolicy to Retain. resetVolumeSnapshotAnnotation(&vs) + if vs.Spec.VolumeSnapshotClassName != nil { + // Delete VolumeSnapshotClass from the VolumeSnapshot. + // This is necessary to make the restore independent of the VolumeSnapshotClass. 
+ vs.Spec.VolumeSnapshotClassName = nil + p.log.Debugf("Deleted VolumeSnapshotClassName from VolumeSnapshot %s/%s to make restore independent of VolumeSnapshotClass", + vs.Namespace, vs.Name) + } + vsMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&vs) if err != nil { p.log.Errorf("Fail to convert VS %s to unstructured", vs.Namespace+"/"+vs.Name) diff --git a/pkg/restore/actions/csi/volumesnapshot_action_test.go b/pkg/restore/actions/csi/volumesnapshot_action_test.go index bec4f9582..4fb37e301 100644 --- a/pkg/restore/actions/csi/volumesnapshot_action_test.go +++ b/pkg/restore/actions/csi/volumesnapshot_action_test.go @@ -124,14 +124,20 @@ func TestVSExecute(t *testing.T) { }, { name: "Normal case, VSC should be created", - vs: builder.ForVolumeSnapshot("ns", "vsName").ObjectMeta( - builder.WithAnnotationsMap( - map[string]string{ - velerov1api.VolumeSnapshotHandleAnnotation: "vsc", - velerov1api.DriverNameAnnotation: "pd.csi.storage.gke.io", - }, - ), - ).SourceVolumeSnapshotContentName(newVscName).Status().BoundVolumeSnapshotContentName("vscName").Result(), + vs: builder.ForVolumeSnapshot("ns", "vsName"). + ObjectMeta( + builder.WithAnnotationsMap( + map[string]string{ + velerov1api.VolumeSnapshotHandleAnnotation: "vsc", + velerov1api.DriverNameAnnotation: "pd.csi.storage.gke.io", + }, + ), + ). + SourceVolumeSnapshotContentName(newVscName). + VolumeSnapshotClass("vscClass"). + Status(). + BoundVolumeSnapshotContentName("vscName"). + Result(), restore: builder.ForRestore("velero", "restore").ObjectMeta(builder.WithUID("restoreUID")).Result(), expectErr: false, expectedVS: builder.ForVolumeSnapshot("ns", "test").SourceVolumeSnapshotContentName(newVscName).Result(), diff --git a/pkg/restore/actions/csi/volumesnapshotcontent_action.go b/pkg/restore/actions/csi/volumesnapshotcontent_action.go index 9b7d5c878..0a268eab2 100644 --- a/pkg/restore/actions/csi/volumesnapshotcontent_action.go +++ b/pkg/restore/actions/csi/volumesnapshotcontent_action.go @@ -108,6 +108,14 @@ func (p *volumeSnapshotContentRestoreItemAction) Execute( return nil, errors.Errorf("fail to get snapshot handle from VSC %s status", vsc.Name) } + if vsc.Spec.VolumeSnapshotClassName != nil { + // Delete VolumeSnapshotClass from the VolumeSnapshotContent. + // This is necessary to make the restore independent of the VolumeSnapshotClass. + vsc.Spec.VolumeSnapshotClassName = nil + p.log.Debugf("Deleted VolumeSnapshotClassName from VolumeSnapshotContent %s to make restore independent of VolumeSnapshotClass", + vsc.Name) + } + additionalItems := []velero.ResourceIdentifier{} if csi.IsVolumeSnapshotContentHasDeleteSecret(&vsc) { additionalItems = append(additionalItems, diff --git a/pkg/restore/actions/csi/volumesnapshotcontent_action_test.go b/pkg/restore/actions/csi/volumesnapshotcontent_action_test.go index 85fd1a092..83b79bd5b 100644 --- a/pkg/restore/actions/csi/volumesnapshotcontent_action_test.go +++ b/pkg/restore/actions/csi/volumesnapshotcontent_action_test.go @@ -55,13 +55,17 @@ func TestVSCExecute(t *testing.T) { }, { name: "Normal case, additional items should return ", - vsc: builder.ForVolumeSnapshotContent("test").ObjectMeta(builder.WithAnnotationsMap( - map[string]string{ - velerov1api.PrefixedSecretNameAnnotation: "name", - velerov1api.PrefixedSecretNamespaceAnnotation: "namespace", - }, - )).VolumeSnapshotRef("velero", "vsName", "vsUID"). - Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleName}).Result(), + vsc: builder.ForVolumeSnapshotContent("test"). 
+ ObjectMeta(builder.WithAnnotationsMap( + map[string]string{ + velerov1api.PrefixedSecretNameAnnotation: "name", + velerov1api.PrefixedSecretNamespaceAnnotation: "namespace", + }, + )). + VolumeSnapshotRef("velero", "vsName", "vsUID"). + VolumeSnapshotClassName("vsClass"). + Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleName}). + Result(), restore: builder.ForRestore("velero", "restore").ObjectMeta(builder.WithUID("restoreUID")). NamespaceMappings("velero", "restore").Result(), expectErr: false, @@ -72,15 +76,17 @@ func TestVSCExecute(t *testing.T) { Name: "name", }, }, - expectedVSC: builder.ForVolumeSnapshotContent(newVscName).ObjectMeta(builder.WithAnnotationsMap( - map[string]string{ - velerov1api.PrefixedSecretNameAnnotation: "name", - velerov1api.PrefixedSecretNamespaceAnnotation: "namespace", - }, - )).VolumeSnapshotRef("restore", newVscName, ""). + expectedVSC: builder.ForVolumeSnapshotContent(newVscName). + ObjectMeta(builder.WithAnnotationsMap( + map[string]string{ + velerov1api.PrefixedSecretNameAnnotation: "name", + velerov1api.PrefixedSecretNamespaceAnnotation: "namespace", + }, + )).VolumeSnapshotRef("restore", newVscName, ""). Source(snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: &snapshotHandleName}). DeletionPolicy(snapshotv1api.VolumeSnapshotContentRetain). - Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleName}).Result(), + Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleName}). + Result(), }, { name: "VSC exists in cluster, same as the normal case", From a1026cb5319a9b543d5f9ee2f2e349578b9c4bb8 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Fri, 12 Dec 2025 12:25:45 -0500 Subject: [PATCH 05/19] Update golangci-lint installation script URL to use HEAD for latest version (#9451) Signed-off-by: Tiger Kaovilai --- hack/build-image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/build-image/Dockerfile b/hack/build-image/Dockerfile index 68a34fbf0..89477d2fb 100644 --- a/hack/build-image/Dockerfile +++ b/hack/build-image/Dockerfile @@ -94,7 +94,7 @@ RUN ARCH=$(go env GOARCH) && \ chmod +x /usr/bin/goreleaser # get golangci-lint -RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v2.5.0 +RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s -- -b $(go env GOPATH)/bin v2.5.0 # install kubectl RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/$(go env GOARCH)/kubectl From bcdc30b59ae8ed3e36017eb33170c84ddf7d5493 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 2 Dec 2025 12:27:35 -0800 Subject: [PATCH 06/19] Add PVC-to-Pod cache to improve volume policy performance The GetPodsUsingPVC function had O(N*M) complexity - for each PVC, it listed ALL pods in the namespace and iterated through each pod. With many PVCs and pods, this caused significant performance degradation (2+ seconds per PV in some cases). This change introduces a PVC-to-Pod cache that is built once per backup and reused for all PVC lookups, reducing complexity from O(N*M) to O(N+M). 
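A minimal sketch of the resulting lookup pattern (illustrative only: ctx, kbClient, and
the namespace/PVC names are placeholders; the helpers are the ones added in this commit):

    // Build the PVC-to-Pod mapping once per backup: one pod List per namespace.
    cache := podvolume.NewPVCPodCache()
    if err := cache.BuildCacheForNamespaces(ctx, []string{"ns-1", "ns-2"}, kbClient); err != nil {
        cache = nil // graceful fallback: lookups revert to per-PVC pod listings
    }

    // With a built cache this is a map access; a nil or unbuilt cache
    // falls back to the original GetPodsUsingPVC listing behavior.
    pods, err := podvolume.GetPodsUsingPVCWithCache("ns-1", "data-pvc", kbClient, cache)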
Changes: - Add PVCPodCache struct with thread-safe caching in podvolume pkg - Add NewVolumeHelperImplWithCache constructor for cache support - Build cache before backup item processing in backup.go - Add comprehensive unit tests for cache functionality - Graceful fallback to direct lookups if cache fails Fixes #9179 Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper.go | 30 +- pkg/backup/backup.go | 19 +- pkg/util/podvolume/pod_volume.go | 140 +++++++ pkg/util/podvolume/pod_volume_test.go | 366 ++++++++++++++++++ 4 files changed, 553 insertions(+), 2 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper.go b/internal/volumehelper/volume_policy_helper.go index 520523099..1808427fb 100644 --- a/internal/volumehelper/volume_policy_helper.go +++ b/internal/volumehelper/volume_policy_helper.go @@ -33,6 +33,9 @@ type volumeHelperImpl struct { // to the volume policy check, but fs-backup is based on the pod resource, // the resource filter on PVC and PV doesn't work on this scenario. backupExcludePVC bool + // pvcPodCache provides cached PVC to Pod mappings for improved performance. + // When there are many PVCs and pods, using this cache avoids O(N*M) lookups. + pvcPodCache *podvolumeutil.PVCPodCache } func NewVolumeHelperImpl( @@ -50,6 +53,29 @@ func NewVolumeHelperImpl( client: client, defaultVolumesToFSBackup: defaultVolumesToFSBackup, backupExcludePVC: backupExcludePVC, + pvcPodCache: nil, // Cache will be nil by default for backward compatibility + } +} + +// NewVolumeHelperImplWithCache creates a VolumeHelper with a PVC-to-Pod cache for improved performance. +// The cache should be built before backup processing begins. +func NewVolumeHelperImplWithCache( + volumePolicy *resourcepolicies.Policies, + snapshotVolumes *bool, + logger logrus.FieldLogger, + client crclient.Client, + defaultVolumesToFSBackup bool, + backupExcludePVC bool, + pvcPodCache *podvolumeutil.PVCPodCache, +) VolumeHelper { + return &volumeHelperImpl{ + volumePolicy: volumePolicy, + snapshotVolumes: snapshotVolumes, + logger: logger, + client: client, + defaultVolumesToFSBackup: defaultVolumesToFSBackup, + backupExcludePVC: backupExcludePVC, + pvcPodCache: pvcPodCache, } } @@ -105,10 +131,12 @@ func (v *volumeHelperImpl) ShouldPerformSnapshot(obj runtime.Unstructured, group // If this PV is claimed, see if we've already taken a (pod volume backup) // snapshot of the contents of this PV. If so, don't take a snapshot. if pv.Spec.ClaimRef != nil { - pods, err := podvolumeutil.GetPodsUsingPVC( + // Use cached lookup if available for better performance with many PVCs/pods + pods, err := podvolumeutil.GetPodsUsingPVCWithCache( pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name, v.client, + v.pvcPodCache, ) if err != nil { v.logger.WithError(err).Errorf("fail to get pod for PV %s", pv.Name) diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index de4947afb..23601c310 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -64,6 +64,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/util/boolptr" "github.com/vmware-tanzu/velero/pkg/util/collections" "github.com/vmware-tanzu/velero/pkg/util/kube" + podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) // BackupVersion is the current backup major version for Velero. @@ -408,6 +409,21 @@ func (kb *kubernetesBackupper) BackupWithResolvers( } backupRequest.Status.Progress = &velerov1api.BackupProgress{TotalItems: len(items)} + // Build PVC-to-Pod cache for improved volume policy lookup performance. 
+ // This avoids O(N*M) complexity when there are many PVCs and pods. + // See issue #9179 for details. + pvcPodCache := podvolumeutil.NewPVCPodCache() + namespaces := backupRequest.NamespaceIncludesExcludes.GetIncludes() + if len(namespaces) > 0 { + if err := pvcPodCache.BuildCacheForNamespaces(context.Background(), namespaces, kb.kbClient); err != nil { + // Log warning but continue - the cache will fall back to direct lookups + log.WithError(err).Warn("Failed to build PVC-to-Pod cache, falling back to direct lookups") + pvcPodCache = nil + } else { + log.Infof("Built PVC-to-Pod cache for %d namespaces", len(namespaces)) + } + } + itemBackupper := &itemBackupper{ backupRequest: backupRequest, tarWriter: tw, @@ -422,13 +438,14 @@ func (kb *kubernetesBackupper) BackupWithResolvers( PodCommandExecutor: kb.podCommandExecutor, }, hookTracker: hook.NewHookTracker(), - volumeHelperImpl: volumehelper.NewVolumeHelperImpl( + volumeHelperImpl: volumehelper.NewVolumeHelperImplWithCache( backupRequest.ResPolicies, backupRequest.Spec.SnapshotVolumes, log, kb.kbClient, boolptr.IsSetToTrue(backupRequest.Spec.DefaultVolumesToFsBackup), !backupRequest.ResourceIncludesExcludes.ShouldInclude(kuberesource.PersistentVolumeClaims.String()), + pvcPodCache, ), kubernetesBackupper: kb, } diff --git a/pkg/util/podvolume/pod_volume.go b/pkg/util/podvolume/pod_volume.go index 526d7e484..aa03e1b2e 100644 --- a/pkg/util/podvolume/pod_volume.go +++ b/pkg/util/podvolume/pod_volume.go @@ -19,6 +19,7 @@ package podvolume import ( "context" "strings" + "sync" "github.com/pkg/errors" corev1api "k8s.io/api/core/v1" @@ -29,6 +30,89 @@ import ( "github.com/vmware-tanzu/velero/pkg/util" ) +// PVCPodCache provides a cached mapping from PVC to the pods that use it. +// This cache is built once per backup to avoid repeated pod listings which +// cause O(N*M) performance issues when there are many PVCs and pods. +type PVCPodCache struct { + mu sync.RWMutex + // cache maps namespace -> pvcName -> []Pod + cache map[string]map[string][]corev1api.Pod + // built indicates whether the cache has been populated + built bool +} + +// NewPVCPodCache creates a new empty PVC to Pod cache. +func NewPVCPodCache() *PVCPodCache { + return &PVCPodCache{ + cache: make(map[string]map[string][]corev1api.Pod), + built: false, + } +} + +// BuildCacheForNamespaces builds the cache by listing pods once per namespace. +// This is much more efficient than listing pods for each PVC lookup. +func (c *PVCPodCache) BuildCacheForNamespaces( + ctx context.Context, + namespaces []string, + crClient crclient.Client, +) error { + c.mu.Lock() + defer c.mu.Unlock() + + for _, ns := range namespaces { + podList := new(corev1api.PodList) + if err := crClient.List( + ctx, + podList, + &crclient.ListOptions{Namespace: ns}, + ); err != nil { + return errors.Wrapf(err, "failed to list pods in namespace %s", ns) + } + + if c.cache[ns] == nil { + c.cache[ns] = make(map[string][]corev1api.Pod) + } + + // Build mapping from PVC name to pods + for i := range podList.Items { + pod := podList.Items[i] + for _, v := range pod.Spec.Volumes { + if v.PersistentVolumeClaim != nil { + pvcName := v.PersistentVolumeClaim.ClaimName + c.cache[ns][pvcName] = append(c.cache[ns][pvcName], pod) + } + } + } + } + + c.built = true + return nil +} + +// GetPodsUsingPVC retrieves pods using a specific PVC from the cache. +// Returns nil slice if the PVC is not found in the cache. 
+func (c *PVCPodCache) GetPodsUsingPVC(namespace, pvcName string) []corev1api.Pod { + c.mu.RLock() + defer c.mu.RUnlock() + + if nsPods, ok := c.cache[namespace]; ok { + if pods, ok := nsPods[pvcName]; ok { + // Return a copy to avoid race conditions + result := make([]corev1api.Pod, len(pods)) + copy(result, pods) + return result + } + } + return nil +} + +// IsBuilt returns true if the cache has been built. +func (c *PVCPodCache) IsBuilt() bool { + c.mu.RLock() + defer c.mu.RUnlock() + return c.built +} + // GetVolumesByPod returns a list of volume names to backup for the provided pod. func GetVolumesByPod(pod *corev1api.Pod, defaultVolumesToFsBackup, backupExcludePVC bool, volsToProcessByLegacyApproach []string) ([]string, []string) { // tracks the volumes that have been explicitly opted out of backup via the annotation in the pod @@ -109,12 +193,44 @@ func GetVolumesToExclude(obj metav1.Object) []string { return strings.Split(annotations[velerov1api.VolumesToExcludeAnnotation], ",") } +// IsPVCDefaultToFSBackup checks if a PVC should default to fs-backup based on pod annotations. +// Deprecated: Use IsPVCDefaultToFSBackupWithCache for better performance. func IsPVCDefaultToFSBackup(pvcNamespace, pvcName string, crClient crclient.Client, defaultVolumesToFsBackup bool) (bool, error) { pods, err := GetPodsUsingPVC(pvcNamespace, pvcName, crClient) if err != nil { return false, errors.WithStack(err) } + return checkPodsForFSBackup(pods, pvcName, defaultVolumesToFsBackup) +} + +// IsPVCDefaultToFSBackupWithCache is the cached version of IsPVCDefaultToFSBackup. +// If cache is nil or not built, it falls back to the non-cached version. +func IsPVCDefaultToFSBackupWithCache( + pvcNamespace, pvcName string, + crClient crclient.Client, + defaultVolumesToFsBackup bool, + cache *PVCPodCache, +) (bool, error) { + var pods []corev1api.Pod + var err error + + // Use cache if available, otherwise fall back to direct lookup + if cache != nil && cache.IsBuilt() { + pods = cache.GetPodsUsingPVC(pvcNamespace, pvcName) + } else { + pods, err = GetPodsUsingPVC(pvcNamespace, pvcName, crClient) + if err != nil { + return false, errors.WithStack(err) + } + } + + return checkPodsForFSBackup(pods, pvcName, defaultVolumesToFsBackup) +} + +// checkPodsForFSBackup is a helper function that checks if any pod using the PVC +// has the volume selected for fs-backup. +func checkPodsForFSBackup(pods []corev1api.Pod, pvcName string, defaultVolumesToFsBackup bool) (bool, error) { for index := range pods { vols, _ := GetVolumesByPod(&pods[index], defaultVolumesToFsBackup, false, []string{}) if len(vols) > 0 { @@ -140,6 +256,9 @@ func getPodVolumeNameForPVC(pod corev1api.Pod, pvcName string) (string, error) { return "", errors.Errorf("Pod %s/%s does not use PVC %s/%s", pod.Namespace, pod.Name, pod.Namespace, pvcName) } +// GetPodsUsingPVC returns all pods in the given namespace that use the specified PVC. +// This function lists all pods in the namespace and filters them, which can be slow +// when there are many pods. Consider using GetPodsUsingPVCWithCache for better performance. func GetPodsUsingPVC( pvcNamespace, pvcName string, crClient crclient.Client, @@ -165,6 +284,27 @@ func GetPodsUsingPVC( return podsUsingPVC, nil } +// GetPodsUsingPVCWithCache returns all pods that use the specified PVC. +// If cache is available and built, it uses the cache for O(1) lookup. +// Otherwise, it falls back to the original GetPodsUsingPVC function. 
+func GetPodsUsingPVCWithCache( + pvcNamespace, pvcName string, + crClient crclient.Client, + cache *PVCPodCache, +) ([]corev1api.Pod, error) { + // Use cache if available + if cache != nil && cache.IsBuilt() { + pods := cache.GetPodsUsingPVC(pvcNamespace, pvcName) + if pods == nil { + return []corev1api.Pod{}, nil + } + return pods, nil + } + + // Fall back to direct lookup + return GetPodsUsingPVC(pvcNamespace, pvcName, crClient) +} + func GetVolumesToProcess(volumes []corev1api.Volume, volsToProcessByLegacyApproach []string) []corev1api.Volume { volsToProcess := make([]corev1api.Volume, 0) diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index 8406c04d8..6732904ef 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -886,3 +886,369 @@ func TestGetVolumesToProcess(t *testing.T) { }) } } + +func TestPVCPodCache_BuildAndGet(t *testing.T) { + objs := []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + { + Name: "vol2", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc2", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + EmptyDir: &corev1api.EmptyDirVolumeSource{}, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod4", + Namespace: "other-ns", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) 
+ + testCases := []struct { + name string + namespaces []string + pvcNamespace string + pvcName string + expectedPodCount int + }{ + { + name: "should find 2 pods using pvc1 in default namespace", + namespaces: []string{"default", "other-ns"}, + pvcNamespace: "default", + pvcName: "pvc1", + expectedPodCount: 2, + }, + { + name: "should find 1 pod using pvc2 in default namespace", + namespaces: []string{"default", "other-ns"}, + pvcNamespace: "default", + pvcName: "pvc2", + expectedPodCount: 1, + }, + { + name: "should find 1 pod using pvc1 in other-ns", + namespaces: []string{"default", "other-ns"}, + pvcNamespace: "other-ns", + pvcName: "pvc1", + expectedPodCount: 1, + }, + { + name: "should find 0 pods for non-existent PVC", + namespaces: []string{"default", "other-ns"}, + pvcNamespace: "default", + pvcName: "non-existent", + expectedPodCount: 0, + }, + { + name: "should find 0 pods for non-existent namespace", + namespaces: []string{"default", "other-ns"}, + pvcNamespace: "non-existent-ns", + pvcName: "pvc1", + expectedPodCount: 0, + }, + { + name: "should find 0 pods when namespace not in cache", + namespaces: []string{"default"}, + pvcNamespace: "other-ns", + pvcName: "pvc1", + expectedPodCount: 0, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + cache := NewPVCPodCache() + err := cache.BuildCacheForNamespaces(t.Context(), tc.namespaces, fakeClient) + require.NoError(t, err) + assert.True(t, cache.IsBuilt()) + + pods := cache.GetPodsUsingPVC(tc.pvcNamespace, tc.pvcName) + assert.Len(t, pods, tc.expectedPodCount, "unexpected number of pods") + }) + } +} + +func TestGetPodsUsingPVCWithCache(t *testing.T) { + objs := []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) 
+ + testCases := []struct { + name string + pvcNamespace string + pvcName string + buildCache bool + useNilCache bool + expectedPodCount int + }{ + { + name: "returns cached results when cache is available", + pvcNamespace: "default", + pvcName: "pvc1", + buildCache: true, + useNilCache: false, + expectedPodCount: 2, + }, + { + name: "falls back to direct lookup when cache is nil", + pvcNamespace: "default", + pvcName: "pvc1", + buildCache: false, + useNilCache: true, + expectedPodCount: 2, + }, + { + name: "falls back to direct lookup when cache is not built", + pvcNamespace: "default", + pvcName: "pvc1", + buildCache: false, + useNilCache: false, + expectedPodCount: 2, + }, + { + name: "returns empty slice for non-existent PVC with cache", + pvcNamespace: "default", + pvcName: "non-existent", + buildCache: true, + useNilCache: false, + expectedPodCount: 0, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var cache *PVCPodCache + if !tc.useNilCache { + cache = NewPVCPodCache() + if tc.buildCache { + err := cache.BuildCacheForNamespaces(t.Context(), []string{"default"}, fakeClient) + require.NoError(t, err) + } + } + + pods, err := GetPodsUsingPVCWithCache(tc.pvcNamespace, tc.pvcName, fakeClient, cache) + require.NoError(t, err) + assert.Len(t, pods, tc.expectedPodCount, "unexpected number of pods") + }) + } +} + +func TestIsPVCDefaultToFSBackupWithCache(t *testing.T) { + objs := []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + Annotations: map[string]string{ + "backup.velero.io/backup-volumes": "vol1", + }, + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "default", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc2", + }, + }, + }, + }, + }, + }, + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) 
+ + testCases := []struct { + name string + pvcNamespace string + pvcName string + defaultVolumesToFsBackup bool + buildCache bool + useNilCache bool + expectedResult bool + }{ + { + name: "returns true for PVC with opt-in annotation using cache", + pvcNamespace: "default", + pvcName: "pvc1", + defaultVolumesToFsBackup: false, + buildCache: true, + useNilCache: false, + expectedResult: true, + }, + { + name: "returns false for PVC without annotation using cache", + pvcNamespace: "default", + pvcName: "pvc2", + defaultVolumesToFsBackup: false, + buildCache: true, + useNilCache: false, + expectedResult: false, + }, + { + name: "returns true for any PVC with defaultVolumesToFsBackup using cache", + pvcNamespace: "default", + pvcName: "pvc2", + defaultVolumesToFsBackup: true, + buildCache: true, + useNilCache: false, + expectedResult: true, + }, + { + name: "falls back to direct lookup when cache is nil", + pvcNamespace: "default", + pvcName: "pvc1", + defaultVolumesToFsBackup: false, + buildCache: false, + useNilCache: true, + expectedResult: true, + }, + { + name: "returns false for non-existent PVC", + pvcNamespace: "default", + pvcName: "non-existent", + defaultVolumesToFsBackup: false, + buildCache: true, + useNilCache: false, + expectedResult: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var cache *PVCPodCache + if !tc.useNilCache { + cache = NewPVCPodCache() + if tc.buildCache { + err := cache.BuildCacheForNamespaces(t.Context(), []string{"default"}, fakeClient) + require.NoError(t, err) + } + } + + result, err := IsPVCDefaultToFSBackupWithCache(tc.pvcNamespace, tc.pvcName, fakeClient, tc.defaultVolumesToFsBackup, cache) + require.NoError(t, err) + assert.Equal(t, tc.expectedResult, result) + }) + } +} From 60203ad01b962c9d62c4a5ea0dfe4e5b3b062525 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 2 Dec 2025 12:30:02 -0800 Subject: [PATCH 07/19] Add changelog for PR #9441 Signed-off-by: Shubham Pampattiwar --- changelogs/unreleased/9441-shubham-pampattiwar | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9441-shubham-pampattiwar diff --git a/changelogs/unreleased/9441-shubham-pampattiwar b/changelogs/unreleased/9441-shubham-pampattiwar new file mode 100644 index 000000000..b7e821a0b --- /dev/null +++ b/changelogs/unreleased/9441-shubham-pampattiwar @@ -0,0 +1 @@ +Add PVC-to-Pod cache to improve volume policy performance From 26053ae6d684a82ea4a480077a04bf1f68559961 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Fri, 5 Dec 2025 12:19:49 -0800 Subject: [PATCH 08/19] Add unit tests for VolumeHelperImpl with PVC-to-Pod cache Add TestVolumeHelperImplWithCache_ShouldPerformSnapshot to verify: - Volume policy match with cache returns correct snapshot decision - fs-backup via opt-out with cache properly skips snapshot - Fallback to direct lookup when cache is not built These tests verify the cache-enabled code path added in the previous commit for improved volume policy performance. 
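Sketch of the fixture pattern these tests follow (illustrative: fakeClient, policies, and
pvObj stand in for the fake controller-runtime client, the parsed resource policies, and
an unstructured PersistentVolume):

    cache := podvolumeutil.NewPVCPodCache()
    require.NoError(t, cache.BuildCacheForNamespaces(t.Context(), []string{"ns"}, fakeClient))

    vh := NewVolumeHelperImplWithCache(
        policies,                // *resourcepolicies.Policies
        ptr.To(true),            // snapshotVolumes
        logrus.StandardLogger(), // logger
        fakeClient,              // crclient.Client
        false,                   // defaultVolumesToFSBackup
        false,                   // backupExcludePVC
        cache,                   // *podvolumeutil.PVCPodCache
    )

    shouldSnapshot, err := vh.ShouldPerformSnapshot(pvObj, kuberesource.PersistentVolumes)
    require.NoError(t, err)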
Signed-off-by: Shubham Pampattiwar --- .../volumehelper/volume_policy_helper_test.go | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 7dd8ae361..c736e46b1 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -34,6 +34,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/builder" "github.com/vmware-tanzu/velero/pkg/kuberesource" velerotest "github.com/vmware-tanzu/velero/pkg/test" + podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) func TestVolumeHelperImpl_ShouldPerformSnapshot(t *testing.T) { @@ -738,3 +739,149 @@ func TestGetVolumeFromResource(t *testing.T) { assert.ErrorContains(t, err, "resource is not a PersistentVolume or Volume") }) } + +func TestVolumeHelperImplWithCache_ShouldPerformSnapshot(t *testing.T) { + testCases := []struct { + name string + inputObj runtime.Object + groupResource schema.GroupResource + pod *corev1api.Pod + resourcePolicies *resourcepolicies.ResourcePolicies + snapshotVolumesFlag *bool + defaultVolumesToFSBackup bool + buildCache bool + shouldSnapshot bool + expectedErr bool + }{ + { + name: "VolumePolicy match with cache, returns true", + inputObj: builder.ForPersistentVolume("example-pv").StorageClass("gp2-csi").ClaimRef("ns", "pvc-1").Result(), + groupResource: kuberesource.PersistentVolumes, + resourcePolicies: &resourcepolicies.ResourcePolicies{ + Version: "v1", + VolumePolicies: []resourcepolicies.VolumePolicy{ + { + Conditions: map[string]any{ + "storageClass": []string{"gp2-csi"}, + }, + Action: resourcepolicies.Action{ + Type: resourcepolicies.Snapshot, + }, + }, + }, + }, + snapshotVolumesFlag: ptr.To(true), + buildCache: true, + shouldSnapshot: true, + expectedErr: false, + }, + { + name: "VolumePolicy not match, fs-backup via opt-out with cache, skips snapshot", + inputObj: builder.ForPersistentVolume("example-pv").StorageClass("gp3-csi").ClaimRef("ns", "pvc-1").Result(), + groupResource: kuberesource.PersistentVolumes, + pod: builder.ForPod("ns", "pod-1").Volumes( + &corev1api.Volume{ + Name: "volume", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }, + ).Result(), + resourcePolicies: &resourcepolicies.ResourcePolicies{ + Version: "v1", + VolumePolicies: []resourcepolicies.VolumePolicy{ + { + Conditions: map[string]any{ + "storageClass": []string{"gp2-csi"}, + }, + Action: resourcepolicies.Action{ + Type: resourcepolicies.Snapshot, + }, + }, + }, + }, + snapshotVolumesFlag: ptr.To(true), + defaultVolumesToFSBackup: true, + buildCache: true, + shouldSnapshot: false, + expectedErr: false, + }, + { + name: "Cache not built, falls back to direct lookup", + inputObj: builder.ForPersistentVolume("example-pv").StorageClass("gp2-csi").ClaimRef("ns", "pvc-1").Result(), + groupResource: kuberesource.PersistentVolumes, + resourcePolicies: &resourcepolicies.ResourcePolicies{ + Version: "v1", + VolumePolicies: []resourcepolicies.VolumePolicy{ + { + Conditions: map[string]any{ + "storageClass": []string{"gp2-csi"}, + }, + Action: resourcepolicies.Action{ + Type: resourcepolicies.Snapshot, + }, + }, + }, + }, + snapshotVolumesFlag: ptr.To(true), + buildCache: false, + shouldSnapshot: true, + expectedErr: false, + }, + } + + objs := []runtime.Object{ + &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: 
"pvc-1", + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) + if tc.pod != nil { + require.NoError(t, fakeClient.Create(t.Context(), tc.pod)) + } + + var p *resourcepolicies.Policies + if tc.resourcePolicies != nil { + p = &resourcepolicies.Policies{} + err := p.BuildPolicy(tc.resourcePolicies) + require.NoError(t, err) + } + + var cache *podvolumeutil.PVCPodCache + if tc.buildCache { + cache = podvolumeutil.NewPVCPodCache() + err := cache.BuildCacheForNamespaces(t.Context(), []string{"ns"}, fakeClient) + require.NoError(t, err) + } + + vh := NewVolumeHelperImplWithCache( + p, + tc.snapshotVolumesFlag, + logrus.StandardLogger(), + fakeClient, + tc.defaultVolumesToFSBackup, + false, + cache, + ) + + obj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(tc.inputObj) + require.NoError(t, err) + + actualShouldSnapshot, actualError := vh.ShouldPerformSnapshot(&unstructured.Unstructured{Object: obj}, tc.groupResource) + if tc.expectedErr { + require.Error(t, actualError) + return + } + require.NoError(t, actualError) + require.Equalf(t, tc.shouldSnapshot, actualShouldSnapshot, "Want shouldSnapshot as %t; Got shouldSnapshot as %t", tc.shouldSnapshot, actualShouldSnapshot) + }) + } +} From a43f14b0712f27d7667d27f8eb7bc3a91374daf7 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Fri, 5 Dec 2025 12:21:59 -0800 Subject: [PATCH 09/19] Add unit tests for ShouldPerformFSBackup with PVC-to-Pod cache Add TestVolumeHelperImplWithCache_ShouldPerformFSBackup to verify: - Volume policy match with cache returns correct fs-backup decision - Volume policy match with snapshot action skips fs-backup - Fallback to direct lookup when cache is not built Signed-off-by: Shubham Pampattiwar --- .../volumehelper/volume_policy_helper_test.go | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index c736e46b1..2a02b675f 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -885,3 +885,153 @@ func TestVolumeHelperImplWithCache_ShouldPerformSnapshot(t *testing.T) { }) } } + +func TestVolumeHelperImplWithCache_ShouldPerformFSBackup(t *testing.T) { + testCases := []struct { + name string + pod *corev1api.Pod + resources []runtime.Object + resourcePolicies *resourcepolicies.ResourcePolicies + snapshotVolumesFlag *bool + defaultVolumesToFSBackup bool + buildCache bool + shouldFSBackup bool + expectedErr bool + }{ + { + name: "VolumePolicy match with cache, return true", + pod: builder.ForPod("ns", "pod-1"). + Volumes( + &corev1api.Volume{ + Name: "vol-1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }).Result(), + resources: []runtime.Object{ + builder.ForPersistentVolumeClaim("ns", "pvc-1"). + VolumeName("pv-1"). 
+ StorageClass("gp2-csi").Phase(corev1api.ClaimBound).Result(), + builder.ForPersistentVolume("pv-1").StorageClass("gp2-csi").Result(), + }, + resourcePolicies: &resourcepolicies.ResourcePolicies{ + Version: "v1", + VolumePolicies: []resourcepolicies.VolumePolicy{ + { + Conditions: map[string]any{ + "storageClass": []string{"gp2-csi"}, + }, + Action: resourcepolicies.Action{ + Type: resourcepolicies.FSBackup, + }, + }, + }, + }, + buildCache: true, + shouldFSBackup: true, + expectedErr: false, + }, + { + name: "VolumePolicy match with cache, action is snapshot, return false", + pod: builder.ForPod("ns", "pod-1"). + Volumes( + &corev1api.Volume{ + Name: "vol-1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }).Result(), + resources: []runtime.Object{ + builder.ForPersistentVolumeClaim("ns", "pvc-1"). + VolumeName("pv-1"). + StorageClass("gp2-csi").Phase(corev1api.ClaimBound).Result(), + builder.ForPersistentVolume("pv-1").StorageClass("gp2-csi").Result(), + }, + resourcePolicies: &resourcepolicies.ResourcePolicies{ + Version: "v1", + VolumePolicies: []resourcepolicies.VolumePolicy{ + { + Conditions: map[string]any{ + "storageClass": []string{"gp2-csi"}, + }, + Action: resourcepolicies.Action{ + Type: resourcepolicies.Snapshot, + }, + }, + }, + }, + buildCache: true, + shouldFSBackup: false, + expectedErr: false, + }, + { + name: "Cache not built, falls back to direct lookup, opt-in annotation", + pod: builder.ForPod("ns", "pod-1"). + ObjectMeta(builder.WithAnnotations(velerov1api.VolumesToBackupAnnotation, "vol-1")). + Volumes( + &corev1api.Volume{ + Name: "vol-1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }).Result(), + resources: []runtime.Object{ + builder.ForPersistentVolumeClaim("ns", "pvc-1"). + VolumeName("pv-1"). + StorageClass("gp2-csi").Phase(corev1api.ClaimBound).Result(), + builder.ForPersistentVolume("pv-1").StorageClass("gp2-csi").Result(), + }, + buildCache: false, + defaultVolumesToFSBackup: false, + shouldFSBackup: true, + expectedErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, tc.resources...) 
+ if tc.pod != nil { + require.NoError(t, fakeClient.Create(t.Context(), tc.pod)) + } + + var p *resourcepolicies.Policies + if tc.resourcePolicies != nil { + p = &resourcepolicies.Policies{} + err := p.BuildPolicy(tc.resourcePolicies) + require.NoError(t, err) + } + + var cache *podvolumeutil.PVCPodCache + if tc.buildCache { + cache = podvolumeutil.NewPVCPodCache() + err := cache.BuildCacheForNamespaces(t.Context(), []string{"ns"}, fakeClient) + require.NoError(t, err) + } + + vh := NewVolumeHelperImplWithCache( + p, + tc.snapshotVolumesFlag, + logrus.StandardLogger(), + fakeClient, + tc.defaultVolumesToFSBackup, + false, + cache, + ) + + actualShouldFSBackup, actualError := vh.ShouldPerformFSBackup(tc.pod.Spec.Volumes[0], *tc.pod) + if tc.expectedErr { + require.Error(t, actualError) + return + } + require.NoError(t, actualError) + require.Equalf(t, tc.shouldFSBackup, actualShouldFSBackup, "Want shouldFSBackup as %t; Got shouldFSBackup as %t", tc.shouldFSBackup, actualShouldFSBackup) + }) + } +} From 8e580996747045da4ba2a8f8e1db30ba292b83a6 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Fri, 5 Dec 2025 12:23:28 -0800 Subject: [PATCH 10/19] Add test for cache usage without volume policy Add test case to verify that the PVC-to-Pod cache is used even when no volume policy is configured. When defaultVolumesToFSBackup is true, the cache is used to find pods using the PVC to determine if fs-backup should be used instead of snapshot. Signed-off-by: Shubham Pampattiwar --- .../volumehelper/volume_policy_helper_test.go | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 2a02b675f..862081725 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -830,6 +830,27 @@ func TestVolumeHelperImplWithCache_ShouldPerformSnapshot(t *testing.T) { shouldSnapshot: true, expectedErr: false, }, + { + name: "No volume policy, defaultVolumesToFSBackup with cache, skips snapshot", + inputObj: builder.ForPersistentVolume("example-pv").StorageClass("gp2-csi").ClaimRef("ns", "pvc-1").Result(), + groupResource: kuberesource.PersistentVolumes, + pod: builder.ForPod("ns", "pod-1").Volumes( + &corev1api.Volume{ + Name: "volume", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }, + ).Result(), + resourcePolicies: nil, + snapshotVolumesFlag: ptr.To(true), + defaultVolumesToFSBackup: true, + buildCache: true, + shouldSnapshot: false, + expectedErr: false, + }, } objs := []runtime.Object{ From 041e5e2a7ec46c1c4530a3b175af86a09ad4f1bd Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 9 Dec 2025 18:00:26 -0800 Subject: [PATCH 11/19] Address review feedback - Use ResolveNamespaceList() instead of GetIncludes() for more accurate namespace resolution when building the PVC-to-Pod cache - Refactor NewVolumeHelperImpl to call NewVolumeHelperImplWithCache with nil cache parameter to avoid code duplication Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper.go | 18 +++++++++--------- pkg/backup/backup.go | 7 +++++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper.go b/internal/volumehelper/volume_policy_helper.go index 1808427fb..b4c3b467e 100644 --- a/internal/volumehelper/volume_policy_helper.go +++ 
b/internal/volumehelper/volume_policy_helper.go @@ -46,15 +46,15 @@ func NewVolumeHelperImpl( defaultVolumesToFSBackup bool, backupExcludePVC bool, ) VolumeHelper { - return &volumeHelperImpl{ - volumePolicy: volumePolicy, - snapshotVolumes: snapshotVolumes, - logger: logger, - client: client, - defaultVolumesToFSBackup: defaultVolumesToFSBackup, - backupExcludePVC: backupExcludePVC, - pvcPodCache: nil, // Cache will be nil by default for backward compatibility - } + return NewVolumeHelperImplWithCache( + volumePolicy, + snapshotVolumes, + logger, + client, + defaultVolumesToFSBackup, + backupExcludePVC, + nil, + ) } // NewVolumeHelperImplWithCache creates a VolumeHelper with a PVC-to-Pod cache for improved performance. diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index 23601c310..db6eec3a4 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -413,8 +413,11 @@ func (kb *kubernetesBackupper) BackupWithResolvers( // This avoids O(N*M) complexity when there are many PVCs and pods. // See issue #9179 for details. pvcPodCache := podvolumeutil.NewPVCPodCache() - namespaces := backupRequest.NamespaceIncludesExcludes.GetIncludes() - if len(namespaces) > 0 { + namespaces, err := backupRequest.NamespaceIncludesExcludes.ResolveNamespaceList() + if err != nil { + log.WithError(err).Warn("Failed to resolve namespace list for PVC-to-Pod cache, falling back to direct lookups") + pvcPodCache = nil + } else if len(namespaces) > 0 { if err := pvcPodCache.BuildCacheForNamespaces(context.Background(), namespaces, kb.kbClient); err != nil { // Log warning but continue - the cache will fall back to direct lookups log.WithError(err).Warn("Failed to build PVC-to-Pod cache, falling back to direct lookups") From 99e821a87079a4e7e3eec7a1ed3911123e655671 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Thu, 11 Dec 2025 00:00:13 -0800 Subject: [PATCH 12/19] Address review feedback: move cache building to volumehelper - Rename NewVolumeHelperImplWithCache to NewVolumeHelperImplWithNamespaces - Move cache building logic from backup.go into volumehelper - Return error from NewVolumeHelperImplWithNamespaces if cache build fails - Remove fallback in main backup path - backup fails if cache build fails - Update NewVolumeHelperImpl to call NewVolumeHelperImplWithNamespaces - Add comments clarifying fallback is only used by plugins - Update tests for new error return signature This addresses review comments from @Lyndon-Li and @kaovilai: - Cache building is now encapsulated in volumehelper - No fallback in main backup path ensures predictable performance - Code reuse between constructors Fixes #9179 Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper.go | 30 ++++++++++--- .../volumehelper/volume_policy_helper_test.go | 23 +++++----- pkg/backup/backup.go | 43 ++++++++----------- pkg/util/podvolume/pod_volume.go | 4 ++ 4 files changed, 56 insertions(+), 44 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper.go b/internal/volumehelper/volume_policy_helper.go index b4c3b467e..22a88a146 100644 --- a/internal/volumehelper/volume_policy_helper.go +++ b/internal/volumehelper/volume_policy_helper.go @@ -1,6 +1,7 @@ package volumehelper import ( + "context" "fmt" "strings" @@ -46,7 +47,9 @@ func NewVolumeHelperImpl( defaultVolumesToFSBackup bool, backupExcludePVC bool, ) VolumeHelper { - return NewVolumeHelperImplWithCache( + // Pass nil namespaces - no cache will be built, so this never fails. 
+ // This is used by plugins that don't need the cache optimization. + vh, _ := NewVolumeHelperImplWithNamespaces( volumePolicy, snapshotVolumes, logger, @@ -55,19 +58,32 @@ func NewVolumeHelperImpl( backupExcludePVC, nil, ) + return vh } -// NewVolumeHelperImplWithCache creates a VolumeHelper with a PVC-to-Pod cache for improved performance. -// The cache should be built before backup processing begins. -func NewVolumeHelperImplWithCache( +// NewVolumeHelperImplWithNamespaces creates a VolumeHelper with a PVC-to-Pod cache for improved performance. +// The cache is built internally from the provided namespaces list. +// This avoids O(N*M) complexity when there are many PVCs and pods. +// See issue #9179 for details. +// Returns an error if cache building fails - callers should not proceed with backup in this case. +func NewVolumeHelperImplWithNamespaces( volumePolicy *resourcepolicies.Policies, snapshotVolumes *bool, logger logrus.FieldLogger, client crclient.Client, defaultVolumesToFSBackup bool, backupExcludePVC bool, - pvcPodCache *podvolumeutil.PVCPodCache, -) VolumeHelper { + namespaces []string, +) (VolumeHelper, error) { + var pvcPodCache *podvolumeutil.PVCPodCache + if len(namespaces) > 0 { + pvcPodCache = podvolumeutil.NewPVCPodCache() + if err := pvcPodCache.BuildCacheForNamespaces(context.Background(), namespaces, client); err != nil { + return nil, err + } + logger.Infof("Built PVC-to-Pod cache for %d namespaces", len(namespaces)) + } + return &volumeHelperImpl{ volumePolicy: volumePolicy, snapshotVolumes: snapshotVolumes, @@ -76,7 +92,7 @@ func NewVolumeHelperImplWithCache( defaultVolumesToFSBackup: defaultVolumesToFSBackup, backupExcludePVC: backupExcludePVC, pvcPodCache: pvcPodCache, - } + }, nil } func (v *volumeHelperImpl) ShouldPerformSnapshot(obj runtime.Unstructured, groupResource schema.GroupResource) (bool, error) { diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 862081725..57c99e862 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -34,7 +34,6 @@ import ( "github.com/vmware-tanzu/velero/pkg/builder" "github.com/vmware-tanzu/velero/pkg/kuberesource" velerotest "github.com/vmware-tanzu/velero/pkg/test" - podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) func TestVolumeHelperImpl_ShouldPerformSnapshot(t *testing.T) { @@ -876,22 +875,21 @@ func TestVolumeHelperImplWithCache_ShouldPerformSnapshot(t *testing.T) { require.NoError(t, err) } - var cache *podvolumeutil.PVCPodCache + var namespaces []string if tc.buildCache { - cache = podvolumeutil.NewPVCPodCache() - err := cache.BuildCacheForNamespaces(t.Context(), []string{"ns"}, fakeClient) - require.NoError(t, err) + namespaces = []string{"ns"} } - vh := NewVolumeHelperImplWithCache( + vh, err := NewVolumeHelperImplWithNamespaces( p, tc.snapshotVolumesFlag, logrus.StandardLogger(), fakeClient, tc.defaultVolumesToFSBackup, false, - cache, + namespaces, ) + require.NoError(t, err) obj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(tc.inputObj) require.NoError(t, err) @@ -1029,22 +1027,21 @@ func TestVolumeHelperImplWithCache_ShouldPerformFSBackup(t *testing.T) { require.NoError(t, err) } - var cache *podvolumeutil.PVCPodCache + var namespaces []string if tc.buildCache { - cache = podvolumeutil.NewPVCPodCache() - err := cache.BuildCacheForNamespaces(t.Context(), []string{"ns"}, fakeClient) - require.NoError(t, err) + namespaces = []string{"ns"} } - 
vh := NewVolumeHelperImplWithCache( + vh, err := NewVolumeHelperImplWithNamespaces( p, tc.snapshotVolumesFlag, logrus.StandardLogger(), fakeClient, tc.defaultVolumesToFSBackup, false, - cache, + namespaces, ) + require.NoError(t, err) actualShouldFSBackup, actualError := vh.ShouldPerformFSBackup(tc.pod.Spec.Volumes[0], *tc.pod) if tc.expectedErr { diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index db6eec3a4..1a1c54247 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -64,7 +64,6 @@ import ( "github.com/vmware-tanzu/velero/pkg/util/boolptr" "github.com/vmware-tanzu/velero/pkg/util/collections" "github.com/vmware-tanzu/velero/pkg/util/kube" - podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) // BackupVersion is the current backup major version for Velero. @@ -409,22 +408,26 @@ func (kb *kubernetesBackupper) BackupWithResolvers( } backupRequest.Status.Progress = &velerov1api.BackupProgress{TotalItems: len(items)} - // Build PVC-to-Pod cache for improved volume policy lookup performance. - // This avoids O(N*M) complexity when there are many PVCs and pods. + // Resolve namespaces for PVC-to-Pod cache building in volumehelper. // See issue #9179 for details. - pvcPodCache := podvolumeutil.NewPVCPodCache() namespaces, err := backupRequest.NamespaceIncludesExcludes.ResolveNamespaceList() if err != nil { - log.WithError(err).Warn("Failed to resolve namespace list for PVC-to-Pod cache, falling back to direct lookups") - pvcPodCache = nil - } else if len(namespaces) > 0 { - if err := pvcPodCache.BuildCacheForNamespaces(context.Background(), namespaces, kb.kbClient); err != nil { - // Log warning but continue - the cache will fall back to direct lookups - log.WithError(err).Warn("Failed to build PVC-to-Pod cache, falling back to direct lookups") - pvcPodCache = nil - } else { - log.Infof("Built PVC-to-Pod cache for %d namespaces", len(namespaces)) - } + log.WithError(err).Error("Failed to resolve namespace list for PVC-to-Pod cache") + return err + } + + volumeHelperImpl, err := volumehelper.NewVolumeHelperImplWithNamespaces( + backupRequest.ResPolicies, + backupRequest.Spec.SnapshotVolumes, + log, + kb.kbClient, + boolptr.IsSetToTrue(backupRequest.Spec.DefaultVolumesToFsBackup), + !backupRequest.ResourceIncludesExcludes.ShouldInclude(kuberesource.PersistentVolumeClaims.String()), + namespaces, + ) + if err != nil { + log.WithError(err).Error("Failed to build PVC-to-Pod cache for volume policy lookups") + return err } itemBackupper := &itemBackupper{ @@ -440,16 +443,8 @@ func (kb *kubernetesBackupper) BackupWithResolvers( itemHookHandler: &hook.DefaultItemHookHandler{ PodCommandExecutor: kb.podCommandExecutor, }, - hookTracker: hook.NewHookTracker(), - volumeHelperImpl: volumehelper.NewVolumeHelperImplWithCache( - backupRequest.ResPolicies, - backupRequest.Spec.SnapshotVolumes, - log, - kb.kbClient, - boolptr.IsSetToTrue(backupRequest.Spec.DefaultVolumesToFsBackup), - !backupRequest.ResourceIncludesExcludes.ShouldInclude(kuberesource.PersistentVolumeClaims.String()), - pvcPodCache, - ), + hookTracker: hook.NewHookTracker(), + volumeHelperImpl: volumeHelperImpl, kubernetesBackupper: kb, } diff --git a/pkg/util/podvolume/pod_volume.go b/pkg/util/podvolume/pod_volume.go index aa03e1b2e..1bf87a5fa 100644 --- a/pkg/util/podvolume/pod_volume.go +++ b/pkg/util/podvolume/pod_volume.go @@ -206,6 +206,8 @@ func IsPVCDefaultToFSBackup(pvcNamespace, pvcName string, crClient crclient.Clie // IsPVCDefaultToFSBackupWithCache is the cached version of 
IsPVCDefaultToFSBackup. // If cache is nil or not built, it falls back to the non-cached version. +// Note: In the main backup path, the cache is always built (via NewVolumeHelperImplWithNamespaces), +// so the fallback is only used by plugins that don't need cache optimization. func IsPVCDefaultToFSBackupWithCache( pvcNamespace, pvcName string, crClient crclient.Client, @@ -287,6 +289,8 @@ func GetPodsUsingPVC( // GetPodsUsingPVCWithCache returns all pods that use the specified PVC. // If cache is available and built, it uses the cache for O(1) lookup. // Otherwise, it falls back to the original GetPodsUsingPVC function. +// Note: In the main backup path, the cache is always built (via NewVolumeHelperImplWithNamespaces), +// so the fallback is only used by plugins that don't need cache optimization. func GetPodsUsingPVCWithCache( pvcNamespace, pvcName string, crClient crclient.Client, From 987edf50372b1aaa0ef1349f70dca644051a32a5 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Mon, 15 Dec 2025 13:55:32 -0800 Subject: [PATCH 13/19] Add global VolumeHelper caching in CSI PVC BIA plugin Address review feedback to have a global VolumeHelper instance per plugin process instead of creating one on each ShouldPerformSnapshot call. Changes: - Add volumeHelper, cachedForBackup, and mu fields to pvcBackupItemAction struct for caching the VolumeHelper per backup - Add getOrCreateVolumeHelper() method for thread-safe lazy initialization - Update Execute() to use cached VolumeHelper via ShouldPerformSnapshotWithVolumeHelper() - Update filterPVCsByVolumePolicy() to accept VolumeHelper parameter - Add ShouldPerformSnapshotWithVolumeHelper() that accepts optional VolumeHelper for reuse across multiple calls - Add NewVolumeHelperForBackup() factory function for BIA plugins - Add comprehensive unit tests for both nil and non-nil VolumeHelper paths This completes the fix for issue #9179 by ensuring the PVC-to-Pod cache is built once per backup and reused across all PVC processing, avoiding O(N*M) complexity. Fixes #9179 Signed-off-by: Shubham Pampattiwar --- pkg/backup/actions/csi/pvc_action.go | 61 ++- pkg/backup/actions/csi/pvc_action_test.go | 238 +++++++++- .../volumehelper/volume_policy_helper.go | 121 +++++ .../volumehelper/volume_policy_helper_test.go | 440 ++++++++++++++++++ 4 files changed, 855 insertions(+), 5 deletions(-) create mode 100644 pkg/plugin/utils/volumehelper/volume_policy_helper_test.go diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go index 62c51accc..c5a83c07e 100644 --- a/pkg/backup/actions/csi/pvc_action.go +++ b/pkg/backup/actions/csi/pvc_action.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strconv" + "sync" "time" "k8s.io/client-go/util/retry" @@ -44,6 +45,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" + internalvolumehelper "github.com/vmware-tanzu/velero/internal/volumehelper" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" velerov2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1" veleroclient "github.com/vmware-tanzu/velero/pkg/client" @@ -72,6 +74,16 @@ const ( type pvcBackupItemAction struct { log logrus.FieldLogger crClient crclient.Client + + // volumeHelper caches the VolumeHelper instance for the current backup. + // This avoids the O(N*M) performance issue when there are many PVCs and pods. + // See issue #9179 for details. + volumeHelper internalvolumehelper.VolumeHelper + // cachedForBackup tracks which backup the volumeHelper was built for. 
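+	// A backup object's UID is stable for its lifetime, which makes it a safe cache key.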
+ // If the backup UID changes, we need to rebuild the cache. + cachedForBackup types.UID + // mu protects volumeHelper and cachedForBackup for concurrent access. + mu sync.Mutex } // AppliesTo returns information indicating that the PVCBackupItemAction @@ -97,6 +109,31 @@ func (p *pvcBackupItemAction) validateBackup(backup velerov1api.Backup) (valid b return true } +// getOrCreateVolumeHelper returns a cached VolumeHelper for the given backup. +// If the backup UID has changed or no VolumeHelper exists, a new one is created. +// This avoids the O(N*M) performance issue when there are many PVCs and pods. +// See issue #9179 for details. +func (p *pvcBackupItemAction) getOrCreateVolumeHelper(backup *velerov1api.Backup) (internalvolumehelper.VolumeHelper, error) { + p.mu.Lock() + defer p.mu.Unlock() + + // Check if we already have a VolumeHelper for this backup + if p.volumeHelper != nil && p.cachedForBackup == backup.UID { + return p.volumeHelper, nil + } + + // Build a new VolumeHelper with cache for this backup + p.log.Infof("Building VolumeHelper with PVC-to-Pod cache for backup %s/%s", backup.Namespace, backup.Name) + vh, err := volumehelper.NewVolumeHelperForBackup(*backup, p.crClient, p.log, nil) + if err != nil { + return nil, errors.Wrap(err, "failed to create VolumeHelper for backup") + } + + p.volumeHelper = vh + p.cachedForBackup = backup.UID + return vh, nil +} + func (p *pvcBackupItemAction) validatePVCandPV( pvc corev1api.PersistentVolumeClaim, item runtime.Unstructured, @@ -248,12 +285,19 @@ func (p *pvcBackupItemAction) Execute( return item, nil, "", nil, nil } - shouldSnapshot, err := volumehelper.ShouldPerformSnapshotWithBackup( + // Get or create the cached VolumeHelper for this backup + vh, err := p.getOrCreateVolumeHelper(backup) + if err != nil { + return nil, nil, "", nil, err + } + + shouldSnapshot, err := volumehelper.ShouldPerformSnapshotWithVolumeHelper( item, kuberesource.PersistentVolumeClaims, *backup, p.crClient, p.log, + vh, ) if err != nil { return nil, nil, "", nil, err @@ -621,8 +665,14 @@ func (p *pvcBackupItemAction) getVolumeSnapshotReference( return nil, errors.Wrapf(err, "failed to list PVCs in VolumeGroupSnapshot group %q in namespace %q", group, pvc.Namespace) } + // Get the cached VolumeHelper for filtering PVCs by volume policy + vh, err := p.getOrCreateVolumeHelper(backup) + if err != nil { + return nil, errors.Wrapf(err, "failed to get VolumeHelper for filtering PVCs in group %q", group) + } + // Filter PVCs by volume policy - filteredPVCs, err := p.filterPVCsByVolumePolicy(groupedPVCs, backup) + filteredPVCs, err := p.filterPVCsByVolumePolicy(groupedPVCs, backup, vh) if err != nil { return nil, errors.Wrapf(err, "failed to filter PVCs by volume policy for VolumeGroupSnapshot group %q", group) } @@ -759,11 +809,12 @@ func (p *pvcBackupItemAction) listGroupedPVCs(ctx context.Context, namespace, la func (p *pvcBackupItemAction) filterPVCsByVolumePolicy( pvcs []corev1api.PersistentVolumeClaim, backup *velerov1api.Backup, + vh internalvolumehelper.VolumeHelper, ) ([]corev1api.PersistentVolumeClaim, error) { var filteredPVCs []corev1api.PersistentVolumeClaim for _, pvc := range pvcs { - // Convert PVC to unstructured for ShouldPerformSnapshotWithBackup + // Convert PVC to unstructured for ShouldPerformSnapshotWithVolumeHelper pvcMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&pvc) if err != nil { return nil, errors.Wrapf(err, "failed to convert PVC %s/%s to unstructured", pvc.Namespace, pvc.Name) @@ -771,12 +822,14 @@ func (p 
*pvcBackupItemAction) filterPVCsByVolumePolicy( unstructuredPVC := &unstructured.Unstructured{Object: pvcMap} // Check if this PVC should be snapshotted according to volume policies - shouldSnapshot, err := volumehelper.ShouldPerformSnapshotWithBackup( + // Uses the cached VolumeHelper for better performance with many PVCs/pods + shouldSnapshot, err := volumehelper.ShouldPerformSnapshotWithVolumeHelper( unstructuredPVC, kuberesource.PersistentVolumeClaims, *backup, p.crClient, p.log, + vh, ) if err != nil { return nil, errors.Wrapf(err, "failed to check volume policy for PVC %s/%s", pvc.Namespace, pvc.Name) diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go index 6280b3ebd..eeb6d16e2 100644 --- a/pkg/backup/actions/csi/pvc_action_test.go +++ b/pkg/backup/actions/csi/pvc_action_test.go @@ -842,7 +842,9 @@ volumePolicies: crClient: client, } - result, err := action.filterPVCsByVolumePolicy(tt.pvcs, backup) + // Pass nil for VolumeHelper in tests - it will fall back to creating a new one per call + // This is the expected behavior for testing and third-party plugins + result, err := action.filterPVCsByVolumePolicy(tt.pvcs, backup, nil) if tt.expectError { require.Error(t, err) } else { @@ -860,6 +862,111 @@ volumePolicies: } } +// TestFilterPVCsByVolumePolicyWithVolumeHelper tests filterPVCsByVolumePolicy when a +// pre-created VolumeHelper is passed (non-nil). This exercises the cached path used +// by the CSI PVC BIA plugin for better performance. +func TestFilterPVCsByVolumePolicyWithVolumeHelper(t *testing.T) { + // Create test PVCs and PVs + pvcs := []corev1api.PersistentVolumeClaim{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pvc-csi", Namespace: "ns-1"}, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "pv-csi", + StorageClassName: pointer.String("sc-csi"), + }, + Status: corev1api.PersistentVolumeClaimStatus{Phase: corev1api.ClaimBound}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pvc-nfs", Namespace: "ns-1"}, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "pv-nfs", + StorageClassName: pointer.String("sc-nfs"), + }, + Status: corev1api.PersistentVolumeClaimStatus{Phase: corev1api.ClaimBound}, + }, + } + + pvs := []corev1api.PersistentVolume{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pv-csi"}, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + CSI: &corev1api.CSIPersistentVolumeSource{Driver: "csi-driver"}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pv-nfs"}, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + NFS: &corev1api.NFSVolumeSource{ + Server: "nfs-server", + Path: "/export", + }, + }, + }, + }, + } + + // Create fake client with PVs + objs := []runtime.Object{} + for i := range pvs { + objs = append(objs, &pvs[i]) + } + client := velerotest.NewFakeControllerRuntimeClient(t, objs...) 
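+	// Only the PVs are seeded here; the policy ConfigMap and backup are created below.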
+ + // Create backup with volume policy that skips NFS volumes + volumePolicyStr := ` +version: v1 +volumePolicies: +- conditions: + nfs: {} + action: + type: skip +` + cm := &corev1api.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "volume-policy", + Namespace: "velero", + }, + Data: map[string]string{ + "volume-policy": volumePolicyStr, + }, + } + require.NoError(t, client.Create(t.Context(), cm)) + + backup := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + ResourcePolicy: &corev1api.TypedLocalObjectReference{ + Kind: "ConfigMap", + Name: "volume-policy", + }, + }, + } + + action := &pvcBackupItemAction{ + log: velerotest.NewLogger(), + crClient: client, + } + + // Create a VolumeHelper using the same method the plugin would use + vh, err := action.getOrCreateVolumeHelper(backup) + require.NoError(t, err) + require.NotNil(t, vh) + + // Test with the pre-created VolumeHelper (non-nil path) + result, err := action.filterPVCsByVolumePolicy(pvcs, backup, vh) + require.NoError(t, err) + + // Should filter out the NFS PVC, leaving only the CSI PVC + require.Len(t, result, 1) + require.Equal(t, "pvc-csi", result[0].Name) +} + func TestDetermineCSIDriver(t *testing.T) { tests := []struct { name string @@ -1959,3 +2066,132 @@ func TestPVCRequestSize(t *testing.T) { }) } } + +// TestGetOrCreateVolumeHelper tests the VolumeHelper caching behavior +func TestGetOrCreateVolumeHelper(t *testing.T) { + tests := []struct { + name string + setup func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) + wantSameCache bool + }{ + { + name: "Returns same VolumeHelper for same backup UID", + setup: func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) { + client := velerotest.NewFakeControllerRuntimeClient(t) + action := &pvcBackupItemAction{ + log: velerotest.NewLogger(), + crClient: client, + } + backup := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + UID: types.UID("test-uid-1"), + }, + } + return action, backup, backup // Same backup instance + }, + wantSameCache: true, + }, + { + name: "Returns new VolumeHelper for different backup UID", + setup: func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) { + client := velerotest.NewFakeControllerRuntimeClient(t) + action := &pvcBackupItemAction{ + log: velerotest.NewLogger(), + crClient: client, + } + backup1 := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup-1", + Namespace: "velero", + UID: types.UID("test-uid-1"), + }, + } + backup2 := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup-2", + Namespace: "velero", + UID: types.UID("test-uid-2"), + }, + } + return action, backup1, backup2 // Different backup instances + }, + wantSameCache: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + action, backup1, backup2 := tt.setup() + + // Get VolumeHelper for first backup + vh1, err := action.getOrCreateVolumeHelper(backup1) + require.NoError(t, err) + require.NotNil(t, vh1) + + // Get VolumeHelper for second backup + vh2, err := action.getOrCreateVolumeHelper(backup2) + require.NoError(t, err) + require.NotNil(t, vh2) + + if tt.wantSameCache { + // Same backup UID should return same VolumeHelper pointer + require.Same(t, vh1, vh2, "Expected same VolumeHelper instance for same backup UID") + } else { + // Different backup UID should return different VolumeHelper pointer + 
require.NotSame(t, vh1, vh2, "Expected different VolumeHelper instance for different backup UID") + } + }) + } +} + +// TestGetOrCreateVolumeHelperConcurrency tests thread-safety of VolumeHelper caching +func TestGetOrCreateVolumeHelperConcurrency(t *testing.T) { + client := velerotest.NewFakeControllerRuntimeClient(t) + action := &pvcBackupItemAction{ + log: velerotest.NewLogger(), + crClient: client, + } + backup := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + UID: types.UID("test-uid"), + }, + } + + // Run multiple goroutines concurrently to get VolumeHelper + const numGoroutines = 10 + results := make(chan interface{}, numGoroutines) + errors := make(chan error, numGoroutines) + + for i := 0; i < numGoroutines; i++ { + go func() { + vh, err := action.getOrCreateVolumeHelper(backup) + if err != nil { + errors <- err + return + } + results <- vh + }() + } + + // Collect all results + var volumeHelpers []interface{} + for i := 0; i < numGoroutines; i++ { + select { + case vh := <-results: + volumeHelpers = append(volumeHelpers, vh) + case err := <-errors: + t.Fatalf("Unexpected error: %v", err) + } + } + + // All goroutines should get the same VolumeHelper instance + require.Len(t, volumeHelpers, numGoroutines) + firstVH := volumeHelpers[0] + for i := 1; i < len(volumeHelpers); i++ { + require.Same(t, firstVH, volumeHelpers[i], "All goroutines should get the same VolumeHelper instance") + } +} diff --git a/pkg/plugin/utils/volumehelper/volume_policy_helper.go b/pkg/plugin/utils/volumehelper/volume_policy_helper.go index 0a7960faa..715db15d9 100644 --- a/pkg/plugin/utils/volumehelper/volume_policy_helper.go +++ b/pkg/plugin/utils/volumehelper/volume_policy_helper.go @@ -17,7 +17,10 @@ limitations under the License. package volumehelper import ( + "context" + "github.com/sirupsen/logrus" + corev1api "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" crclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -40,6 +43,35 @@ func ShouldPerformSnapshotWithBackup( crClient crclient.Client, logger logrus.FieldLogger, ) (bool, error) { + return ShouldPerformSnapshotWithVolumeHelper( + unstructured, + groupResource, + backup, + crClient, + logger, + nil, // no cached VolumeHelper, will create one + ) +} + +// ShouldPerformSnapshotWithVolumeHelper is like ShouldPerformSnapshotWithBackup +// but accepts an optional VolumeHelper. If vh is non-nil, it will be used directly, +// avoiding the overhead of creating a new VolumeHelper on each call. +// This is useful for BIA plugins that process multiple PVCs during a single backup +// and want to reuse the same VolumeHelper (with its internal cache) across calls. +func ShouldPerformSnapshotWithVolumeHelper( + unstructured runtime.Unstructured, + groupResource schema.GroupResource, + backup velerov1api.Backup, + crClient crclient.Client, + logger logrus.FieldLogger, + vh volumehelper.VolumeHelper, +) (bool, error) { + // If a VolumeHelper is provided, use it directly + if vh != nil { + return vh.ShouldPerformSnapshot(unstructured, groupResource) + } + + // Otherwise, create a new VolumeHelper (original behavior for third-party plugins) resourcePolicies, err := resourcepolicies.GetResourcePoliciesFromBackup( backup, crClient, @@ -60,3 +92,92 @@ func ShouldPerformSnapshotWithBackup( return volumeHelperImpl.ShouldPerformSnapshot(unstructured, groupResource) } + +// NewVolumeHelperForBackup creates a VolumeHelper for the given backup with a PVC-to-Pod cache. 
+// The cache is built for the provided namespaces list to avoid O(N*M) complexity when there +// are many PVCs and pods. See issue #9179 for details. +// +// This function is intended for BIA plugins to create a VolumeHelper once and reuse it +// across multiple Execute() calls for the same backup. +// +// If namespaces is nil or empty, the function will resolve the namespace list from the backup spec. +// If backup.Spec.IncludedNamespaces is empty (meaning all namespaces), it will list all namespaces +// from the cluster. +func NewVolumeHelperForBackup( + backup velerov1api.Backup, + crClient crclient.Client, + logger logrus.FieldLogger, + namespaces []string, +) (volumehelper.VolumeHelper, error) { + // If no namespaces provided, resolve from backup spec + if len(namespaces) == 0 { + var err error + namespaces, err = resolveNamespacesForBackup(backup, crClient) + if err != nil { + logger.WithError(err).Warn("Failed to resolve namespaces for cache, proceeding without cache") + namespaces = nil + } + } + + resourcePolicies, err := resourcepolicies.GetResourcePoliciesFromBackup( + backup, + crClient, + logger, + ) + if err != nil { + return nil, err + } + + return volumehelper.NewVolumeHelperImplWithNamespaces( + resourcePolicies, + backup.Spec.SnapshotVolumes, + logger, + crClient, + boolptr.IsSetToTrue(backup.Spec.DefaultVolumesToFsBackup), + true, + namespaces, + ) +} + +// resolveNamespacesForBackup determines which namespaces will be backed up. +// If IncludedNamespaces is specified, it returns those (excluding any in ExcludedNamespaces). +// If IncludedNamespaces is empty (meaning all namespaces), it lists all namespaces from the cluster. +func resolveNamespacesForBackup(backup velerov1api.Backup, crClient crclient.Client) ([]string, error) { + // If specific namespaces are included, use those + if len(backup.Spec.IncludedNamespaces) > 0 { + // Filter out excluded namespaces + excludeSet := make(map[string]bool) + for _, ns := range backup.Spec.ExcludedNamespaces { + excludeSet[ns] = true + } + + var namespaces []string + for _, ns := range backup.Spec.IncludedNamespaces { + if !excludeSet[ns] { + namespaces = append(namespaces, ns) + } + } + return namespaces, nil + } + + // IncludedNamespaces is empty, meaning all namespaces - list from cluster + nsList := &corev1api.NamespaceList{} + if err := crClient.List(context.Background(), nsList); err != nil { + return nil, err + } + + // Filter out excluded namespaces + excludeSet := make(map[string]bool) + for _, ns := range backup.Spec.ExcludedNamespaces { + excludeSet[ns] = true + } + + var namespaces []string + for _, ns := range nsList.Items { + if !excludeSet[ns.Name] { + namespaces = append(namespaces, ns.Name) + } + } + + return namespaces, nil +} diff --git a/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go b/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go new file mode 100644 index 000000000..b4d835a76 --- /dev/null +++ b/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go @@ -0,0 +1,440 @@ +/* +Copyright the Velero contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package volumehelper + +import ( + "testing" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + corev1api "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + "github.com/vmware-tanzu/velero/pkg/kuberesource" + velerotest "github.com/vmware-tanzu/velero/pkg/test" +) + +func TestShouldPerformSnapshotWithBackup(t *testing.T) { + tests := []struct { + name string + pvc *corev1api.PersistentVolumeClaim + pv *corev1api.PersistentVolume + backup *velerov1api.Backup + wantSnapshot bool + wantError bool + }{ + { + name: "Returns true when snapshotVolumes not set", + pvc: &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvc", + Namespace: "default", + }, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "test-pv", + }, + Status: corev1api.PersistentVolumeClaimStatus{ + Phase: corev1api.ClaimBound, + }, + }, + pv: &corev1api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pv", + }, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + CSI: &corev1api.CSIPersistentVolumeSource{ + Driver: "test-driver", + }, + }, + ClaimRef: &corev1api.ObjectReference{ + Namespace: "default", + Name: "test-pvc", + }, + }, + }, + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + }, + wantSnapshot: true, + wantError: false, + }, + { + name: "Returns false when snapshotVolumes is false", + pvc: &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvc", + Namespace: "default", + }, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "test-pv", + }, + Status: corev1api.PersistentVolumeClaimStatus{ + Phase: corev1api.ClaimBound, + }, + }, + pv: &corev1api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pv", + }, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + CSI: &corev1api.CSIPersistentVolumeSource{ + Driver: "test-driver", + }, + }, + ClaimRef: &corev1api.ObjectReference{ + Namespace: "default", + Name: "test-pvc", + }, + }, + }, + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + SnapshotVolumes: boolPtr(false), + }, + }, + wantSnapshot: false, + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake client with PV and PVC + client := velerotest.NewFakeControllerRuntimeClient(t, tt.pv, tt.pvc) + + // Convert PVC to unstructured + pvcMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(tt.pvc) + require.NoError(t, err) + unstructuredPVC := &unstructured.Unstructured{Object: pvcMap} + + logger := logrus.New() + + // Call the function under test - this is the wrapper for third-party plugins + result, err := ShouldPerformSnapshotWithBackup( + unstructuredPVC, + kuberesource.PersistentVolumeClaims, + *tt.backup, + client, + logger, + ) + + if tt.wantError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tt.wantSnapshot, result) + } + }) + } +} + +func boolPtr(b bool) *bool { + return &b +} + +func TestShouldPerformSnapshotWithVolumeHelper(t *testing.T) { + 
tests := []struct { + name string + pvc *corev1api.PersistentVolumeClaim + pv *corev1api.PersistentVolume + backup *velerov1api.Backup + wantSnapshot bool + wantError bool + }{ + { + name: "Returns true with nil VolumeHelper when snapshotVolumes not set", + pvc: &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvc", + Namespace: "default", + }, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "test-pv", + }, + Status: corev1api.PersistentVolumeClaimStatus{ + Phase: corev1api.ClaimBound, + }, + }, + pv: &corev1api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pv", + }, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + CSI: &corev1api.CSIPersistentVolumeSource{ + Driver: "test-driver", + }, + }, + ClaimRef: &corev1api.ObjectReference{ + Namespace: "default", + Name: "test-pvc", + }, + }, + }, + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + }, + wantSnapshot: true, + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake client with PV + client := velerotest.NewFakeControllerRuntimeClient(t, tt.pv, tt.pvc) + + // Convert PVC to unstructured + pvcMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(tt.pvc) + require.NoError(t, err) + unstructuredPVC := &unstructured.Unstructured{Object: pvcMap} + + logger := logrus.New() + + // Call the function under test with nil VolumeHelper + // This exercises the fallback path that creates a new VolumeHelper per call + result, err := ShouldPerformSnapshotWithVolumeHelper( + unstructuredPVC, + kuberesource.PersistentVolumeClaims, + *tt.backup, + client, + logger, + nil, // Pass nil for VolumeHelper - exercises fallback path + ) + + if tt.wantError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tt.wantSnapshot, result) + } + }) + } +} + +// TestShouldPerformSnapshotWithNonNilVolumeHelper tests the ShouldPerformSnapshotWithVolumeHelper +// function when a pre-created VolumeHelper is passed. This exercises the cached path used +// by BIA plugins for better performance. 
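+// The VolumeHelper is built once via NewVolumeHelperForBackup and reused for the call under test.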
+func TestShouldPerformSnapshotWithNonNilVolumeHelper(t *testing.T) { + pvc := &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvc", + Namespace: "default", + }, + Spec: corev1api.PersistentVolumeClaimSpec{ + VolumeName: "test-pv", + }, + Status: corev1api.PersistentVolumeClaimStatus{ + Phase: corev1api.ClaimBound, + }, + } + + pv := &corev1api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pv", + }, + Spec: corev1api.PersistentVolumeSpec{ + PersistentVolumeSource: corev1api.PersistentVolumeSource{ + CSI: &corev1api.CSIPersistentVolumeSource{ + Driver: "test-driver", + }, + }, + ClaimRef: &corev1api.ObjectReference{ + Namespace: "default", + Name: "test-pvc", + }, + }, + } + + backup := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + IncludedNamespaces: []string{"default"}, + }, + } + + // Create fake client with PV and PVC + client := velerotest.NewFakeControllerRuntimeClient(t, pv, pvc) + + logger := logrus.New() + + // Create VolumeHelper using the factory function + vh, err := NewVolumeHelperForBackup(*backup, client, logger, []string{"default"}) + require.NoError(t, err) + require.NotNil(t, vh) + + // Convert PVC to unstructured + pvcMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(pvc) + require.NoError(t, err) + unstructuredPVC := &unstructured.Unstructured{Object: pvcMap} + + // Call with non-nil VolumeHelper - exercises the cached path + result, err := ShouldPerformSnapshotWithVolumeHelper( + unstructuredPVC, + kuberesource.PersistentVolumeClaims, + *backup, + client, + logger, + vh, // Pass non-nil VolumeHelper - exercises cached path + ) + + require.NoError(t, err) + require.True(t, result, "Should return true for snapshot when snapshotVolumes not set") +} + +func TestNewVolumeHelperForBackup(t *testing.T) { + tests := []struct { + name string + backup *velerov1api.Backup + namespaces []string + wantError bool + }{ + { + name: "Creates VolumeHelper with explicit namespaces", + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + IncludedNamespaces: []string{"ns1", "ns2"}, + }, + }, + namespaces: []string{"ns1", "ns2"}, + wantError: false, + }, + { + name: "Creates VolumeHelper with namespaces from backup spec", + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + IncludedNamespaces: []string{"ns1", "ns2"}, + }, + }, + namespaces: nil, // Will be resolved from backup spec + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client := velerotest.NewFakeControllerRuntimeClient(t) + logger := logrus.New() + + vh, err := NewVolumeHelperForBackup(*tt.backup, client, logger, tt.namespaces) + + if tt.wantError { + require.Error(t, err) + require.Nil(t, vh) + } else { + require.NoError(t, err) + require.NotNil(t, vh) + } + }) + } +} + +func TestResolveNamespacesForBackup(t *testing.T) { + tests := []struct { + name string + backup *velerov1api.Backup + existingNS []string + expectedResult []string + }{ + { + name: "Returns included namespaces", + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + IncludedNamespaces: []string{"ns1", "ns2", "ns3"}, + }, + }, + existingNS: []string{"ns1", "ns2", "ns3", "ns4"}, 
+ expectedResult: []string{"ns1", "ns2", "ns3"}, + }, + { + name: "Excludes specified namespaces from included list", + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + IncludedNamespaces: []string{"ns1", "ns2", "ns3"}, + ExcludedNamespaces: []string{"ns2"}, + }, + }, + existingNS: []string{"ns1", "ns2", "ns3", "ns4"}, + expectedResult: []string{"ns1", "ns3"}, + }, + { + name: "Returns all namespaces when IncludedNamespaces is empty", + backup: &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{}, + }, + existingNS: []string{"default", "kube-system", "app-ns"}, + expectedResult: []string{"default", "kube-system", "app-ns"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake client with namespaces + var objects []runtime.Object + for _, ns := range tt.existingNS { + objects = append(objects, &corev1api.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + }, + }) + } + client := velerotest.NewFakeControllerRuntimeClient(t, objects...) + + result, err := resolveNamespacesForBackup(*tt.backup, client) + require.NoError(t, err) + require.ElementsMatch(t, tt.expectedResult, result) + }) + } +} From f4c4653c08f68e564839061603edee5636ad0ba7 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Mon, 15 Dec 2025 14:07:52 -0800 Subject: [PATCH 14/19] Fix linter errors: use 'any' instead of 'interface{}' Signed-off-by: Shubham Pampattiwar --- pkg/backup/actions/csi/pvc_action_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go index eeb6d16e2..8fad3821a 100644 --- a/pkg/backup/actions/csi/pvc_action_test.go +++ b/pkg/backup/actions/csi/pvc_action_test.go @@ -2163,7 +2163,7 @@ func TestGetOrCreateVolumeHelperConcurrency(t *testing.T) { // Run multiple goroutines concurrently to get VolumeHelper const numGoroutines = 10 - results := make(chan interface{}, numGoroutines) + results := make(chan any, numGoroutines) errors := make(chan error, numGoroutines) for i := 0; i < numGoroutines; i++ { @@ -2178,7 +2178,7 @@ func TestGetOrCreateVolumeHelperConcurrency(t *testing.T) { } // Collect all results - var volumeHelpers []interface{} + var volumeHelpers []any for i := 0; i < numGoroutines; i++ { select { case vh := <-results: From b7052c2cb1a81a0b1a7bf8faca9335c150cbc41a Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 16 Dec 2025 12:28:47 -0800 Subject: [PATCH 15/19] Implement lazy per-namespace PVC-to-Pod caching for plugin path This commit addresses reviewer feedback on PR #9441 regarding concurrent backup caching concerns. Key changes: 1. Added lazy per-namespace caching for the CSI PVC BIA plugin path: - Added IsNamespaceBuilt() method to check if namespace is cached - Added BuildCacheForNamespace() for lazy, per-namespace cache building - Plugin builds cache incrementally as namespaces are encountered 2. Added NewVolumeHelperImplWithCache constructor for plugins: - Accepts externally-managed PVC-to-Pod cache - Follows pattern from PR #9226 (Scott Seago's design) 3. Plugin instance lifecycle clarification: - Plugin instances are unique per backup (created via newPluginManager) - Cleaned up via CleanupClients at backup completion - No mutex or backup UID tracking needed 4. 
Test coverage: - Added tests for IsNamespaceBuilt and BuildCacheForNamespace - Added tests for NewVolumeHelperImplWithCache constructor - Added test verifying cache usage for fs-backup determination This maintains the O(N+M) complexity improvement from issue #9179 while addressing architectural concerns about concurrent access. Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper.go | 27 +++ .../volumehelper/volume_policy_helper_test.go | 181 ++++++++++++++ pkg/backup/actions/csi/pvc_action.go | 90 ++++--- pkg/backup/actions/csi/pvc_action_test.go | 132 ++--------- pkg/util/podvolume/pod_volume.go | 60 +++++ pkg/util/podvolume/pod_volume_test.go | 221 ++++++++++++++++++ 6 files changed, 573 insertions(+), 138 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper.go b/internal/volumehelper/volume_policy_helper.go index 22a88a146..63115cba2 100644 --- a/internal/volumehelper/volume_policy_helper.go +++ b/internal/volumehelper/volume_policy_helper.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" + "github.com/pkg/errors" "github.com/sirupsen/logrus" corev1api "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" @@ -12,6 +13,7 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" "github.com/vmware-tanzu/velero/internal/resourcepolicies" + velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/kuberesource" "github.com/vmware-tanzu/velero/pkg/util/boolptr" kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube" @@ -95,6 +97,31 @@ func NewVolumeHelperImplWithNamespaces( }, nil } +// NewVolumeHelperImplWithCache creates a VolumeHelper using an externally managed PVC-to-Pod cache. +// This is used by plugins that build the cache lazily per-namespace (following the pattern from PR #9226). +// The cache can be nil, in which case PVC-to-Pod lookups will fall back to direct API calls. 
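+// Unlike NewVolumeHelperImplWithNamespaces, this constructor never builds the cache itself;
+// the caller owns the cache lifecycle and may populate it lazily, one namespace at a time.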
+func NewVolumeHelperImplWithCache( + backup velerov1api.Backup, + client crclient.Client, + logger logrus.FieldLogger, + pvcPodCache *podvolumeutil.PVCPodCache, +) (VolumeHelper, error) { + resourcePolicies, err := resourcepolicies.GetResourcePoliciesFromBackup(backup, client, logger) + if err != nil { + return nil, errors.Wrap(err, "failed to get volume policies from backup") + } + + return &volumeHelperImpl{ + volumePolicy: resourcePolicies, + snapshotVolumes: backup.Spec.SnapshotVolumes, + logger: logger, + client: client, + defaultVolumesToFSBackup: boolptr.IsSetToTrue(backup.Spec.DefaultVolumesToFsBackup), + backupExcludePVC: boolptr.IsSetToTrue(backup.Spec.SnapshotMoveData), + pvcPodCache: pvcPodCache, + }, nil +} + func (v *volumeHelperImpl) ShouldPerformSnapshot(obj runtime.Unstructured, groupResource schema.GroupResource) (bool, error) { // check if volume policy exists and also check if the object(pv/pvc) fits a volume policy criteria and see if the associated action is snapshot // if it is not snapshot then skip the code path for snapshotting the PV/PVC diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 57c99e862..c0f93f94f 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -34,6 +34,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/builder" "github.com/vmware-tanzu/velero/pkg/kuberesource" velerotest "github.com/vmware-tanzu/velero/pkg/test" + podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) func TestVolumeHelperImpl_ShouldPerformSnapshot(t *testing.T) { @@ -1053,3 +1054,183 @@ func TestVolumeHelperImplWithCache_ShouldPerformFSBackup(t *testing.T) { }) } } + +// TestNewVolumeHelperImplWithCache tests the NewVolumeHelperImplWithCache constructor +// which is used by plugins that build the cache lazily per-namespace. 
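// A minimal usage sketch for the constructor above, assuming a plugin that
// already holds ctx, a controller-runtime client, a logger, and the Backup
// being processed (the wrapper name is hypothetical and imports are elided):
func volumeHelperForNamespace(
	ctx context.Context,
	backup velerov1api.Backup,
	crClient crclient.Client,
	logger logrus.FieldLogger,
	cache *podvolumeutil.PVCPodCache,
	namespace string,
) (VolumeHelper, error) {
	// Lazy and idempotent: only the first call for a namespace lists its pods.
	if !cache.IsNamespaceBuilt(namespace) {
		if err := cache.BuildCacheForNamespace(ctx, namespace, crClient); err != nil {
			return nil, err
		}
	}
	// The helper then answers ShouldPerformSnapshot and ShouldPerformFSBackup
	// from the cached PVC-to-Pod mapping instead of listing pods per PVC.
	return NewVolumeHelperImplWithCache(backup, crClient, logger, cache)
}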
+func TestNewVolumeHelperImplWithCache(t *testing.T) { + testCases := []struct { + name string + backup velerov1api.Backup + resourcePolicyConfigMap *corev1api.ConfigMap + pvcPodCache bool // whether to pass a cache + expectError bool + }{ + { + name: "creates VolumeHelper with nil cache", + backup: velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + SnapshotVolumes: ptr.To(true), + DefaultVolumesToFsBackup: ptr.To(false), + }, + }, + pvcPodCache: false, + expectError: false, + }, + { + name: "creates VolumeHelper with non-nil cache", + backup: velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + SnapshotVolumes: ptr.To(true), + DefaultVolumesToFsBackup: ptr.To(true), + SnapshotMoveData: ptr.To(true), + }, + }, + pvcPodCache: true, + expectError: false, + }, + { + name: "creates VolumeHelper with resource policies", + backup: velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + SnapshotVolumes: ptr.To(true), + ResourcePolicy: &corev1api.TypedLocalObjectReference{ + Kind: "ConfigMap", + Name: "resource-policy", + }, + }, + }, + resourcePolicyConfigMap: &corev1api.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "resource-policy", + Namespace: "velero", + }, + Data: map[string]string{ + "policy": `version: v1 +volumePolicies: +- conditions: + storageClass: + - gp2-csi + action: + type: snapshot`, + }, + }, + pvcPodCache: true, + expectError: false, + }, + { + name: "fails when resource policy ConfigMap not found", + backup: velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + ResourcePolicy: &corev1api.TypedLocalObjectReference{ + Kind: "ConfigMap", + Name: "non-existent-policy", + }, + }, + }, + pvcPodCache: false, + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var objs []runtime.Object + if tc.resourcePolicyConfigMap != nil { + objs = append(objs, tc.resourcePolicyConfigMap) + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) + + var cache *podvolumeutil.PVCPodCache + if tc.pvcPodCache { + cache = podvolumeutil.NewPVCPodCache() + } + + vh, err := NewVolumeHelperImplWithCache( + tc.backup, + fakeClient, + logrus.StandardLogger(), + cache, + ) + + if tc.expectError { + require.Error(t, err) + require.Nil(t, vh) + } else { + require.NoError(t, err) + require.NotNil(t, vh) + } + }) + } +} + +// TestNewVolumeHelperImplWithCache_UsesCache verifies that the VolumeHelper created +// via NewVolumeHelperImplWithCache actually uses the provided cache for lookups. 
+func TestNewVolumeHelperImplWithCache_UsesCache(t *testing.T) { + // Create a pod that uses a PVC via opt-out (defaultVolumesToFsBackup=true) + pod := builder.ForPod("ns", "pod-1").Volumes( + &corev1api.Volume{ + Name: "volume", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc-1", + }, + }, + }, + ).Result() + + pvc := &corev1api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "pvc-1", + }, + } + + pv := builder.ForPersistentVolume("example-pv").StorageClass("gp2-csi").ClaimRef("ns", "pvc-1").Result() + + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, pvc, pv, pod) + + // Build cache for the namespace + cache := podvolumeutil.NewPVCPodCache() + err := cache.BuildCacheForNamespace(t.Context(), "ns", fakeClient) + require.NoError(t, err) + + backup := velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: "velero", + }, + Spec: velerov1api.BackupSpec{ + SnapshotVolumes: ptr.To(true), + DefaultVolumesToFsBackup: ptr.To(true), // opt-out mode + }, + } + + vh, err := NewVolumeHelperImplWithCache(backup, fakeClient, logrus.StandardLogger(), cache) + require.NoError(t, err) + + // Convert PV to unstructured + obj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(pv) + require.NoError(t, err) + + // ShouldPerformSnapshot should return false because the volume is selected for fs-backup + // This relies on the cache to find the pod using the PVC + shouldSnapshot, err := vh.ShouldPerformSnapshot(&unstructured.Unstructured{Object: obj}, kuberesource.PersistentVolumes) + require.NoError(t, err) + require.False(t, shouldSnapshot, "Expected snapshot to be skipped due to fs-backup selection via cache") +} diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go index c5a83c07e..8e2e77316 100644 --- a/pkg/backup/actions/csi/pvc_action.go +++ b/pkg/backup/actions/csi/pvc_action.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "strconv" - "sync" "time" "k8s.io/client-go/util/retry" @@ -59,6 +58,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/util/boolptr" "github.com/vmware-tanzu/velero/pkg/util/csi" kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube" + podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume" ) // TODO: Replace hardcoded VolumeSnapshot finalizer strings with constants from @@ -75,15 +75,13 @@ type pvcBackupItemAction struct { log logrus.FieldLogger crClient crclient.Client - // volumeHelper caches the VolumeHelper instance for the current backup. + // pvcPodCache provides lazy per-namespace caching of PVC-to-Pod mappings. + // Since plugin instances are unique per backup (created via newPluginManager and + // cleaned up via CleanupClients at backup completion), we can safely cache this + // without mutex or backup UID tracking. // This avoids the O(N*M) performance issue when there are many PVCs and pods. - // See issue #9179 for details. - volumeHelper internalvolumehelper.VolumeHelper - // cachedForBackup tracks which backup the volumeHelper was built for. - // If the backup UID changes, we need to rebuild the cache. - cachedForBackup types.UID - // mu protects volumeHelper and cachedForBackup for concurrent access. - mu sync.Mutex + // See issue #9179 and PR #9226 for details. 
+ pvcPodCache *podvolumeutil.PVCPodCache } // AppliesTo returns information indicating that the PVCBackupItemAction @@ -109,29 +107,57 @@ func (p *pvcBackupItemAction) validateBackup(backup velerov1api.Backup) (valid b return true } -// getOrCreateVolumeHelper returns a cached VolumeHelper for the given backup. -// If the backup UID has changed or no VolumeHelper exists, a new one is created. -// This avoids the O(N*M) performance issue when there are many PVCs and pods. -// See issue #9179 for details. -func (p *pvcBackupItemAction) getOrCreateVolumeHelper(backup *velerov1api.Backup) (internalvolumehelper.VolumeHelper, error) { - p.mu.Lock() - defer p.mu.Unlock() - - // Check if we already have a VolumeHelper for this backup - if p.volumeHelper != nil && p.cachedForBackup == backup.UID { - return p.volumeHelper, nil +// ensurePVCPodCacheForNamespace ensures the PVC-to-Pod cache is built for the given namespace. +// This uses lazy per-namespace caching following the pattern from PR #9226. +// Since plugin instances are unique per backup, we can safely cache without mutex or backup UID tracking. +func (p *pvcBackupItemAction) ensurePVCPodCacheForNamespace(ctx context.Context, namespace string) error { + // Initialize cache if needed + if p.pvcPodCache == nil { + p.pvcPodCache = podvolumeutil.NewPVCPodCache() } - // Build a new VolumeHelper with cache for this backup - p.log.Infof("Building VolumeHelper with PVC-to-Pod cache for backup %s/%s", backup.Namespace, backup.Name) - vh, err := volumehelper.NewVolumeHelperForBackup(*backup, p.crClient, p.log, nil) + // Build cache for namespace if not already done + if !p.pvcPodCache.IsNamespaceBuilt(namespace) { + p.log.Debugf("Building PVC-to-Pod cache for namespace %s", namespace) + if err := p.pvcPodCache.BuildCacheForNamespace(ctx, namespace, p.crClient); err != nil { + return errors.Wrapf(err, "failed to build PVC-to-Pod cache for namespace %s", namespace) + } + } + return nil +} + +// getVolumeHelperWithCache creates a VolumeHelper using the pre-built PVC-to-Pod cache. +// The cache should be ensured for the relevant namespace(s) before calling this. +func (p *pvcBackupItemAction) getVolumeHelperWithCache(backup *velerov1api.Backup) (internalvolumehelper.VolumeHelper, error) { + // Create VolumeHelper with our lazy-built cache + vh, err := internalvolumehelper.NewVolumeHelperImplWithCache( + *backup, + p.crClient, + p.log, + p.pvcPodCache, + ) if err != nil { - return nil, errors.Wrap(err, "failed to create VolumeHelper for backup") + return nil, errors.Wrap(err, "failed to create VolumeHelper") + } + return vh, nil +} + +// getOrCreateVolumeHelper returns a VolumeHelper with lazy per-namespace caching. +// The VolumeHelper uses the pvcPodCache which is populated lazily as namespaces are encountered. +// Callers should use ensurePVCPodCacheForNamespace before calling methods that need +// PVC-to-Pod lookups for a specific namespace. +// Since plugin instances are unique per backup (created via newPluginManager and +// cleaned up via CleanupClients at backup completion), we can safely cache this. +// See issue #9179 and PR #9226 for details. 
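// Taken together, the two helpers above and getOrCreateVolumeHelper below are
// meant to run in a fixed order per item, as the Execute and
// getVolumeSnapshotReference hunks further down show. A hypothetical wrapper
// (name illustrative) capturing that order:
func (p *pvcBackupItemAction) volumeHelperFor(
	ctx context.Context,
	backup *velerov1api.Backup,
	namespace string,
) (internalvolumehelper.VolumeHelper, error) {
	// Step 1: lazily index the namespace's pods by PVC name; repeat calls for
	// the same namespace are no-ops.
	if err := p.ensurePVCPodCacheForNamespace(ctx, namespace); err != nil {
		return nil, err
	}
	// Step 2: wrap the shared cache in a VolumeHelper. No locking is needed
	// because each backup gets its own plugin instance.
	return p.getOrCreateVolumeHelper(backup)
}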
+func (p *pvcBackupItemAction) getOrCreateVolumeHelper(backup *velerov1api.Backup) (internalvolumehelper.VolumeHelper, error) { + // Initialize the PVC-to-Pod cache if needed + if p.pvcPodCache == nil { + p.pvcPodCache = podvolumeutil.NewPVCPodCache() } - p.volumeHelper = vh - p.cachedForBackup = backup.UID - return vh, nil + // Return the VolumeHelper with our lazily-built cache + // The cache will be populated incrementally as namespaces are encountered + return p.getVolumeHelperWithCache(backup) } func (p *pvcBackupItemAction) validatePVCandPV( @@ -285,6 +311,11 @@ func (p *pvcBackupItemAction) Execute( return item, nil, "", nil, nil } + // Ensure PVC-to-Pod cache is built for this namespace (lazy per-namespace caching) + if err := p.ensurePVCPodCacheForNamespace(context.TODO(), pvc.Namespace); err != nil { + return nil, nil, "", nil, err + } + // Get or create the cached VolumeHelper for this backup vh, err := p.getOrCreateVolumeHelper(backup) if err != nil { @@ -665,6 +696,11 @@ func (p *pvcBackupItemAction) getVolumeSnapshotReference( return nil, errors.Wrapf(err, "failed to list PVCs in VolumeGroupSnapshot group %q in namespace %q", group, pvc.Namespace) } + // Ensure PVC-to-Pod cache is built for this namespace (lazy per-namespace caching) + if err := p.ensurePVCPodCacheForNamespace(ctx, pvc.Namespace); err != nil { + return nil, errors.Wrapf(err, "failed to build PVC-to-Pod cache for namespace %s", pvc.Namespace) + } + // Get the cached VolumeHelper for filtering PVCs by volume policy vh, err := p.getOrCreateVolumeHelper(backup) if err != nil { diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go index 8fad3821a..b94d63701 100644 --- a/pkg/backup/actions/csi/pvc_action_test.go +++ b/pkg/backup/actions/csi/pvc_action_test.go @@ -2067,87 +2067,11 @@ func TestPVCRequestSize(t *testing.T) { } } -// TestGetOrCreateVolumeHelper tests the VolumeHelper caching behavior +// TestGetOrCreateVolumeHelper tests the VolumeHelper and PVC-to-Pod cache behavior. +// Since plugin instances are unique per backup (created via newPluginManager and +// cleaned up via CleanupClients at backup completion), we verify that the pvcPodCache +// is properly initialized and reused across calls. 
func TestGetOrCreateVolumeHelper(t *testing.T) { - tests := []struct { - name string - setup func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) - wantSameCache bool - }{ - { - name: "Returns same VolumeHelper for same backup UID", - setup: func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) { - client := velerotest.NewFakeControllerRuntimeClient(t) - action := &pvcBackupItemAction{ - log: velerotest.NewLogger(), - crClient: client, - } - backup := &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - UID: types.UID("test-uid-1"), - }, - } - return action, backup, backup // Same backup instance - }, - wantSameCache: true, - }, - { - name: "Returns new VolumeHelper for different backup UID", - setup: func() (*pvcBackupItemAction, *velerov1api.Backup, *velerov1api.Backup) { - client := velerotest.NewFakeControllerRuntimeClient(t) - action := &pvcBackupItemAction{ - log: velerotest.NewLogger(), - crClient: client, - } - backup1 := &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup-1", - Namespace: "velero", - UID: types.UID("test-uid-1"), - }, - } - backup2 := &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup-2", - Namespace: "velero", - UID: types.UID("test-uid-2"), - }, - } - return action, backup1, backup2 // Different backup instances - }, - wantSameCache: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - action, backup1, backup2 := tt.setup() - - // Get VolumeHelper for first backup - vh1, err := action.getOrCreateVolumeHelper(backup1) - require.NoError(t, err) - require.NotNil(t, vh1) - - // Get VolumeHelper for second backup - vh2, err := action.getOrCreateVolumeHelper(backup2) - require.NoError(t, err) - require.NotNil(t, vh2) - - if tt.wantSameCache { - // Same backup UID should return same VolumeHelper pointer - require.Same(t, vh1, vh2, "Expected same VolumeHelper instance for same backup UID") - } else { - // Different backup UID should return different VolumeHelper pointer - require.NotSame(t, vh1, vh2, "Expected different VolumeHelper instance for different backup UID") - } - }) - } -} - -// TestGetOrCreateVolumeHelperConcurrency tests thread-safety of VolumeHelper caching -func TestGetOrCreateVolumeHelperConcurrency(t *testing.T) { client := velerotest.NewFakeControllerRuntimeClient(t) action := &pvcBackupItemAction{ log: velerotest.NewLogger(), @@ -2157,41 +2081,27 @@ func TestGetOrCreateVolumeHelperConcurrency(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-backup", Namespace: "velero", - UID: types.UID("test-uid"), + UID: types.UID("test-uid-1"), }, } - // Run multiple goroutines concurrently to get VolumeHelper - const numGoroutines = 10 - results := make(chan any, numGoroutines) - errors := make(chan error, numGoroutines) + // Initially, pvcPodCache should be nil + require.Nil(t, action.pvcPodCache, "pvcPodCache should be nil initially") - for i := 0; i < numGoroutines; i++ { - go func() { - vh, err := action.getOrCreateVolumeHelper(backup) - if err != nil { - errors <- err - return - } - results <- vh - }() - } + // Get VolumeHelper first time - should create new cache and VolumeHelper + vh1, err := action.getOrCreateVolumeHelper(backup) + require.NoError(t, err) + require.NotNil(t, vh1) - // Collect all results - var volumeHelpers []any - for i := 0; i < numGoroutines; i++ { - select { - case vh := <-results: - volumeHelpers = append(volumeHelpers, vh) - case err := <-errors: - 
t.Fatalf("Unexpected error: %v", err) - } - } + // pvcPodCache should now be initialized + require.NotNil(t, action.pvcPodCache, "pvcPodCache should be initialized after first call") + cache1 := action.pvcPodCache - // All goroutines should get the same VolumeHelper instance - require.Len(t, volumeHelpers, numGoroutines) - firstVH := volumeHelpers[0] - for i := 1; i < len(volumeHelpers); i++ { - require.Same(t, firstVH, volumeHelpers[i], "All goroutines should get the same VolumeHelper instance") - } + // Get VolumeHelper second time - should reuse the same cache + vh2, err := action.getOrCreateVolumeHelper(backup) + require.NoError(t, err) + require.NotNil(t, vh2) + + // The pvcPodCache should be the same instance + require.Same(t, cache1, action.pvcPodCache, "Expected same pvcPodCache instance on repeated calls") } diff --git a/pkg/util/podvolume/pod_volume.go b/pkg/util/podvolume/pod_volume.go index 1bf87a5fa..33d67dc05 100644 --- a/pkg/util/podvolume/pod_volume.go +++ b/pkg/util/podvolume/pod_volume.go @@ -113,6 +113,66 @@ func (c *PVCPodCache) IsBuilt() bool { return c.built } +// IsNamespaceBuilt returns true if the cache has been built for the given namespace. +func (c *PVCPodCache) IsNamespaceBuilt(namespace string) bool { + c.mu.RLock() + defer c.mu.RUnlock() + _, ok := c.cache[namespace] + return ok +} + +// BuildCacheForNamespace builds the cache for a single namespace lazily. +// This is used by plugins where namespaces are encountered one at a time. +// If the namespace is already cached, this is a no-op. +func (c *PVCPodCache) BuildCacheForNamespace( + ctx context.Context, + namespace string, + crClient crclient.Client, +) error { + // Check if already built (read lock first for performance) + c.mu.RLock() + if _, ok := c.cache[namespace]; ok { + c.mu.RUnlock() + return nil + } + c.mu.RUnlock() + + // Need to build - acquire write lock + c.mu.Lock() + defer c.mu.Unlock() + + // Double-check after acquiring write lock + if _, ok := c.cache[namespace]; ok { + return nil + } + + podList := new(corev1api.PodList) + if err := crClient.List( + ctx, + podList, + &crclient.ListOptions{Namespace: namespace}, + ); err != nil { + return errors.Wrapf(err, "failed to list pods in namespace %s", namespace) + } + + c.cache[namespace] = make(map[string][]corev1api.Pod) + + // Build mapping from PVC name to pods + for i := range podList.Items { + pod := podList.Items[i] + for _, v := range pod.Spec.Volumes { + if v.PersistentVolumeClaim != nil { + pvcName := v.PersistentVolumeClaim.ClaimName + c.cache[namespace][pvcName] = append(c.cache[namespace][pvcName], pod) + } + } + } + + // Mark as built for GetPodsUsingPVCWithCache fallback logic + c.built = true + return nil +} + // GetVolumesByPod returns a list of volume names to backup for the provided pod. func GetVolumesByPod(pod *corev1api.Pod, defaultVolumesToFsBackup, backupExcludePVC bool, volsToProcessByLegacyApproach []string) ([]string, []string) { // tracks the volumes that have been explicitly opted out of backup via the annotation in the pod diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index 6732904ef..af28ff9ef 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -1252,3 +1252,224 @@ func TestIsPVCDefaultToFSBackupWithCache(t *testing.T) { }) } } + +// TestIsNamespaceBuilt tests the IsNamespaceBuilt method for lazy per-namespace caching. 
+func TestIsNamespaceBuilt(t *testing.T) { + cache := NewPVCPodCache() + + // Initially no namespace should be built + assert.False(t, cache.IsNamespaceBuilt("ns1"), "namespace should not be built initially") + assert.False(t, cache.IsNamespaceBuilt("ns2"), "namespace should not be built initially") + + // Create a fake client with a pod in ns1 + pod := &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "ns1", + }, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, pod) + + // Build cache for ns1 + err := cache.BuildCacheForNamespace(t.Context(), "ns1", fakeClient) + require.NoError(t, err) + + // ns1 should be built, ns2 should not + assert.True(t, cache.IsNamespaceBuilt("ns1"), "namespace ns1 should be built") + assert.False(t, cache.IsNamespaceBuilt("ns2"), "namespace ns2 should not be built") + + // Build cache for ns2 (empty namespace) + err = cache.BuildCacheForNamespace(t.Context(), "ns2", fakeClient) + require.NoError(t, err) + + // Both should now be built + assert.True(t, cache.IsNamespaceBuilt("ns1"), "namespace ns1 should still be built") + assert.True(t, cache.IsNamespaceBuilt("ns2"), "namespace ns2 should now be built") +} + +// TestBuildCacheForNamespace tests the lazy per-namespace cache building. +func TestBuildCacheForNamespace(t *testing.T) { + tests := []struct { + name string + pods []runtime.Object + namespace string + expectedPVCs map[string]int // pvcName -> expected pod count + expectError bool + }{ + { + name: "build cache for namespace with pods using PVCs", + namespace: "ns1", + pods: []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "ns1"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + }, + expectedPVCs: map[string]int{"pvc1": 2}, + }, + { + name: "build cache for empty namespace", + namespace: "empty-ns", + pods: []runtime.Object{}, + expectedPVCs: map[string]int{}, + }, + { + name: "build cache ignores pods without PVCs", + namespace: "ns1", + pods: []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "config-vol", + VolumeSource: corev1api.VolumeSource{ + ConfigMap: &corev1api.ConfigMapVolumeSource{ + LocalObjectReference: corev1api.LocalObjectReference{ + Name: "my-config", + }, + }, + }, + }, + }, + }, + }, + }, + expectedPVCs: map[string]int{}, + }, + { + name: "build cache only for specified namespace", + namespace: "ns1", + pods: []runtime.Object{ + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: 
&corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + }, + &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "ns2"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc2", + }, + }, + }, + }, + }, + }, + }, + expectedPVCs: map[string]int{"pvc1": 1}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, tc.pods...) + cache := NewPVCPodCache() + + // Build cache for the namespace + err := cache.BuildCacheForNamespace(t.Context(), tc.namespace, fakeClient) + if tc.expectError { + require.Error(t, err) + return + } + require.NoError(t, err) + + // Verify namespace is marked as built + assert.True(t, cache.IsNamespaceBuilt(tc.namespace)) + + // Verify PVC to pod mappings + for pvcName, expectedCount := range tc.expectedPVCs { + pods := cache.GetPodsUsingPVC(tc.namespace, pvcName) + assert.Len(t, pods, expectedCount, "unexpected pod count for PVC %s", pvcName) + } + + // Calling BuildCacheForNamespace again should be a no-op + err = cache.BuildCacheForNamespace(t.Context(), tc.namespace, fakeClient) + require.NoError(t, err) + }) + } +} + +// TestBuildCacheForNamespaceIdempotent verifies that building cache multiple times is safe. +func TestBuildCacheForNamespaceIdempotent(t *testing.T) { + pod := &corev1api.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1"}, + Spec: corev1api.PodSpec{ + Volumes: []corev1api.Volume{ + { + Name: "vol1", + VolumeSource: corev1api.VolumeSource{ + PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ + ClaimName: "pvc1", + }, + }, + }, + }, + }, + } + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, pod) + cache := NewPVCPodCache() + + // Build cache multiple times - should be idempotent + for i := 0; i < 3; i++ { + err := cache.BuildCacheForNamespace(t.Context(), "ns1", fakeClient) + require.NoError(t, err) + assert.True(t, cache.IsNamespaceBuilt("ns1")) + + pods := cache.GetPodsUsingPVC("ns1", "pvc1") + assert.Len(t, pods, 1, "should have exactly 1 pod using pvc1") + } +} From 10ef43e14792a397958c110cc4b44e38264e5ca7 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Tue, 16 Dec 2025 12:40:55 -0800 Subject: [PATCH 16/19] Fix gofmt formatting issues in test files Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper_test.go | 10 +++++----- pkg/util/podvolume/pod_volume_test.go | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index c0f93f94f..8d6073c2b 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -1059,11 +1059,11 @@ func TestVolumeHelperImplWithCache_ShouldPerformFSBackup(t *testing.T) { // which is used by plugins that build the cache lazily per-namespace. 
func TestNewVolumeHelperImplWithCache(t *testing.T) { testCases := []struct { - name string - backup velerov1api.Backup - resourcePolicyConfigMap *corev1api.ConfigMap - pvcPodCache bool // whether to pass a cache - expectError bool + name string + backup velerov1api.Backup + resourcePolicyConfigMap *corev1api.ConfigMap + pvcPodCache bool // whether to pass a cache + expectError bool }{ { name: "creates VolumeHelper with nil cache", diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index af28ff9ef..3ba2a05e9 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -1302,11 +1302,11 @@ func TestIsNamespaceBuilt(t *testing.T) { // TestBuildCacheForNamespace tests the lazy per-namespace cache building. func TestBuildCacheForNamespace(t *testing.T) { tests := []struct { - name string - pods []runtime.Object - namespace string - expectedPVCs map[string]int // pvcName -> expected pod count - expectError bool + name string + pods []runtime.Object + namespace string + expectedPVCs map[string]int // pvcName -> expected pod count + expectError bool }{ { name: "build cache for namespace with pods using PVCs", From f592a264a6ea67409d1ce0be740ec521b245ddc0 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Wed, 17 Dec 2025 10:44:15 -0800 Subject: [PATCH 17/19] Address review feedback: remove deprecated functions Remove deprecated functions that were marked for removal per review: - Remove GetPodsUsingPVC (replaced by GetPodsUsingPVCWithCache) - Remove IsPVCDefaultToFSBackup (replaced by IsPVCDefaultToFSBackupWithCache) - Remove associated tests for deprecated functions - Add deprecation marker to NewVolumeHelperImpl - Add deprecation marker to ShouldPerformSnapshotWithBackup Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper.go | 5 + .../volumehelper/volume_policy_helper.go | 97 +----- .../volumehelper/volume_policy_helper_test.go | 138 +------- pkg/util/podvolume/pod_volume.go | 70 ++-- pkg/util/podvolume/pod_volume_test.go | 306 ------------------ 5 files changed, 50 insertions(+), 566 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper.go b/internal/volumehelper/volume_policy_helper.go index 63115cba2..160a2005e 100644 --- a/internal/volumehelper/volume_policy_helper.go +++ b/internal/volumehelper/volume_policy_helper.go @@ -41,6 +41,11 @@ type volumeHelperImpl struct { pvcPodCache *podvolumeutil.PVCPodCache } +// NewVolumeHelperImpl creates a VolumeHelper without PVC-to-Pod caching. +// +// Deprecated: Use NewVolumeHelperImplWithNamespaces or NewVolumeHelperImplWithCache instead +// for better performance. These functions provide PVC-to-Pod caching which avoids O(N*M) +// complexity when there are many PVCs and pods. See issue #9179 for details. func NewVolumeHelperImpl( volumePolicy *resourcepolicies.Policies, snapshotVolumes *bool, diff --git a/pkg/plugin/utils/volumehelper/volume_policy_helper.go b/pkg/plugin/utils/volumehelper/volume_policy_helper.go index 715db15d9..8fda7f5af 100644 --- a/pkg/plugin/utils/volumehelper/volume_policy_helper.go +++ b/pkg/plugin/utils/volumehelper/volume_policy_helper.go @@ -17,10 +17,7 @@ limitations under the License. package volumehelper import ( - "context" - "github.com/sirupsen/logrus" - corev1api "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" crclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -36,6 +33,11 @@ import ( // up on demand. 
On the other hand, the volumeHelperImpl assume there // is a VolumeHelper instance initialized before calling the // ShouldPerformXXX functions. +// +// Deprecated: Use ShouldPerformSnapshotWithVolumeHelper instead for better performance. +// ShouldPerformSnapshotWithVolumeHelper allows passing a pre-created VolumeHelper with +// an internal PVC-to-Pod cache, which avoids O(N*M) complexity when there are many PVCs and pods. +// See issue #9179 for details. func ShouldPerformSnapshotWithBackup( unstructured runtime.Unstructured, groupResource schema.GroupResource, @@ -92,92 +94,3 @@ func ShouldPerformSnapshotWithVolumeHelper( return volumeHelperImpl.ShouldPerformSnapshot(unstructured, groupResource) } - -// NewVolumeHelperForBackup creates a VolumeHelper for the given backup with a PVC-to-Pod cache. -// The cache is built for the provided namespaces list to avoid O(N*M) complexity when there -// are many PVCs and pods. See issue #9179 for details. -// -// This function is intended for BIA plugins to create a VolumeHelper once and reuse it -// across multiple Execute() calls for the same backup. -// -// If namespaces is nil or empty, the function will resolve the namespace list from the backup spec. -// If backup.Spec.IncludedNamespaces is empty (meaning all namespaces), it will list all namespaces -// from the cluster. -func NewVolumeHelperForBackup( - backup velerov1api.Backup, - crClient crclient.Client, - logger logrus.FieldLogger, - namespaces []string, -) (volumehelper.VolumeHelper, error) { - // If no namespaces provided, resolve from backup spec - if len(namespaces) == 0 { - var err error - namespaces, err = resolveNamespacesForBackup(backup, crClient) - if err != nil { - logger.WithError(err).Warn("Failed to resolve namespaces for cache, proceeding without cache") - namespaces = nil - } - } - - resourcePolicies, err := resourcepolicies.GetResourcePoliciesFromBackup( - backup, - crClient, - logger, - ) - if err != nil { - return nil, err - } - - return volumehelper.NewVolumeHelperImplWithNamespaces( - resourcePolicies, - backup.Spec.SnapshotVolumes, - logger, - crClient, - boolptr.IsSetToTrue(backup.Spec.DefaultVolumesToFsBackup), - true, - namespaces, - ) -} - -// resolveNamespacesForBackup determines which namespaces will be backed up. -// If IncludedNamespaces is specified, it returns those (excluding any in ExcludedNamespaces). -// If IncludedNamespaces is empty (meaning all namespaces), it lists all namespaces from the cluster. 
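// With the factory removed, callers that want namespace-scoped caching resolve
// the namespace list themselves and call the internal constructor directly, as
// the updated test below now does. A sketch of the equivalent wiring (the
// wrapper name is hypothetical; it mirrors the body being removed here):
func newNamespaceScopedHelper(
	backup velerov1api.Backup,
	crClient crclient.Client,
	logger logrus.FieldLogger,
	namespaces []string,
) (volumehelper.VolumeHelper, error) {
	resourcePolicies, err := resourcepolicies.GetResourcePoliciesFromBackup(backup, crClient, logger)
	if err != nil {
		return nil, err
	}
	return volumehelper.NewVolumeHelperImplWithNamespaces(
		resourcePolicies,
		backup.Spec.SnapshotVolumes,
		logger,
		crClient,
		boolptr.IsSetToTrue(backup.Spec.DefaultVolumesToFsBackup),
		true, // backupExcludePVC, as the removed factory passed
		namespaces,
	)
}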
-func resolveNamespacesForBackup(backup velerov1api.Backup, crClient crclient.Client) ([]string, error) { - // If specific namespaces are included, use those - if len(backup.Spec.IncludedNamespaces) > 0 { - // Filter out excluded namespaces - excludeSet := make(map[string]bool) - for _, ns := range backup.Spec.ExcludedNamespaces { - excludeSet[ns] = true - } - - var namespaces []string - for _, ns := range backup.Spec.IncludedNamespaces { - if !excludeSet[ns] { - namespaces = append(namespaces, ns) - } - } - return namespaces, nil - } - - // IncludedNamespaces is empty, meaning all namespaces - list from cluster - nsList := &corev1api.NamespaceList{} - if err := crClient.List(context.Background(), nsList); err != nil { - return nil, err - } - - // Filter out excluded namespaces - excludeSet := make(map[string]bool) - for _, ns := range backup.Spec.ExcludedNamespaces { - excludeSet[ns] = true - } - - var namespaces []string - for _, ns := range nsList.Items { - if !excludeSet[ns.Name] { - namespaces = append(namespaces, ns.Name) - } - } - - return namespaces, nil -} diff --git a/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go b/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go index b4d835a76..08b23ae04 100644 --- a/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go +++ b/pkg/plugin/utils/volumehelper/volume_policy_helper_test.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "github.com/vmware-tanzu/velero/internal/volumehelper" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/kuberesource" velerotest "github.com/vmware-tanzu/velero/pkg/test" @@ -290,8 +291,16 @@ func TestShouldPerformSnapshotWithNonNilVolumeHelper(t *testing.T) { logger := logrus.New() - // Create VolumeHelper using the factory function - vh, err := NewVolumeHelperForBackup(*backup, client, logger, []string{"default"}) + // Create VolumeHelper using the internal function with namespace caching + vh, err := volumehelper.NewVolumeHelperImplWithNamespaces( + nil, // no resource policies for this test + nil, // snapshotVolumes not set + logger, + client, + false, // defaultVolumesToFSBackup + true, // backupExcludePVC + []string{"default"}, + ) require.NoError(t, err) require.NotNil(t, vh) @@ -313,128 +322,3 @@ func TestShouldPerformSnapshotWithNonNilVolumeHelper(t *testing.T) { require.NoError(t, err) require.True(t, result, "Should return true for snapshot when snapshotVolumes not set") } - -func TestNewVolumeHelperForBackup(t *testing.T) { - tests := []struct { - name string - backup *velerov1api.Backup - namespaces []string - wantError bool - }{ - { - name: "Creates VolumeHelper with explicit namespaces", - backup: &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - }, - Spec: velerov1api.BackupSpec{ - IncludedNamespaces: []string{"ns1", "ns2"}, - }, - }, - namespaces: []string{"ns1", "ns2"}, - wantError: false, - }, - { - name: "Creates VolumeHelper with namespaces from backup spec", - backup: &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - }, - Spec: velerov1api.BackupSpec{ - IncludedNamespaces: []string{"ns1", "ns2"}, - }, - }, - namespaces: nil, // Will be resolved from backup spec - wantError: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - client := velerotest.NewFakeControllerRuntimeClient(t) - logger := logrus.New() - - vh, err 
:= NewVolumeHelperForBackup(*tt.backup, client, logger, tt.namespaces) - - if tt.wantError { - require.Error(t, err) - require.Nil(t, vh) - } else { - require.NoError(t, err) - require.NotNil(t, vh) - } - }) - } -} - -func TestResolveNamespacesForBackup(t *testing.T) { - tests := []struct { - name string - backup *velerov1api.Backup - existingNS []string - expectedResult []string - }{ - { - name: "Returns included namespaces", - backup: &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - }, - Spec: velerov1api.BackupSpec{ - IncludedNamespaces: []string{"ns1", "ns2", "ns3"}, - }, - }, - existingNS: []string{"ns1", "ns2", "ns3", "ns4"}, - expectedResult: []string{"ns1", "ns2", "ns3"}, - }, - { - name: "Excludes specified namespaces from included list", - backup: &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - }, - Spec: velerov1api.BackupSpec{ - IncludedNamespaces: []string{"ns1", "ns2", "ns3"}, - ExcludedNamespaces: []string{"ns2"}, - }, - }, - existingNS: []string{"ns1", "ns2", "ns3", "ns4"}, - expectedResult: []string{"ns1", "ns3"}, - }, - { - name: "Returns all namespaces when IncludedNamespaces is empty", - backup: &velerov1api.Backup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-backup", - Namespace: "velero", - }, - Spec: velerov1api.BackupSpec{}, - }, - existingNS: []string{"default", "kube-system", "app-ns"}, - expectedResult: []string{"default", "kube-system", "app-ns"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create fake client with namespaces - var objects []runtime.Object - for _, ns := range tt.existingNS { - objects = append(objects, &corev1api.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: ns, - }, - }) - } - client := velerotest.NewFakeControllerRuntimeClient(t, objects...) - - result, err := resolveNamespacesForBackup(*tt.backup, client) - require.NoError(t, err) - require.ElementsMatch(t, tt.expectedResult, result) - }) - } -} diff --git a/pkg/util/podvolume/pod_volume.go b/pkg/util/podvolume/pod_volume.go index 33d67dc05..7c7e0f9c4 100644 --- a/pkg/util/podvolume/pod_volume.go +++ b/pkg/util/podvolume/pod_volume.go @@ -253,19 +253,8 @@ func GetVolumesToExclude(obj metav1.Object) []string { return strings.Split(annotations[velerov1api.VolumesToExcludeAnnotation], ",") } -// IsPVCDefaultToFSBackup checks if a PVC should default to fs-backup based on pod annotations. -// Deprecated: Use IsPVCDefaultToFSBackupWithCache for better performance. -func IsPVCDefaultToFSBackup(pvcNamespace, pvcName string, crClient crclient.Client, defaultVolumesToFsBackup bool) (bool, error) { - pods, err := GetPodsUsingPVC(pvcNamespace, pvcName, crClient) - if err != nil { - return false, errors.WithStack(err) - } - - return checkPodsForFSBackup(pods, pvcName, defaultVolumesToFsBackup) -} - -// IsPVCDefaultToFSBackupWithCache is the cached version of IsPVCDefaultToFSBackup. -// If cache is nil or not built, it falls back to the non-cached version. +// IsPVCDefaultToFSBackupWithCache checks if a PVC should default to fs-backup based on pod annotations. +// If cache is nil or not built, it falls back to listing pods directly. // Note: In the main backup path, the cache is always built (via NewVolumeHelperImplWithNamespaces), // so the fallback is only used by plugins that don't need cache optimization. 
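// A condensed sketch of the decision delegated to checkPodsForFSBackup, as the
// removed tests further down exercised it. The real code first resolves each
// pod's volume name for the PVC via getPodVolumeNameForPVC; the helper name
// here is hypothetical and the semantics are inferred from those tests:
func defaultsToFSBackup(pods []corev1api.Pod, podVolName string, defaultToFS bool) bool {
	inList := func(csv, name string) bool {
		for _, v := range strings.Split(csv, ",") {
			if v == name {
				return true
			}
		}
		return false
	}
	for i := range pods {
		optIn := inList(pods[i].Annotations["backup.velero.io/backup-volumes"], podVolName)
		optOut := inList(pods[i].Annotations[velerov1api.VolumesToExcludeAnnotation], podVolName)
		// Any single pod opting the volume in (or, under defaultVolumesToFsBackup,
		// not opting it out) selects the PVC for fs-backup.
		if optIn || (defaultToFS && !optOut) {
			return true
		}
	}
	return false
}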
func IsPVCDefaultToFSBackupWithCache( @@ -281,7 +270,7 @@ func IsPVCDefaultToFSBackupWithCache( if cache != nil && cache.IsBuilt() { pods = cache.GetPodsUsingPVC(pvcNamespace, pvcName) } else { - pods, err = GetPodsUsingPVC(pvcNamespace, pvcName, crClient) + pods, err = getPodsUsingPVCDirect(pvcNamespace, pvcName, crClient) if err != nil { return false, errors.WithStack(err) } @@ -318,10 +307,32 @@ func getPodVolumeNameForPVC(pod corev1api.Pod, pvcName string) (string, error) { return "", errors.Errorf("Pod %s/%s does not use PVC %s/%s", pod.Namespace, pod.Name, pod.Namespace, pvcName) } -// GetPodsUsingPVC returns all pods in the given namespace that use the specified PVC. -// This function lists all pods in the namespace and filters them, which can be slow -// when there are many pods. Consider using GetPodsUsingPVCWithCache for better performance. -func GetPodsUsingPVC( +// GetPodsUsingPVCWithCache returns all pods that use the specified PVC. +// If cache is available and built, it uses the cache for O(1) lookup. +// Otherwise, it falls back to listing pods directly. +// Note: In the main backup path, the cache is always built (via NewVolumeHelperImplWithNamespaces), +// so the fallback is only used by plugins that don't need cache optimization. +func GetPodsUsingPVCWithCache( + pvcNamespace, pvcName string, + crClient crclient.Client, + cache *PVCPodCache, +) ([]corev1api.Pod, error) { + // Use cache if available + if cache != nil && cache.IsBuilt() { + pods := cache.GetPodsUsingPVC(pvcNamespace, pvcName) + if pods == nil { + return []corev1api.Pod{}, nil + } + return pods, nil + } + + // Fall back to direct lookup (for plugins without cache) + return getPodsUsingPVCDirect(pvcNamespace, pvcName, crClient) +} + +// getPodsUsingPVCDirect returns all pods in the given namespace that use the specified PVC. +// This is an internal function that lists all pods in the namespace and filters them. +func getPodsUsingPVCDirect( pvcNamespace, pvcName string, crClient crclient.Client, ) ([]corev1api.Pod, error) { @@ -346,29 +357,6 @@ func GetPodsUsingPVC( return podsUsingPVC, nil } -// GetPodsUsingPVCWithCache returns all pods that use the specified PVC. -// If cache is available and built, it uses the cache for O(1) lookup. -// Otherwise, it falls back to the original GetPodsUsingPVC function. -// Note: In the main backup path, the cache is always built (via NewVolumeHelperImplWithNamespaces), -// so the fallback is only used by plugins that don't need cache optimization. 
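// A caller-side sketch of the contract kept by the relocated function above: a
// built cache returns an empty slice, never nil, for a PVC no pod mounts, so
// callers can branch on length alone (the wrapper name is hypothetical):
func pvcIsMounted(pvcNS, pvcName string, c crclient.Client, cache *PVCPodCache) (bool, error) {
	// With a built cache this is a map lookup; with a nil cache it costs one
	// namespace-wide pod List, the same as the pre-cache code path.
	pods, err := GetPodsUsingPVCWithCache(pvcNS, pvcName, c, cache)
	if err != nil {
		return false, err
	}
	return len(pods) > 0, nil
}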
-func GetPodsUsingPVCWithCache( - pvcNamespace, pvcName string, - crClient crclient.Client, - cache *PVCPodCache, -) ([]corev1api.Pod, error) { - // Use cache if available - if cache != nil && cache.IsBuilt() { - pods := cache.GetPodsUsingPVC(pvcNamespace, pvcName) - if pods == nil { - return []corev1api.Pod{}, nil - } - return pods, nil - } - - // Fall back to direct lookup - return GetPodsUsingPVC(pvcNamespace, pvcName, crClient) -} - func GetVolumesToProcess(volumes []corev1api.Volume, volsToProcessByLegacyApproach []string) []corev1api.Volume { volsToProcess := make([]corev1api.Volume, 0) diff --git a/pkg/util/podvolume/pod_volume_test.go b/pkg/util/podvolume/pod_volume_test.go index 3ba2a05e9..a3484c2e3 100644 --- a/pkg/util/podvolume/pod_volume_test.go +++ b/pkg/util/podvolume/pod_volume_test.go @@ -382,196 +382,6 @@ func TestGetVolumesByPod(t *testing.T) { } } -func TestIsPVCDefaultToFSBackup(t *testing.T) { - objs := []runtime.Object{ - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: "default", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod2", - Namespace: "default", - Annotations: map[string]string{ - "backup.velero.io/backup-volumes": "csi-vol1", - }, - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", - Namespace: "default", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - EmptyDir: &corev1api.EmptyDirVolumeSource{}, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "awesome-pod-1", - Namespace: "awesome-ns", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "awesome-csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "awesome-pod-2", - Namespace: "awesome-ns", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "awesome-csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: "uploader-ns", - Annotations: map[string]string{ - "backup.velero.io/backup-volumes": "csi-vol1", - }, - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod2", - Namespace: "uploader-ns", - Annotations: map[string]string{ - "backup.velero.io/backup-volumes": "csi-vol1", - }, - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: 
&corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - } - fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) - - testCases := []struct { - name string - inPVCNamespace string - inPVCName string - expectedIsFSUploaderUsed bool - defaultVolumesToFSBackup bool - }{ - { - name: "2 pods using PVC, 1 pod using uploader", - inPVCNamespace: "default", - inPVCName: "csi-pvc1", - expectedIsFSUploaderUsed: true, - defaultVolumesToFSBackup: false, - }, - { - name: "2 pods using PVC, 2 pods using uploader", - inPVCNamespace: "uploader-ns", - inPVCName: "csi-pvc1", - expectedIsFSUploaderUsed: true, - defaultVolumesToFSBackup: false, - }, - { - name: "2 pods using PVC, 0 pods using uploader", - inPVCNamespace: "awesome-ns", - inPVCName: "awesome-csi-pvc1", - expectedIsFSUploaderUsed: false, - defaultVolumesToFSBackup: false, - }, - { - name: "0 pods using PVC", - inPVCNamespace: "default", - inPVCName: "does-not-exist", - expectedIsFSUploaderUsed: false, - defaultVolumesToFSBackup: false, - }, - { - name: "2 pods using PVC, using uploader by default", - inPVCNamespace: "awesome-ns", - inPVCName: "awesome-csi-pvc1", - expectedIsFSUploaderUsed: true, - defaultVolumesToFSBackup: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - actualIsFSUploaderUsed, _ := IsPVCDefaultToFSBackup(tc.inPVCNamespace, tc.inPVCName, fakeClient, tc.defaultVolumesToFSBackup) - assert.Equal(t, tc.expectedIsFSUploaderUsed, actualIsFSUploaderUsed) - }) - } -} - func TestGetPodVolumeNameForPVC(t *testing.T) { testCases := []struct { name string @@ -677,122 +487,6 @@ func TestGetPodVolumeNameForPVC(t *testing.T) { } } -func TestGetPodsUsingPVC(t *testing.T) { - objs := []runtime.Object{ - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: "default", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod2", - Namespace: "default", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", - Namespace: "default", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - EmptyDir: &corev1api.EmptyDirVolumeSource{}, - }, - }, - }, - }, - }, - &corev1api.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: "awesome-ns", - }, - Spec: corev1api.PodSpec{ - Volumes: []corev1api.Volume{ - { - Name: "csi-vol1", - VolumeSource: corev1api.VolumeSource{ - PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{ - ClaimName: "csi-pvc1", - }, - }, - }, - }, - }, - }, - } - fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...) 
- - testCases := []struct { - name string - pvcNamespace string - pvcName string - expectedPodCount int - }{ - { - name: "should find exactly 2 pods using the PVC", - pvcNamespace: "default", - pvcName: "csi-pvc1", - expectedPodCount: 2, - }, - { - name: "should find exactly 1 pod using the PVC", - pvcNamespace: "awesome-ns", - pvcName: "csi-pvc1", - expectedPodCount: 1, - }, - { - name: "should find 0 pods using the PVC", - pvcNamespace: "default", - pvcName: "unused-pvc", - expectedPodCount: 0, - }, - { - name: "should find 0 pods in non-existent namespace", - pvcNamespace: "does-not-exist", - pvcName: "csi-pvc1", - expectedPodCount: 0, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - actualPods, err := GetPodsUsingPVC(tc.pvcNamespace, tc.pvcName, fakeClient) - require.NoErrorf(t, err, "Want error=nil; Got error=%v", err) - assert.Lenf(t, actualPods, tc.expectedPodCount, "unexpected number of pods in result; Want: %d; Got: %d", tc.expectedPodCount, len(actualPods)) - }) - } -} - func TestGetVolumesToProcess(t *testing.T) { testCases := []struct { name string From 4ba2effaac47be137918e43bb3f5beaf100db7d5 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Wed, 17 Dec 2025 10:54:11 -0800 Subject: [PATCH 18/19] Add nolint directives for intentional deprecated function usage The ShouldPerformSnapshotWithVolumeHelper function and tests intentionally use NewVolumeHelperImpl (deprecated) for backwards compatibility with third-party plugins. Add nolint:staticcheck to suppress the linter warnings with explanatory comments. Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper_test.go | 2 ++ pkg/plugin/utils/volumehelper/volume_policy_helper.go | 1 + 2 files changed, 3 insertions(+) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 8d6073c2b..278c2af68 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -322,6 +322,7 @@ func TestVolumeHelperImpl_ShouldPerformSnapshot(t *testing.T) { t.Fatalf("failed to build policy with error %v", err) } } + //nolint:staticcheck // Testing deprecated function for backwards compatibility vh := NewVolumeHelperImpl( p, tc.snapshotVolumesFlag, @@ -687,6 +688,7 @@ func TestVolumeHelperImpl_ShouldPerformFSBackup(t *testing.T) { t.Fatalf("failed to build policy with error %v", err) } } + //nolint:staticcheck // Testing deprecated function for backwards compatibility vh := NewVolumeHelperImpl( p, tc.snapshotVolumesFlag, diff --git a/pkg/plugin/utils/volumehelper/volume_policy_helper.go b/pkg/plugin/utils/volumehelper/volume_policy_helper.go index 8fda7f5af..a19f8d7c8 100644 --- a/pkg/plugin/utils/volumehelper/volume_policy_helper.go +++ b/pkg/plugin/utils/volumehelper/volume_policy_helper.go @@ -83,6 +83,7 @@ func ShouldPerformSnapshotWithVolumeHelper( return false, err } + //nolint:staticcheck // Intentional use of deprecated function for backwards compatibility volumeHelperImpl := volumehelper.NewVolumeHelperImpl( resourcePolicies, backup.Spec.SnapshotVolumes, From 78e94700286fdf1a56a66504023e80908d47c02d Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Wed, 17 Dec 2025 11:01:00 -0800 Subject: [PATCH 19/19] Remove unnecessary nolint directives from test file The nolint:staticcheck directives are not needed in the test file because it calls NewVolumeHelperImpl within the same package, which doesn't trigger deprecation warnings. 
Only cross-package calls need the directive. Signed-off-by: Shubham Pampattiwar --- internal/volumehelper/volume_policy_helper_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/volumehelper/volume_policy_helper_test.go b/internal/volumehelper/volume_policy_helper_test.go index 278c2af68..8d6073c2b 100644 --- a/internal/volumehelper/volume_policy_helper_test.go +++ b/internal/volumehelper/volume_policy_helper_test.go @@ -322,7 +322,6 @@ func TestVolumeHelperImpl_ShouldPerformSnapshot(t *testing.T) { t.Fatalf("failed to build policy with error %v", err) } } - //nolint:staticcheck // Testing deprecated function for backwards compatibility vh := NewVolumeHelperImpl( p, tc.snapshotVolumesFlag, @@ -688,7 +687,6 @@ func TestVolumeHelperImpl_ShouldPerformFSBackup(t *testing.T) { t.Fatalf("failed to build policy with error %v", err) } } - //nolint:staticcheck // Testing deprecated function for backwards compatibility vh := NewVolumeHelperImpl( p, tc.snapshotVolumesFlag,
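// The behavior the last two commits rely on, sketched below: staticcheck's
// SA1019 does not flag a deprecated identifier used inside the package that
// declares it. Argument lists are elided and the call sites are illustrative:
//
//	// in package volumehelper, which declares the deprecated constructor:
//	vh := NewVolumeHelperImpl(p, tc.snapshotVolumesFlag, ...) // not flagged, no directive needed
//
//	// in any other package:
//	//nolint:staticcheck // intentional use of deprecated function
//	vh := volumehelper.NewVolumeHelperImpl(resourcePolicies, backup.Spec.SnapshotVolumes, ...) // flagged without the directive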