Add PVC-to-Pod cache to improve volume policy performance

The GetPodsUsingPVC function had O(N*M) complexity - for each PVC,
it listed ALL pods in the namespace and iterated through each pod.
With many PVCs and pods, this caused significant performance
degradation (2+ seconds per PV in some cases).

This change introduces a PVC-to-Pod cache that is built once per
backup and reused for all PVC lookups, reducing complexity from
O(N*M) to O(N+M).

Changes:
- Add PVCPodCache struct with thread-safe caching in podvolume pkg
- Add NewVolumeHelperImplWithCache constructor for cache support
- Build cache before backup item processing in backup.go
- Add comprehensive unit tests for cache functionality
- Graceful fallback to direct lookups if cache fails

Fixes #9179

Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
This commit is contained in:
Shubham Pampattiwar
2025-12-02 12:27:35 -08:00
parent a1026cb531
commit bcdc30b59a
4 changed files with 553 additions and 2 deletions

View File

@@ -33,6 +33,9 @@ type volumeHelperImpl struct {
// to the volume policy check, but fs-backup is based on the pod resource,
// the resource filter on PVC and PV doesn't work on this scenario.
backupExcludePVC bool
// pvcPodCache provides cached PVC to Pod mappings for improved performance.
// When there are many PVCs and pods, using this cache avoids O(N*M) lookups.
pvcPodCache *podvolumeutil.PVCPodCache
}
func NewVolumeHelperImpl(
@@ -50,6 +53,29 @@ func NewVolumeHelperImpl(
client: client,
defaultVolumesToFSBackup: defaultVolumesToFSBackup,
backupExcludePVC: backupExcludePVC,
pvcPodCache: nil, // Cache will be nil by default for backward compatibility
}
}
// NewVolumeHelperImplWithCache creates a VolumeHelper with a PVC-to-Pod cache for improved performance.
// The cache should be built before backup processing begins.
func NewVolumeHelperImplWithCache(
volumePolicy *resourcepolicies.Policies,
snapshotVolumes *bool,
logger logrus.FieldLogger,
client crclient.Client,
defaultVolumesToFSBackup bool,
backupExcludePVC bool,
pvcPodCache *podvolumeutil.PVCPodCache,
) VolumeHelper {
return &volumeHelperImpl{
volumePolicy: volumePolicy,
snapshotVolumes: snapshotVolumes,
logger: logger,
client: client,
defaultVolumesToFSBackup: defaultVolumesToFSBackup,
backupExcludePVC: backupExcludePVC,
pvcPodCache: pvcPodCache,
}
}
@@ -105,10 +131,12 @@ func (v *volumeHelperImpl) ShouldPerformSnapshot(obj runtime.Unstructured, group
// If this PV is claimed, see if we've already taken a (pod volume backup)
// snapshot of the contents of this PV. If so, don't take a snapshot.
if pv.Spec.ClaimRef != nil {
pods, err := podvolumeutil.GetPodsUsingPVC(
// Use cached lookup if available for better performance with many PVCs/pods
pods, err := podvolumeutil.GetPodsUsingPVCWithCache(
pv.Spec.ClaimRef.Namespace,
pv.Spec.ClaimRef.Name,
v.client,
v.pvcPodCache,
)
if err != nil {
v.logger.WithError(err).Errorf("fail to get pod for PV %s", pv.Name)

View File

@@ -64,6 +64,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/util/boolptr"
"github.com/vmware-tanzu/velero/pkg/util/collections"
"github.com/vmware-tanzu/velero/pkg/util/kube"
podvolumeutil "github.com/vmware-tanzu/velero/pkg/util/podvolume"
)
// BackupVersion is the current backup major version for Velero.
@@ -408,6 +409,21 @@ func (kb *kubernetesBackupper) BackupWithResolvers(
}
backupRequest.Status.Progress = &velerov1api.BackupProgress{TotalItems: len(items)}
// Build PVC-to-Pod cache for improved volume policy lookup performance.
// This avoids O(N*M) complexity when there are many PVCs and pods.
// See issue #9179 for details.
pvcPodCache := podvolumeutil.NewPVCPodCache()
namespaces := backupRequest.NamespaceIncludesExcludes.GetIncludes()
if len(namespaces) > 0 {
if err := pvcPodCache.BuildCacheForNamespaces(context.Background(), namespaces, kb.kbClient); err != nil {
// Log warning but continue - the cache will fall back to direct lookups
log.WithError(err).Warn("Failed to build PVC-to-Pod cache, falling back to direct lookups")
pvcPodCache = nil
} else {
log.Infof("Built PVC-to-Pod cache for %d namespaces", len(namespaces))
}
}
itemBackupper := &itemBackupper{
backupRequest: backupRequest,
tarWriter: tw,
@@ -422,13 +438,14 @@ func (kb *kubernetesBackupper) BackupWithResolvers(
PodCommandExecutor: kb.podCommandExecutor,
},
hookTracker: hook.NewHookTracker(),
volumeHelperImpl: volumehelper.NewVolumeHelperImpl(
volumeHelperImpl: volumehelper.NewVolumeHelperImplWithCache(
backupRequest.ResPolicies,
backupRequest.Spec.SnapshotVolumes,
log,
kb.kbClient,
boolptr.IsSetToTrue(backupRequest.Spec.DefaultVolumesToFsBackup),
!backupRequest.ResourceIncludesExcludes.ShouldInclude(kuberesource.PersistentVolumeClaims.String()),
pvcPodCache,
),
kubernetesBackupper: kb,
}

View File

@@ -19,6 +19,7 @@ package podvolume
import (
"context"
"strings"
"sync"
"github.com/pkg/errors"
corev1api "k8s.io/api/core/v1"
@@ -29,6 +30,89 @@ import (
"github.com/vmware-tanzu/velero/pkg/util"
)
// PVCPodCache provides a cached mapping from PVC to the pods that use it.
// This cache is built once per backup to avoid repeated pod listings which
// cause O(N*M) performance issues when there are many PVCs and pods.
type PVCPodCache struct {
mu sync.RWMutex
// cache maps namespace -> pvcName -> []Pod
cache map[string]map[string][]corev1api.Pod
// built indicates whether the cache has been populated
built bool
}
// NewPVCPodCache creates a new empty PVC to Pod cache.
func NewPVCPodCache() *PVCPodCache {
return &PVCPodCache{
cache: make(map[string]map[string][]corev1api.Pod),
built: false,
}
}
// BuildCacheForNamespaces builds the cache by listing pods once per namespace.
// This is much more efficient than listing pods for each PVC lookup.
func (c *PVCPodCache) BuildCacheForNamespaces(
ctx context.Context,
namespaces []string,
crClient crclient.Client,
) error {
c.mu.Lock()
defer c.mu.Unlock()
for _, ns := range namespaces {
podList := new(corev1api.PodList)
if err := crClient.List(
ctx,
podList,
&crclient.ListOptions{Namespace: ns},
); err != nil {
return errors.Wrapf(err, "failed to list pods in namespace %s", ns)
}
if c.cache[ns] == nil {
c.cache[ns] = make(map[string][]corev1api.Pod)
}
// Build mapping from PVC name to pods
for i := range podList.Items {
pod := podList.Items[i]
for _, v := range pod.Spec.Volumes {
if v.PersistentVolumeClaim != nil {
pvcName := v.PersistentVolumeClaim.ClaimName
c.cache[ns][pvcName] = append(c.cache[ns][pvcName], pod)
}
}
}
}
c.built = true
return nil
}
// GetPodsUsingPVC retrieves pods using a specific PVC from the cache.
// Returns nil slice if the PVC is not found in the cache.
func (c *PVCPodCache) GetPodsUsingPVC(namespace, pvcName string) []corev1api.Pod {
c.mu.RLock()
defer c.mu.RUnlock()
if nsPods, ok := c.cache[namespace]; ok {
if pods, ok := nsPods[pvcName]; ok {
// Return a copy to avoid race conditions
result := make([]corev1api.Pod, len(pods))
copy(result, pods)
return result
}
}
return nil
}
// IsBuilt returns true if the cache has been built.
func (c *PVCPodCache) IsBuilt() bool {
c.mu.RLock()
defer c.mu.RUnlock()
return c.built
}
// GetVolumesByPod returns a list of volume names to backup for the provided pod.
func GetVolumesByPod(pod *corev1api.Pod, defaultVolumesToFsBackup, backupExcludePVC bool, volsToProcessByLegacyApproach []string) ([]string, []string) {
// tracks the volumes that have been explicitly opted out of backup via the annotation in the pod
@@ -109,12 +193,44 @@ func GetVolumesToExclude(obj metav1.Object) []string {
return strings.Split(annotations[velerov1api.VolumesToExcludeAnnotation], ",")
}
// IsPVCDefaultToFSBackup checks if a PVC should default to fs-backup based on pod annotations.
// Deprecated: Use IsPVCDefaultToFSBackupWithCache for better performance.
func IsPVCDefaultToFSBackup(pvcNamespace, pvcName string, crClient crclient.Client, defaultVolumesToFsBackup bool) (bool, error) {
pods, err := GetPodsUsingPVC(pvcNamespace, pvcName, crClient)
if err != nil {
return false, errors.WithStack(err)
}
return checkPodsForFSBackup(pods, pvcName, defaultVolumesToFsBackup)
}
// IsPVCDefaultToFSBackupWithCache is the cached version of IsPVCDefaultToFSBackup.
// If cache is nil or not built, it falls back to the non-cached version.
func IsPVCDefaultToFSBackupWithCache(
pvcNamespace, pvcName string,
crClient crclient.Client,
defaultVolumesToFsBackup bool,
cache *PVCPodCache,
) (bool, error) {
var pods []corev1api.Pod
var err error
// Use cache if available, otherwise fall back to direct lookup
if cache != nil && cache.IsBuilt() {
pods = cache.GetPodsUsingPVC(pvcNamespace, pvcName)
} else {
pods, err = GetPodsUsingPVC(pvcNamespace, pvcName, crClient)
if err != nil {
return false, errors.WithStack(err)
}
}
return checkPodsForFSBackup(pods, pvcName, defaultVolumesToFsBackup)
}
// checkPodsForFSBackup is a helper function that checks if any pod using the PVC
// has the volume selected for fs-backup.
func checkPodsForFSBackup(pods []corev1api.Pod, pvcName string, defaultVolumesToFsBackup bool) (bool, error) {
for index := range pods {
vols, _ := GetVolumesByPod(&pods[index], defaultVolumesToFsBackup, false, []string{})
if len(vols) > 0 {
@@ -140,6 +256,9 @@ func getPodVolumeNameForPVC(pod corev1api.Pod, pvcName string) (string, error) {
return "", errors.Errorf("Pod %s/%s does not use PVC %s/%s", pod.Namespace, pod.Name, pod.Namespace, pvcName)
}
// GetPodsUsingPVC returns all pods in the given namespace that use the specified PVC.
// This function lists all pods in the namespace and filters them, which can be slow
// when there are many pods. Consider using GetPodsUsingPVCWithCache for better performance.
func GetPodsUsingPVC(
pvcNamespace, pvcName string,
crClient crclient.Client,
@@ -165,6 +284,27 @@ func GetPodsUsingPVC(
return podsUsingPVC, nil
}
// GetPodsUsingPVCWithCache returns all pods that use the specified PVC.
// If cache is available and built, it uses the cache for O(1) lookup.
// Otherwise, it falls back to the original GetPodsUsingPVC function.
func GetPodsUsingPVCWithCache(
pvcNamespace, pvcName string,
crClient crclient.Client,
cache *PVCPodCache,
) ([]corev1api.Pod, error) {
// Use cache if available
if cache != nil && cache.IsBuilt() {
pods := cache.GetPodsUsingPVC(pvcNamespace, pvcName)
if pods == nil {
return []corev1api.Pod{}, nil
}
return pods, nil
}
// Fall back to direct lookup
return GetPodsUsingPVC(pvcNamespace, pvcName, crClient)
}
func GetVolumesToProcess(volumes []corev1api.Volume, volsToProcessByLegacyApproach []string) []corev1api.Volume {
volsToProcess := make([]corev1api.Volume, 0)

View File

@@ -886,3 +886,369 @@ func TestGetVolumesToProcess(t *testing.T) {
})
}
}
func TestPVCPodCache_BuildAndGet(t *testing.T) {
objs := []runtime.Object{
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
},
},
},
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod2",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
{
Name: "vol2",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc2",
},
},
},
},
},
},
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod3",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
EmptyDir: &corev1api.EmptyDirVolumeSource{},
},
},
},
},
},
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod4",
Namespace: "other-ns",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
},
},
},
}
fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...)
testCases := []struct {
name string
namespaces []string
pvcNamespace string
pvcName string
expectedPodCount int
}{
{
name: "should find 2 pods using pvc1 in default namespace",
namespaces: []string{"default", "other-ns"},
pvcNamespace: "default",
pvcName: "pvc1",
expectedPodCount: 2,
},
{
name: "should find 1 pod using pvc2 in default namespace",
namespaces: []string{"default", "other-ns"},
pvcNamespace: "default",
pvcName: "pvc2",
expectedPodCount: 1,
},
{
name: "should find 1 pod using pvc1 in other-ns",
namespaces: []string{"default", "other-ns"},
pvcNamespace: "other-ns",
pvcName: "pvc1",
expectedPodCount: 1,
},
{
name: "should find 0 pods for non-existent PVC",
namespaces: []string{"default", "other-ns"},
pvcNamespace: "default",
pvcName: "non-existent",
expectedPodCount: 0,
},
{
name: "should find 0 pods for non-existent namespace",
namespaces: []string{"default", "other-ns"},
pvcNamespace: "non-existent-ns",
pvcName: "pvc1",
expectedPodCount: 0,
},
{
name: "should find 0 pods when namespace not in cache",
namespaces: []string{"default"},
pvcNamespace: "other-ns",
pvcName: "pvc1",
expectedPodCount: 0,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
cache := NewPVCPodCache()
err := cache.BuildCacheForNamespaces(t.Context(), tc.namespaces, fakeClient)
require.NoError(t, err)
assert.True(t, cache.IsBuilt())
pods := cache.GetPodsUsingPVC(tc.pvcNamespace, tc.pvcName)
assert.Len(t, pods, tc.expectedPodCount, "unexpected number of pods")
})
}
}
func TestGetPodsUsingPVCWithCache(t *testing.T) {
objs := []runtime.Object{
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
},
},
},
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod2",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
},
},
},
}
fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...)
testCases := []struct {
name string
pvcNamespace string
pvcName string
buildCache bool
useNilCache bool
expectedPodCount int
}{
{
name: "returns cached results when cache is available",
pvcNamespace: "default",
pvcName: "pvc1",
buildCache: true,
useNilCache: false,
expectedPodCount: 2,
},
{
name: "falls back to direct lookup when cache is nil",
pvcNamespace: "default",
pvcName: "pvc1",
buildCache: false,
useNilCache: true,
expectedPodCount: 2,
},
{
name: "falls back to direct lookup when cache is not built",
pvcNamespace: "default",
pvcName: "pvc1",
buildCache: false,
useNilCache: false,
expectedPodCount: 2,
},
{
name: "returns empty slice for non-existent PVC with cache",
pvcNamespace: "default",
pvcName: "non-existent",
buildCache: true,
useNilCache: false,
expectedPodCount: 0,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
var cache *PVCPodCache
if !tc.useNilCache {
cache = NewPVCPodCache()
if tc.buildCache {
err := cache.BuildCacheForNamespaces(t.Context(), []string{"default"}, fakeClient)
require.NoError(t, err)
}
}
pods, err := GetPodsUsingPVCWithCache(tc.pvcNamespace, tc.pvcName, fakeClient, cache)
require.NoError(t, err)
assert.Len(t, pods, tc.expectedPodCount, "unexpected number of pods")
})
}
}
func TestIsPVCDefaultToFSBackupWithCache(t *testing.T) {
objs := []runtime.Object{
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
Namespace: "default",
Annotations: map[string]string{
"backup.velero.io/backup-volumes": "vol1",
},
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc1",
},
},
},
},
},
},
&corev1api.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod2",
Namespace: "default",
},
Spec: corev1api.PodSpec{
Volumes: []corev1api.Volume{
{
Name: "vol1",
VolumeSource: corev1api.VolumeSource{
PersistentVolumeClaim: &corev1api.PersistentVolumeClaimVolumeSource{
ClaimName: "pvc2",
},
},
},
},
},
},
}
fakeClient := velerotest.NewFakeControllerRuntimeClient(t, objs...)
testCases := []struct {
name string
pvcNamespace string
pvcName string
defaultVolumesToFsBackup bool
buildCache bool
useNilCache bool
expectedResult bool
}{
{
name: "returns true for PVC with opt-in annotation using cache",
pvcNamespace: "default",
pvcName: "pvc1",
defaultVolumesToFsBackup: false,
buildCache: true,
useNilCache: false,
expectedResult: true,
},
{
name: "returns false for PVC without annotation using cache",
pvcNamespace: "default",
pvcName: "pvc2",
defaultVolumesToFsBackup: false,
buildCache: true,
useNilCache: false,
expectedResult: false,
},
{
name: "returns true for any PVC with defaultVolumesToFsBackup using cache",
pvcNamespace: "default",
pvcName: "pvc2",
defaultVolumesToFsBackup: true,
buildCache: true,
useNilCache: false,
expectedResult: true,
},
{
name: "falls back to direct lookup when cache is nil",
pvcNamespace: "default",
pvcName: "pvc1",
defaultVolumesToFsBackup: false,
buildCache: false,
useNilCache: true,
expectedResult: true,
},
{
name: "returns false for non-existent PVC",
pvcNamespace: "default",
pvcName: "non-existent",
defaultVolumesToFsBackup: false,
buildCache: true,
useNilCache: false,
expectedResult: false,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
var cache *PVCPodCache
if !tc.useNilCache {
cache = NewPVCPodCache()
if tc.buildCache {
err := cache.BuildCacheForNamespaces(t.Context(), []string{"default"}, fakeClient)
require.NoError(t, err)
}
}
result, err := IsPVCDefaultToFSBackupWithCache(tc.pvcNamespace, tc.pvcName, fakeClient, tc.defaultVolumesToFsBackup, cache)
require.NoError(t, err)
assert.Equal(t, tc.expectedResult, result)
})
}
}