From 5ba00dfb097de68c6312d7497c2ef2675a64d1a9 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Thu, 11 Sep 2025 15:51:46 -0700 Subject: [PATCH 01/11] Fix maintenance jobs toleration inheritance from Velero deployment Signed-off-by: Shubham Pampattiwar fix codespell and add changelog file Signed-off-by: Shubham Pampattiwar --- .../unreleased/9256-shubham-pampattiwar | 1 + pkg/repository/maintenance/maintenance.go | 40 ++- .../maintenance/maintenance_test.go | 288 ++++++++++++++++++ 3 files changed, 320 insertions(+), 9 deletions(-) create mode 100644 changelogs/unreleased/9256-shubham-pampattiwar diff --git a/changelogs/unreleased/9256-shubham-pampattiwar b/changelogs/unreleased/9256-shubham-pampattiwar new file mode 100644 index 000000000..2400fb6b6 --- /dev/null +++ b/changelogs/unreleased/9256-shubham-pampattiwar @@ -0,0 +1 @@ +Fix repository maintenance jobs to inherit allowlisted tolerations from Velero deployment diff --git a/pkg/repository/maintenance/maintenance.go b/pkg/repository/maintenance/maintenance.go index 16a94535f..e43918d4d 100644 --- a/pkg/repository/maintenance/maintenance.go +++ b/pkg/repository/maintenance/maintenance.go @@ -449,6 +449,35 @@ func StartNewJob( return maintenanceJob.Name, nil } +// buildTolerationsForMaintenanceJob builds the tolerations for maintenance jobs. +// It includes the required Windows toleration for backward compatibility and filters +// tolerations from the Velero deployment to only include those with keys that are +// in the ThirdPartyTolerations allowlist, following the same pattern as labels and annotations. +func buildTolerationsForMaintenanceJob(deployment *appsv1api.Deployment) []corev1api.Toleration { + // Start with the Windows toleration for backward compatibility + windowsToleration := corev1api.Toleration{ + Key: "os", + Operator: "Equal", + Effect: "NoSchedule", + Value: "windows", + } + result := []corev1api.Toleration{windowsToleration} + + // Filter tolerations from the Velero deployment to only include allowed ones + // Only tolerations that exist on the deployment AND have keys in the allowlist are inherited + deploymentTolerations := veleroutil.GetTolerationsFromVeleroServer(deployment) + for _, k := range util.ThirdPartyTolerations { + for _, toleration := range deploymentTolerations { + if toleration.Key == k { + result = append(result, toleration) + break // Only add the first matching toleration for each allowed key + } + } + } + + return result +} + func getPriorityClassName(ctx context.Context, cli client.Client, config *velerotypes.JobConfigs, logger logrus.FieldLogger) string { // Use the priority class name from the global job configuration if available // Note: Priority class is only read from global config, not per-repository @@ -593,15 +622,8 @@ func buildJob( SecurityContext: podSecurityContext, Volumes: volumes, ServiceAccountName: serviceAccount, - Tolerations: []corev1api.Toleration{ - { - Key: "os", - Operator: "Equal", - Effect: "NoSchedule", - Value: "windows", - }, - }, - ImagePullSecrets: imagePullSecrets, + Tolerations: buildTolerationsForMaintenanceJob(deployment), + ImagePullSecrets: imagePullSecrets, }, }, }, diff --git a/pkg/repository/maintenance/maintenance_test.go b/pkg/repository/maintenance/maintenance_test.go index 925a94043..93d8f9b2f 100644 --- a/pkg/repository/maintenance/maintenance_test.go +++ b/pkg/repository/maintenance/maintenance_test.go @@ -1481,3 +1481,291 @@ func TestBuildJobWithPriorityClassName(t *testing.T) { }) } } + +func TestBuildTolerationsForMaintenanceJob(t *testing.T) { + windowsToleration := corev1api.Toleration{ + Key: "os", + Operator: "Equal", + Effect: "NoSchedule", + Value: "windows", + } + + testCases := []struct { + name string + deploymentTolerations []corev1api.Toleration + expectedTolerations []corev1api.Toleration + }{ + { + name: "no tolerations should only include Windows toleration", + deploymentTolerations: nil, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + }, + }, + { + name: "empty tolerations should only include Windows toleration", + deploymentTolerations: []corev1api.Toleration{}, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + }, + }, + { + name: "non-allowed toleration should not be inherited", + deploymentTolerations: []corev1api.Toleration{ + { + Key: "vng-ondemand", + Operator: "Equal", + Effect: "NoSchedule", + Value: "amd64", + }, + }, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + }, + }, + { + name: "allowed toleration should be inherited", + deploymentTolerations: []corev1api.Toleration{ + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + }, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + }, + }, + { + name: "mixed allowed and non-allowed tolerations should only inherit allowed", + deploymentTolerations: []corev1api.Toleration{ + { + Key: "vng-ondemand", // not in allowlist + Operator: "Equal", + Effect: "NoSchedule", + Value: "amd64", + }, + { + Key: "CriticalAddonsOnly", // in allowlist + Operator: "Exists", + Effect: "NoSchedule", + }, + { + Key: "custom-key", // not in allowlist + Operator: "Equal", + Effect: "NoSchedule", + Value: "custom-value", + }, + }, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + }, + }, + { + name: "multiple allowed tolerations should all be inherited", + deploymentTolerations: []corev1api.Toleration{ + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + }, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create a deployment with the specified tolerations + deployment := &appsv1api.Deployment{ + Spec: appsv1api.DeploymentSpec{ + Template: corev1api.PodTemplateSpec{ + Spec: corev1api.PodSpec{ + Tolerations: tc.deploymentTolerations, + }, + }, + }, + } + + result := buildTolerationsForMaintenanceJob(deployment) + assert.Equal(t, tc.expectedTolerations, result) + }) + } +} + +func TestBuildJobWithTolerationsInheritance(t *testing.T) { + // Define allowed tolerations that would be set on Velero deployment + allowedTolerations := []corev1api.Toleration{ + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + } + + // Mixed tolerations (allowed and non-allowed) + mixedTolerations := []corev1api.Toleration{ + { + Key: "vng-ondemand", // not in allowlist + Operator: "Equal", + Effect: "NoSchedule", + Value: "amd64", + }, + { + Key: "CriticalAddonsOnly", // in allowlist + Operator: "Exists", + Effect: "NoSchedule", + }, + } + + // Windows toleration that should always be present + windowsToleration := corev1api.Toleration{ + Key: "os", + Operator: "Equal", + Effect: "NoSchedule", + Value: "windows", + } + + testCases := []struct { + name string + deploymentTolerations []corev1api.Toleration + expectedTolerations []corev1api.Toleration + }{ + { + name: "no tolerations on deployment should only have Windows toleration", + deploymentTolerations: nil, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + }, + }, + { + name: "allowed tolerations should be inherited along with Windows toleration", + deploymentTolerations: allowedTolerations, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + { + Key: "kubernetes.azure.com/scalesetpriority", + Operator: "Equal", + Effect: "NoSchedule", + Value: "spot", + }, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + }, + }, + { + name: "mixed tolerations should only inherit allowed ones", + deploymentTolerations: mixedTolerations, + expectedTolerations: []corev1api.Toleration{ + windowsToleration, + { + Key: "CriticalAddonsOnly", + Operator: "Exists", + Effect: "NoSchedule", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create a new scheme and add necessary API types + localScheme := runtime.NewScheme() + err := velerov1api.AddToScheme(localScheme) + require.NoError(t, err) + err = appsv1api.AddToScheme(localScheme) + require.NoError(t, err) + err = batchv1api.AddToScheme(localScheme) + require.NoError(t, err) + + // Create a deployment with the specified tolerations + deployment := &appsv1api.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "velero", + Namespace: "velero", + }, + Spec: appsv1api.DeploymentSpec{ + Template: corev1api.PodTemplateSpec{ + Spec: corev1api.PodSpec{ + Containers: []corev1api.Container{ + { + Name: "velero", + Image: "velero/velero:latest", + }, + }, + Tolerations: tc.deploymentTolerations, + }, + }, + }, + } + + // Create a backup repository + repo := &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-repo", + Namespace: "velero", + }, + Spec: velerov1api.BackupRepositorySpec{ + VolumeNamespace: "velero", + BackupStorageLocation: "default", + }, + } + + // Create fake client and add the deployment + client := fake.NewClientBuilder().WithScheme(localScheme).WithObjects(deployment).Build() + + // Create minimal job configs and resources + jobConfig := &velerotypes.JobConfigs{} + logLevel := logrus.InfoLevel + logFormat := logging.NewFormatFlag() + logFormat.Set("text") + + // Call buildJob + job, err := buildJob(client, t.Context(), repo, "default", jobConfig, logLevel, logFormat, logrus.New()) + require.NoError(t, err) + + // Verify the tolerations are set correctly + assert.Equal(t, tc.expectedTolerations, job.Spec.Template.Spec.Tolerations) + }) + } +} From 59289fba76ad39096543c79bf493ed551f795a10 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Mon, 15 Sep 2025 16:01:33 -0700 Subject: [PATCH 02/11] Fix Schedule Backup Queue Accumulation During Extended Blocking Scenarios Signed-off-by: Shubham Pampattiwar --- pkg/controller/schedule_controller.go | 2 +- pkg/controller/schedule_controller_test.go | 26 ++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pkg/controller/schedule_controller.go b/pkg/controller/schedule_controller.go index 799a8c77a..ec8894571 100644 --- a/pkg/controller/schedule_controller.go +++ b/pkg/controller/schedule_controller.go @@ -229,7 +229,7 @@ func (c *scheduleReconciler) checkIfBackupInNewOrProgress(schedule *velerov1.Sch } for _, backup := range backupList.Items { - if backup.Status.Phase == velerov1.BackupPhaseNew || backup.Status.Phase == velerov1.BackupPhaseInProgress { + if backup.Status.Phase == "" || backup.Status.Phase == velerov1.BackupPhaseNew || backup.Status.Phase == velerov1.BackupPhaseInProgress { log.Debugf("%s/%s still has backups that are in InProgress or New...", schedule.Namespace, schedule.Name) return true } diff --git a/pkg/controller/schedule_controller_test.go b/pkg/controller/schedule_controller_test.go index ab0a3f66d..f4585763c 100644 --- a/pkg/controller/schedule_controller_test.go +++ b/pkg/controller/schedule_controller_test.go @@ -149,6 +149,13 @@ func TestReconcileOfSchedule(t *testing.T) { expectedPhase: string(velerov1.SchedulePhaseEnabled), backup: builder.ForBackup("ns", "name-20220905120000").ObjectMeta(builder.WithLabels(velerov1.ScheduleNameLabel, "name")).Phase(velerov1.BackupPhaseNew).Result(), }, + { + name: "schedule already has backup with empty phase (not yet reconciled).", + schedule: newScheduleBuilder(velerov1.SchedulePhaseEnabled).CronSchedule("@every 5m").LastBackupTime("2000-01-01 00:00:00").Result(), + fakeClockTime: "2017-01-01 12:00:00", + expectedPhase: string(velerov1.SchedulePhaseEnabled), + backup: builder.ForBackup("ns", "name-20220905120000").ObjectMeta(builder.WithLabels(velerov1.ScheduleNameLabel, "name")).Phase("").Result(), + }, } for _, test := range tests { @@ -215,10 +222,10 @@ func TestReconcileOfSchedule(t *testing.T) { backups := &velerov1.BackupList{} require.NoError(t, client.List(ctx, backups)) - // If backup associated with schedule's status is in New or InProgress, + // If backup associated with schedule's status is in New or InProgress or empty phase, // new backup shouldn't be submitted. if test.backup != nil && - (test.backup.Status.Phase == velerov1.BackupPhaseNew || test.backup.Status.Phase == velerov1.BackupPhaseInProgress) { + (test.backup.Status.Phase == "" || test.backup.Status.Phase == velerov1.BackupPhaseNew || test.backup.Status.Phase == velerov1.BackupPhaseInProgress) { assert.Len(t, backups.Items, 1) require.NoError(t, client.Delete(ctx, test.backup)) } @@ -479,4 +486,19 @@ func TestCheckIfBackupInNewOrProgress(t *testing.T) { reconciler = NewScheduleReconciler("namespace", logger, client, metrics.NewServerMetrics(), false) result = reconciler.checkIfBackupInNewOrProgress(testSchedule) assert.True(t, result) + + // Clean backup in InProgress phase. + err = client.Delete(ctx, inProgressBackup) + require.NoError(t, err, "fail to delete backup in InProgress phase in TestCheckIfBackupInNewOrProgress: %v", err) + + // Create backup with empty phase (not yet reconciled). + emptyPhaseBackup := builder.ForBackup("ns", "backup-3"). + ObjectMeta(builder.WithLabels(velerov1.ScheduleNameLabel, "name")). + Phase("").Result() + err = client.Create(ctx, emptyPhaseBackup) + require.NoError(t, err, "fail to create backup with empty phase in TestCheckIfBackupInNewOrProgress: %v", err) + + reconciler = NewScheduleReconciler("namespace", logger, client, metrics.NewServerMetrics(), false) + result = reconciler.checkIfBackupInNewOrProgress(testSchedule) + assert.True(t, result) } From a7b2985c8396c5db9a259e62abcd47acc4da7343 Mon Sep 17 00:00:00 2001 From: Shubham Pampattiwar Date: Mon, 15 Sep 2025 16:07:40 -0700 Subject: [PATCH 03/11] add changelog file Signed-off-by: Shubham Pampattiwar --- changelogs/unreleased/9264-shubham-pampattiwar | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9264-shubham-pampattiwar diff --git a/changelogs/unreleased/9264-shubham-pampattiwar b/changelogs/unreleased/9264-shubham-pampattiwar new file mode 100644 index 000000000..711ea4b57 --- /dev/null +++ b/changelogs/unreleased/9264-shubham-pampattiwar @@ -0,0 +1 @@ +Fix schedule controller to prevent backup queue accumulation during extended blocking scenarios by properly handling empty backup phases \ No newline at end of file From 9b6c4b1d471aa0d6205feb617321b8badfb79e4e Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Wed, 17 Sep 2025 16:36:41 -0400 Subject: [PATCH 04/11] Fix E2E tests: Build MinIO from Bitnami Dockerfile to replace deprecated image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Bitnami MinIO image bitnami/minio:2021.6.17-debian-10-r7 is no longer available on Docker Hub, causing E2E tests to fail. This change implements a solution to build the MinIO image locally from Bitnami's public Dockerfile and cache it for subsequent runs: - Fetches the latest commit hash of the Bitnami MinIO Dockerfile - Uses GitHub Actions cache to store/retrieve built images - Only rebuilds when the upstream Dockerfile changes - Maintains compatibility with existing environment variables Fixes #9279 🤖 Generated with [Claude Code](https://claude.ai/code) Update .github/workflows/e2e-test-kind.yaml Signed-off-by: Tiger Kaovilai Co-Authored-By: Claude Signed-off-by: Tiger Kaovilai --- .github/workflows/e2e-test-kind.yaml | 37 ++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-kind.yaml b/.github/workflows/e2e-test-kind.yaml index 65517804c..662fe0488 100644 --- a/.github/workflows/e2e-test-kind.yaml +++ b/.github/workflows/e2e-test-kind.yaml @@ -11,6 +11,8 @@ jobs: # Build the Velero CLI and image once for all Kubernetes versions, and cache it so the fan-out workers can get it. build: runs-on: ubuntu-latest + outputs: + minio-dockerfile-sha: ${{ steps.minio-version.outputs.dockerfile_sha }} steps: - name: Check out the code uses: actions/checkout@v5 @@ -44,6 +46,26 @@ jobs: run: | IMAGE=velero VERSION=pr-test BUILD_OUTPUT_TYPE=docker make container docker save velero:pr-test-linux-amd64 -o ./velero.tar + # Check and build MinIO image once for all e2e tests + - name: Check Bitnami MinIO Dockerfile version + id: minio-version + run: | + DOCKERFILE_SHA=$(curl -s https://api.github.com/repos/bitnami/containers/commits?path=bitnami/minio/2025/debian-12/Dockerfile\&per_page=1 | jq -r '.[0].sha') + echo "dockerfile_sha=${DOCKERFILE_SHA}" >> $GITHUB_OUTPUT + - name: Cache MinIO Image + uses: actions/cache@v4 + id: minio-cache + with: + path: ./minio-image.tar + key: minio-bitnami-${{ steps.minio-version.outputs.dockerfile_sha }} + - name: Build MinIO Image from Bitnami Dockerfile + if: steps.minio-cache.outputs.cache-hit != 'true' + run: | + echo "Building MinIO image from Bitnami Dockerfile..." + git clone --depth 1 https://github.com/bitnami/containers.git /tmp/bitnami-containers + cd /tmp/bitnami-containers/bitnami/minio/2025/debian-12 + docker build -t bitnami/minio:local . + docker save bitnami/minio:local > ${{ github.workspace }}/minio-image.tar # Create json of k8s versions to test # from guide: https://stackoverflow.com/a/65094398/4590470 setup-test-matrix: @@ -86,9 +108,20 @@ jobs: uses: actions/setup-go@v5 with: go-version-file: 'go.mod' + # Fetch the pre-built MinIO image from the build job + - name: Fetch built MinIO Image + uses: actions/cache@v4 + id: minio-cache + with: + path: ./minio-image.tar + key: minio-bitnami-${{ needs.build.outputs.minio-dockerfile-sha }} + - name: Load MinIO Image + run: | + echo "Loading MinIO image..." + docker load < ./minio-image.tar - name: Install MinIO - run: - docker run -d --rm -p 9000:9000 -e "MINIO_ACCESS_KEY=minio" -e "MINIO_SECRET_KEY=minio123" -e "MINIO_DEFAULT_BUCKETS=bucket,additional-bucket" bitnami/minio:2021.6.17-debian-10-r7 + run: | + docker run -d --rm -p 9000:9000 -e "MINIO_ROOT_USER=minio" -e "MINIO_ROOT_PASSWORD=minio123" -e "MINIO_DEFAULT_BUCKETS=bucket,additional-bucket" bitnami/minio:local - uses: engineerd/setup-kind@v0.6.2 with: skipClusterLogsExport: true From e21b21c19ec882df38111fc07c146385124bab19 Mon Sep 17 00:00:00 2001 From: 0xLeo258 Date: Thu, 18 Sep 2025 17:21:25 +0800 Subject: [PATCH 05/11] fix 9234: Add safe VolumeSnapshotterCache Signed-off-by: 0xLeo258 --- pkg/backup/backup.go | 2 +- pkg/backup/item_backupper.go | 36 +++------------------- pkg/backup/volume_snapshotter_cache.go | 42 ++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 32 deletions(-) create mode 100644 pkg/backup/volume_snapshotter_cache.go diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index a11acd52a..824c44875 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -366,7 +366,7 @@ func (kb *kubernetesBackupper) BackupWithResolvers( discoveryHelper: kb.discoveryHelper, podVolumeBackupper: podVolumeBackupper, podVolumeSnapshotTracker: podvolume.NewTracker(), - volumeSnapshotterGetter: volumeSnapshotterGetter, + volumeSnapshotterCache: NewVolumeSnapshotterCache(volumeSnapshotterGetter), itemHookHandler: &hook.DefaultItemHookHandler{ PodCommandExecutor: kb.podCommandExecutor, }, diff --git a/pkg/backup/item_backupper.go b/pkg/backup/item_backupper.go index b890b23c3..b0e5fadfa 100644 --- a/pkg/backup/item_backupper.go +++ b/pkg/backup/item_backupper.go @@ -70,13 +70,11 @@ type itemBackupper struct { discoveryHelper discovery.Helper podVolumeBackupper podvolume.Backupper podVolumeSnapshotTracker *podvolume.Tracker - volumeSnapshotterGetter VolumeSnapshotterGetter kubernetesBackupper *kubernetesBackupper - - itemHookHandler hook.ItemHookHandler - snapshotLocationVolumeSnapshotters map[string]vsv1.VolumeSnapshotter - hookTracker *hook.HookTracker - volumeHelperImpl volumehelper.VolumeHelper + volumeSnapshotterCache *VolumeSnapshotterCache + itemHookHandler hook.ItemHookHandler + hookTracker *hook.HookTracker + volumeHelperImpl volumehelper.VolumeHelper } type FileForArchive struct { @@ -502,30 +500,6 @@ func (ib *itemBackupper) executeActions( return obj, itemFiles, nil } -// volumeSnapshotter instantiates and initializes a VolumeSnapshotter given a VolumeSnapshotLocation, -// or returns an existing one if one's already been initialized for the location. -func (ib *itemBackupper) volumeSnapshotter(snapshotLocation *velerov1api.VolumeSnapshotLocation) (vsv1.VolumeSnapshotter, error) { - if bs, ok := ib.snapshotLocationVolumeSnapshotters[snapshotLocation.Name]; ok { - return bs, nil - } - - bs, err := ib.volumeSnapshotterGetter.GetVolumeSnapshotter(snapshotLocation.Spec.Provider) - if err != nil { - return nil, err - } - - if err := bs.Init(snapshotLocation.Spec.Config); err != nil { - return nil, err - } - - if ib.snapshotLocationVolumeSnapshotters == nil { - ib.snapshotLocationVolumeSnapshotters = make(map[string]vsv1.VolumeSnapshotter) - } - ib.snapshotLocationVolumeSnapshotters[snapshotLocation.Name] = bs - - return bs, nil -} - // zoneLabelDeprecated is the label that stores availability-zone info // on PVs this is deprecated on Kubernetes >= 1.17.0 // zoneLabel is the label that stores availability-zone info @@ -641,7 +615,7 @@ func (ib *itemBackupper) takePVSnapshot(obj runtime.Unstructured, log logrus.Fie for _, snapshotLocation := range ib.backupRequest.SnapshotLocations { log := log.WithField("volumeSnapshotLocation", snapshotLocation.Name) - bs, err := ib.volumeSnapshotter(snapshotLocation) + bs, err := ib.volumeSnapshotterCache.SetNX(snapshotLocation) if err != nil { log.WithError(err).Error("Error getting volume snapshotter for volume snapshot location") continue diff --git a/pkg/backup/volume_snapshotter_cache.go b/pkg/backup/volume_snapshotter_cache.go new file mode 100644 index 000000000..620ebc337 --- /dev/null +++ b/pkg/backup/volume_snapshotter_cache.go @@ -0,0 +1,42 @@ +package backup + +import ( + "sync" + + velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + vsv1 "github.com/vmware-tanzu/velero/pkg/plugin/velero/volumesnapshotter/v1" +) + +type VolumeSnapshotterCache struct { + cache map[string]vsv1.VolumeSnapshotter + mutex sync.Mutex + getter VolumeSnapshotterGetter +} + +func NewVolumeSnapshotterCache(getter VolumeSnapshotterGetter) *VolumeSnapshotterCache { + return &VolumeSnapshotterCache{ + cache: make(map[string]vsv1.VolumeSnapshotter), + getter: getter, + } +} + +func (c *VolumeSnapshotterCache) SetNX(location *velerov1api.VolumeSnapshotLocation) (vsv1.VolumeSnapshotter, error) { + c.mutex.Lock() + defer c.mutex.Unlock() + + if snapshotter, exists := c.cache[location.Name]; exists { + return snapshotter, nil + } + + snapshotter, err := c.getter.GetVolumeSnapshotter(location.Spec.Provider) + if err != nil { + return nil, err + } + + if err := snapshotter.Init(location.Spec.Config); err != nil { + return nil, err + } + + c.cache[location.Name] = snapshotter + return snapshotter, nil +} From 25de1bb3b6bf646e233ad401ee498c7f6d92287a Mon Sep 17 00:00:00 2001 From: 0xLeo258 Date: Thu, 18 Sep 2025 17:36:07 +0800 Subject: [PATCH 06/11] add changelog Signed-off-by: 0xLeo258 --- changelogs/unreleased/9281-0xLeo258 | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9281-0xLeo258 diff --git a/changelogs/unreleased/9281-0xLeo258 b/changelogs/unreleased/9281-0xLeo258 new file mode 100644 index 000000000..eb5bf3f5d --- /dev/null +++ b/changelogs/unreleased/9281-0xLeo258 @@ -0,0 +1 @@ +Implement concurrency control for cache of native VolumeSnapshotter plugin. From 1ec281a64eae4dbd37c7ddf5f6b0ba4a9f5678da Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:29:45 -0400 Subject: [PATCH 07/11] Bump actions/setup-go from 5 to 6 (#9231) Bumps [actions/setup-go](https://github.com/actions/setup-go) from 5 to 6. - [Release notes](https://github.com/actions/setup-go/releases) - [Commits](https://github.com/actions/setup-go/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/setup-go dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/e2e-test-kind.yaml | 4 ++-- .github/workflows/pr-ci-check.yml | 2 +- .github/workflows/pr-linter-check.yml | 2 +- .github/workflows/push.yml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-test-kind.yaml b/.github/workflows/e2e-test-kind.yaml index 662fe0488..df092d5d3 100644 --- a/.github/workflows/e2e-test-kind.yaml +++ b/.github/workflows/e2e-test-kind.yaml @@ -17,7 +17,7 @@ jobs: - name: Check out the code uses: actions/checkout@v5 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' # Look for a CLI that's made for this PR @@ -105,7 +105,7 @@ jobs: - name: Check out the code uses: actions/checkout@v5 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' # Fetch the pre-built MinIO image from the build job diff --git a/.github/workflows/pr-ci-check.yml b/.github/workflows/pr-ci-check.yml index 0a394560a..84c63047f 100644 --- a/.github/workflows/pr-ci-check.yml +++ b/.github/workflows/pr-ci-check.yml @@ -10,7 +10,7 @@ jobs: - name: Check out the code uses: actions/checkout@v5 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - name: Make ci diff --git a/.github/workflows/pr-linter-check.yml b/.github/workflows/pr-linter-check.yml index 997466ccf..586acc5fd 100644 --- a/.github/workflows/pr-linter-check.yml +++ b/.github/workflows/pr-linter-check.yml @@ -14,7 +14,7 @@ jobs: - name: Check out the code uses: actions/checkout@v5 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - name: Linter check diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8dee8799a..99f42a498 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -17,7 +17,7 @@ jobs: - name: Check out the code uses: actions/checkout@v5 - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - name: Set up QEMU From 4847eeaf62c9359038eb4edc26b6be5cd3c61f31 Mon Sep 17 00:00:00 2001 From: Xun Jiang Date: Thu, 11 Sep 2025 16:58:33 +0800 Subject: [PATCH 08/11] Use different go version check logic for main and other branches. main branch will read go version from go.mod's go primitive, and only keep major and minor version, because we want the actions to use the lastest patch version automatically, even the go.mod specify version like 1.24.0. release branch can read the go version from go.mod file by setup-go action's own logic. Refactor the get Go version to reusable workflow. Signed-off-by: Xun Jiang --- .github/workflows/e2e-test-kind.yaml | 19 +++++++++++---- .github/workflows/get-go-version.yaml | 33 +++++++++++++++++++++++++++ .github/workflows/pr-ci-check.yml | 12 ++++++++-- .github/workflows/pr-linter-check.yml | 12 ++++++++-- .github/workflows/push.yml | 11 +++++++-- 5 files changed, 77 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/get-go-version.yaml diff --git a/.github/workflows/e2e-test-kind.yaml b/.github/workflows/e2e-test-kind.yaml index df092d5d3..47979a27c 100644 --- a/.github/workflows/e2e-test-kind.yaml +++ b/.github/workflows/e2e-test-kind.yaml @@ -8,18 +8,26 @@ on: - "design/**" - "**/*.md" jobs: + get-go-version: + uses: ./.github/workflows/get-go-version.yaml + with: + ref: ${{ github.event.pull_request.base.ref }} + # Build the Velero CLI and image once for all Kubernetes versions, and cache it so the fan-out workers can get it. build: runs-on: ubuntu-latest + needs: get-go-version outputs: minio-dockerfile-sha: ${{ steps.minio-version.outputs.dockerfile_sha }} steps: - name: Check out the code uses: actions/checkout@v5 - - name: Set up Go + + - name: Set up Go version uses: actions/setup-go@v6 with: - go-version-file: 'go.mod' + go-version: ${{ needs.get-go-version.outputs.version }} + # Look for a CLI that's made for this PR - name: Fetch built CLI id: cli-cache @@ -97,6 +105,7 @@ jobs: needs: - build - setup-test-matrix + - get-go-version runs-on: ubuntu-latest strategy: matrix: ${{fromJson(needs.setup-test-matrix.outputs.matrix)}} @@ -104,10 +113,12 @@ jobs: steps: - name: Check out the code uses: actions/checkout@v5 - - name: Set up Go + + - name: Set up Go version uses: actions/setup-go@v6 with: - go-version-file: 'go.mod' + go-version: ${{ needs.get-go-version.outputs.version }} + # Fetch the pre-built MinIO image from the build job - name: Fetch built MinIO Image uses: actions/cache@v4 diff --git a/.github/workflows/get-go-version.yaml b/.github/workflows/get-go-version.yaml new file mode 100644 index 000000000..3bc3f8e53 --- /dev/null +++ b/.github/workflows/get-go-version.yaml @@ -0,0 +1,33 @@ +on: + workflow_call: + inputs: + ref: + description: "The target branch's ref" + required: true + type: string + outputs: + version: + description: "The expected Go version" + value: ${{ jobs.extract.outputs.version }} + +jobs: + extract: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.pick-version.outputs.version }} + steps: + - name: Check out the code + uses: actions/checkout@v5 + + - id: pick-version + run: | + if [ "${{ inputs.ref }}" == "main" ]; then + version=$(grep '^go ' go.mod | awk '{print $2}' | cut -d. -f1-2) + else + goDirectiveVersion=$(grep '^go ' go.mod | awk '{print $2}') + toolChainVersion=$(grep '^toolchain ' go.mod | awk '{print $2}') + version=$(printf "%s\n%s\n" "$goDirectiveVersion" "$toolChainVersion" | sort -V | tail -n1) + fi + + echo "version=$version" + echo "version=$version" >> $GITHUB_OUTPUT diff --git a/.github/workflows/pr-ci-check.yml b/.github/workflows/pr-ci-check.yml index 84c63047f..c97f216b4 100644 --- a/.github/workflows/pr-ci-check.yml +++ b/.github/workflows/pr-ci-check.yml @@ -1,18 +1,26 @@ name: Pull Request CI Check on: [pull_request] jobs: + get-go-version: + uses: ./.github/workflows/get-go-version.yaml + with: + ref: ${{ github.event.pull_request.base.ref }} + build: name: Run CI + needs: get-go-version runs-on: ubuntu-latest strategy: fail-fast: false steps: - name: Check out the code uses: actions/checkout@v5 - - name: Set up Go + + - name: Set up Go version uses: actions/setup-go@v6 with: - go-version-file: 'go.mod' + go-version: ${{ needs.get-go-version.outputs.version }} + - name: Make ci run: make ci - name: Upload test coverage diff --git a/.github/workflows/pr-linter-check.yml b/.github/workflows/pr-linter-check.yml index 586acc5fd..81a2bdd64 100644 --- a/.github/workflows/pr-linter-check.yml +++ b/.github/workflows/pr-linter-check.yml @@ -7,16 +7,24 @@ on: - "design/**" - "**/*.md" jobs: + get-go-version: + uses: ./.github/workflows/get-go-version.yaml + with: + ref: ${{ github.event.pull_request.base.ref }} + build: name: Run Linter Check runs-on: ubuntu-latest + needs: get-go-version steps: - name: Check out the code uses: actions/checkout@v5 - - name: Set up Go + + - name: Set up Go version uses: actions/setup-go@v6 with: - go-version-file: 'go.mod' + go-version: ${{ needs.get-go-version.outputs.version }} + - name: Linter check uses: golangci/golangci-lint-action@v8 with: diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 99f42a498..8e1bc1219 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -9,17 +9,24 @@ on: - '*' jobs: + get-go-version: + uses: ./.github/workflows/get-go-version.yaml + with: + ref: ${ github.ref } build: name: Build runs-on: ubuntu-latest + needs: get-go-version steps: - name: Check out the code uses: actions/checkout@v5 - - name: Set up Go + + - name: Set up Go version uses: actions/setup-go@v6 with: - go-version-file: 'go.mod' + go-version: ${{ needs.get-go-version.outputs.version }} + - name: Set up QEMU id: qemu uses: docker/setup-qemu-action@v3 From f2a27c3864606e82f98b2ea0058598f179aec54f Mon Sep 17 00:00:00 2001 From: 0xLeo258 Date: Thu, 11 Sep 2025 14:45:39 +0800 Subject: [PATCH 09/11] fix9247: Protect VolumeSnapshot field Signed-off-by: 0xLeo258 --- pkg/backup/item_backupper.go | 2 ++ pkg/backup/request.go | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/backup/item_backupper.go b/pkg/backup/item_backupper.go index b890b23c3..7b7f5cf62 100644 --- a/pkg/backup/item_backupper.go +++ b/pkg/backup/item_backupper.go @@ -699,6 +699,8 @@ func (ib *itemBackupper) takePVSnapshot(obj runtime.Unstructured, log logrus.Fie snapshot.Status.Phase = volume.SnapshotPhaseCompleted snapshot.Status.ProviderSnapshotID = snapshotID } + ib.backupRequest.requestLock.Lock() + defer ib.backupRequest.requestLock.Unlock() ib.backupRequest.VolumeSnapshots = append(ib.backupRequest.VolumeSnapshots, snapshot) // nil errors are automatically removed diff --git a/pkg/backup/request.go b/pkg/backup/request.go index 3ec05ee04..b81bde74d 100644 --- a/pkg/backup/request.go +++ b/pkg/backup/request.go @@ -17,6 +17,8 @@ limitations under the License. package backup import ( + "sync" + "github.com/vmware-tanzu/velero/internal/hook" "github.com/vmware-tanzu/velero/internal/resourcepolicies" "github.com/vmware-tanzu/velero/internal/volume" @@ -36,7 +38,7 @@ type itemKey struct { // materialized (e.g. backup/snapshot locations, includes/excludes, etc.) type Request struct { *velerov1api.Backup - + requestLock sync.Mutex StorageLocation *velerov1api.BackupStorageLocation SnapshotLocations []*velerov1api.VolumeSnapshotLocation NamespaceIncludesExcludes *collections.IncludesExcludes From 9df17eb02b621722a4000481e67634344aa12fda Mon Sep 17 00:00:00 2001 From: 0xLeo258 Date: Thu, 11 Sep 2025 14:45:39 +0800 Subject: [PATCH 10/11] add changelog Signed-off-by: 0xLeo258 --- changelogs/unreleased/9248-0xLeo258 | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9248-0xLeo258 diff --git a/changelogs/unreleased/9248-0xLeo258 b/changelogs/unreleased/9248-0xLeo258 new file mode 100644 index 000000000..8332dcade --- /dev/null +++ b/changelogs/unreleased/9248-0xLeo258 @@ -0,0 +1 @@ +Protect VolumeSnapshot field from race condition during multi-thread backup From 1ebe357d189c6184a7a5d8f7ca30eb270aa48555 Mon Sep 17 00:00:00 2001 From: 0xLeo258 Date: Tue, 16 Sep 2025 13:25:54 +0800 Subject: [PATCH 11/11] Add built-in mutex for SynchronizedVSList && Update unit tests Signed-off-by: 0xLeo258 --- pkg/backup/backup_test.go | 4 ++-- pkg/backup/item_backupper.go | 4 +--- pkg/backup/request.go | 22 +++++++++++++++++++--- pkg/controller/backup_controller.go | 6 +++--- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pkg/backup/backup_test.go b/pkg/backup/backup_test.go index 1cf80fe0f..eb6c2d9d4 100644 --- a/pkg/backup/backup_test.go +++ b/pkg/backup/backup_test.go @@ -3269,7 +3269,7 @@ func TestBackupWithSnapshots(t *testing.T) { err := h.backupper.Backup(h.log, tc.req, backupFile, nil, nil, tc.snapshotterGetter) require.NoError(t, err) - assert.Equal(t, tc.want, tc.req.VolumeSnapshots) + assert.Equal(t, tc.want, tc.req.VolumeSnapshots.Get()) }) } } @@ -4213,7 +4213,7 @@ func TestBackupWithPodVolume(t *testing.T) { assert.Equal(t, tc.want, req.PodVolumeBackups) // this assumes that we don't have any test cases where some PVs should be snapshotted using a VolumeSnapshotter - assert.Nil(t, req.VolumeSnapshots) + assert.Nil(t, req.VolumeSnapshots.Get()) }) } } diff --git a/pkg/backup/item_backupper.go b/pkg/backup/item_backupper.go index 7b7f5cf62..068ea2315 100644 --- a/pkg/backup/item_backupper.go +++ b/pkg/backup/item_backupper.go @@ -699,9 +699,7 @@ func (ib *itemBackupper) takePVSnapshot(obj runtime.Unstructured, log logrus.Fie snapshot.Status.Phase = volume.SnapshotPhaseCompleted snapshot.Status.ProviderSnapshotID = snapshotID } - ib.backupRequest.requestLock.Lock() - defer ib.backupRequest.requestLock.Unlock() - ib.backupRequest.VolumeSnapshots = append(ib.backupRequest.VolumeSnapshots, snapshot) + ib.backupRequest.VolumeSnapshots.Add(snapshot) // nil errors are automatically removed return kubeerrs.NewAggregate(errs) diff --git a/pkg/backup/request.go b/pkg/backup/request.go index b81bde74d..c3dae48a6 100644 --- a/pkg/backup/request.go +++ b/pkg/backup/request.go @@ -34,11 +34,27 @@ type itemKey struct { name string } +type SynchronizedVSList struct { + sync.Mutex + VolumeSnapshotList []*volume.Snapshot +} + +func (s *SynchronizedVSList) Add(vs *volume.Snapshot) { + s.Lock() + defer s.Unlock() + s.VolumeSnapshotList = append(s.VolumeSnapshotList, vs) +} + +func (s *SynchronizedVSList) Get() []*volume.Snapshot { + s.Lock() + defer s.Unlock() + return s.VolumeSnapshotList +} + // Request is a request for a backup, with all references to other objects // materialized (e.g. backup/snapshot locations, includes/excludes, etc.) type Request struct { *velerov1api.Backup - requestLock sync.Mutex StorageLocation *velerov1api.BackupStorageLocation SnapshotLocations []*velerov1api.VolumeSnapshotLocation NamespaceIncludesExcludes *collections.IncludesExcludes @@ -46,7 +62,7 @@ type Request struct { ResourceHooks []hook.ResourceHook ResolvedActions []framework.BackupItemResolvedActionV2 ResolvedItemBlockActions []framework.ItemBlockResolvedAction - VolumeSnapshots []*volume.Snapshot + VolumeSnapshots SynchronizedVSList PodVolumeBackups []*velerov1api.PodVolumeBackup BackedUpItems *backedUpItemsMap itemOperationsList *[]*itemoperation.BackupOperation @@ -82,7 +98,7 @@ func (r *Request) FillVolumesInformation() { } r.VolumesInformation.SkippedPVs = skippedPVMap - r.VolumesInformation.NativeSnapshots = r.VolumeSnapshots + r.VolumesInformation.NativeSnapshots = r.VolumeSnapshots.Get() r.VolumesInformation.PodVolumeBackups = r.PodVolumeBackups r.VolumesInformation.BackupOperations = *r.GetItemOperationsList() r.VolumesInformation.BackupName = r.Backup.Name diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index 532a5f332..c9728ea82 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -734,8 +734,8 @@ func (b *backupReconciler) runBackup(backup *pkgbackup.Request) error { // native snapshots phase will either be failed or completed right away // https://github.com/vmware-tanzu/velero/blob/de3ea52f0cc478e99efa7b9524c7f353514261a4/pkg/backup/item_backupper.go#L632-L639 - backup.Status.VolumeSnapshotsAttempted = len(backup.VolumeSnapshots) - for _, snap := range backup.VolumeSnapshots { + backup.Status.VolumeSnapshotsAttempted = len(backup.VolumeSnapshots.Get()) + for _, snap := range backup.VolumeSnapshots.Get() { if snap.Status.Phase == volume.SnapshotPhaseCompleted { backup.Status.VolumeSnapshotsCompleted++ } @@ -882,7 +882,7 @@ func persistBackup(backup *pkgbackup.Request, } // Velero-native volume snapshots (as opposed to CSI ones) - nativeVolumeSnapshots, errs := encode.ToJSONGzip(backup.VolumeSnapshots, "native volumesnapshots list") + nativeVolumeSnapshots, errs := encode.ToJSONGzip(backup.VolumeSnapshots.Get(), "native volumesnapshots list") if errs != nil { persistErrs = append(persistErrs, errs...) }