From d8d66381e779b009b0fa028c8e8f56080c5c8993 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Sun, 8 Oct 2023 18:27:58 +0800 Subject: [PATCH] issue 6734: spread backup pod evenly Signed-off-by: Lyndon-Li --- changelogs/CHANGELOG-1.11.md | 2 +- changelogs/CHANGELOG-1.8.md | 2 +- changelogs/unreleased/6926-Lyndon-Li | 1 + design/Implemented/delete-item-action.md | 2 +- pkg/controller/backup_controller.go | 8 ++++---- pkg/controller/backup_deletion_controller.go | 2 +- pkg/exposer/csi_snapshot.go | 18 ++++++++++++++++++ pkg/exposer/types.go | 2 ++ .../docs/main/csi-snapshot-data-movement.md | 2 +- site/content/docs/v0.5.0/faq.md | 2 +- site/content/docs/v0.6.0/faq.md | 2 +- site/content/docs/v0.7.0/faq.md | 2 +- site/content/docs/v0.7.1/faq.md | 2 +- site/content/docs/v0.8.0/faq.md | 2 +- site/content/docs/v0.8.1/faq.md | 2 +- .../docs/v1.12/csi-snapshot-data-movement.md | 2 +- site/content/docs/v1.5/restic.md | 2 +- test/e2e/backups/ttl.go | 2 +- test/e2e/bsl-mgmt/deletion.go | 2 +- 19 files changed, 40 insertions(+), 19 deletions(-) create mode 100644 changelogs/unreleased/6926-Lyndon-Li diff --git a/changelogs/CHANGELOG-1.11.md b/changelogs/CHANGELOG-1.11.md index 9846b7560..75d08d4df 100644 --- a/changelogs/CHANGELOG-1.11.md +++ b/changelogs/CHANGELOG-1.11.md @@ -100,7 +100,7 @@ To fix CVEs and keep pace with Golang, Velero made changes as follows: * Enable staticcheck linter. (#5788, @blackpiglet) * Set Kopia IgnoreUnknownTypes in ErrorHandlingPolicy to True for ignoring backup unknown file type (#5786, @qiuming-best) * Bump up Restic version to 0.15.0 (#5784, @qiuming-best) -* Add File system backup related matrics to Grafana dashboard +* Add File system backup related metrics to Grafana dashboard - Add metrics backup_warning_total for record of total warnings - Add metrics backup_last_status for record of last status of the backup (#5779, @allenxu404) * Design for Handling backup of volumes by resources filters (#5773, @qiuming-best) diff --git a/changelogs/CHANGELOG-1.8.md b/changelogs/CHANGELOG-1.8.md index 7c8f01946..e317849d1 100644 --- a/changelogs/CHANGELOG-1.8.md +++ b/changelogs/CHANGELOG-1.8.md @@ -61,7 +61,7 @@ in progress for 1.9. * Add rbac and annotation test cases (#4455, @mqiu) * remove --crds-version in velero install command. (#4446, @jxun) * Upgrade e2e test vsphere plugin (#4440, @mqiu) -* Fix e2e test failures for the inappropriate optimaze of velero install (#4438, @mqiu) +* Fix e2e test failures for the inappropriate optimize of velero install (#4438, @mqiu) * Limit backup namespaces on test resource filtering cases (#4437, @mqiu) * Bump up Go to 1.17 (#4431, @reasonerjt) * Added ``-itemsnapshots.json.gz to the backup format. This file exists diff --git a/changelogs/unreleased/6926-Lyndon-Li b/changelogs/unreleased/6926-Lyndon-Li new file mode 100644 index 000000000..b14c4f3cc --- /dev/null +++ b/changelogs/unreleased/6926-Lyndon-Li @@ -0,0 +1 @@ +Partially fix #6734, guide Kubernetes' scheduler to spread backup pods evenly across nodes as much as possible, so that data mover backup could achieve better parallelism \ No newline at end of file diff --git a/design/Implemented/delete-item-action.md b/design/Implemented/delete-item-action.md index 80baf685c..799173ff3 100644 --- a/design/Implemented/delete-item-action.md +++ b/design/Implemented/delete-item-action.md @@ -175,7 +175,7 @@ If there are one or more, download the backup tarball from backup storage, untar ## Alternatives Considered -Another proposal for higher level `DeleteItemActions` was initially included, which would require implementors to individually download the backup tarball themselves. +Another proposal for higher level `DeleteItemActions` was initially included, which would require implementers to individually download the backup tarball themselves. While this may be useful long term, it is not a good fit for the current goals as each plugin would be re-implementing a lot of boilerplate. See the deletion-plugins.md file for this alternative proposal in more detail. diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index c02ce7e48..ae4cbb9c9 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -540,8 +540,8 @@ func (b *backupReconciler) validateAndGetSnapshotLocations(backup *velerov1api.B if len(errors) > 0 { return nil, errors } - allLocations := &velerov1api.VolumeSnapshotLocationList{} - err := b.kbClient.List(context.Background(), allLocations, &kbclient.ListOptions{Namespace: backup.Namespace, LabelSelector: labels.Everything()}) + vsls := &velerov1api.VolumeSnapshotLocationList{} + err := b.kbClient.List(context.Background(), vsls, &kbclient.ListOptions{Namespace: backup.Namespace, LabelSelector: labels.Everything()}) if err != nil { errors = append(errors, fmt.Sprintf("error listing volume snapshot locations: %v", err)) return nil, errors @@ -549,8 +549,8 @@ func (b *backupReconciler) validateAndGetSnapshotLocations(backup *velerov1api.B // build a map of provider->list of all locations for the provider allProviderLocations := make(map[string][]*velerov1api.VolumeSnapshotLocation) - for i := range allLocations.Items { - loc := allLocations.Items[i] + for i := range vsls.Items { + loc := vsls.Items[i] allProviderLocations[loc.Spec.Provider] = append(allProviderLocations[loc.Spec.Provider], &loc) } diff --git a/pkg/controller/backup_deletion_controller.go b/pkg/controller/backup_deletion_controller.go index 5c9168a29..af439b834 100644 --- a/pkg/controller/backup_deletion_controller.go +++ b/pkg/controller/backup_deletion_controller.go @@ -570,7 +570,7 @@ func (r *backupDeletionReconciler) patchDeleteBackupRequest(ctx context.Context, } func (r *backupDeletionReconciler) patchBackup(ctx context.Context, backup *velerov1api.Backup, mutate func(*velerov1api.Backup)) (*velerov1api.Backup, error) { - //TODO: The patchHelper can't be used here because the `backup/xxx/status` does not exist, until the bakcup resource is refactored + //TODO: The patchHelper can't be used here because the `backup/xxx/status` does not exist, until the backup resource is refactored // Record original json oldData, err := json.Marshal(backup) diff --git a/pkg/exposer/csi_snapshot.go b/pkg/exposer/csi_snapshot.go index a0492d523..85f511524 100644 --- a/pkg/exposer/csi_snapshot.go +++ b/pkg/exposer/csi_snapshot.go @@ -361,6 +361,12 @@ func (e *csiSnapshotExposer) createBackupPod(ctx context.Context, ownerObject co var gracePeriod int64 = 0 volumeMounts, volumeDevices := kube.MakePodPVCAttachment(volumeName, backupPVC.Spec.VolumeMode) + if label == nil { + label = make(map[string]string) + } + + label[podGroupLabel] = podGroupSnapshot + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: podName, @@ -377,6 +383,18 @@ func (e *csiSnapshotExposer) createBackupPod(ctx context.Context, ownerObject co Labels: label, }, Spec: corev1.PodSpec{ + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "kubernetes.io/hostname", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + podGroupLabel: podGroupSnapshot, + }, + }, + }, + }, Containers: []corev1.Container{ { Name: containerName, diff --git a/pkg/exposer/types.go b/pkg/exposer/types.go index 21c473366..f87620e8f 100644 --- a/pkg/exposer/types.go +++ b/pkg/exposer/types.go @@ -23,6 +23,8 @@ import ( const ( AccessModeFileSystem = "by-file-system" AccessModeBlock = "by-block-device" + podGroupLabel = "velero.io/exposer-pod-group" + podGroupSnapshot = "snapshot-exposer" ) // ExposeResult defines the result of expose. diff --git a/site/content/docs/main/csi-snapshot-data-movement.md b/site/content/docs/main/csi-snapshot-data-movement.md index 7a68821ab..c9ff42d3e 100644 --- a/site/content/docs/main/csi-snapshot-data-movement.md +++ b/site/content/docs/main/csi-snapshot-data-movement.md @@ -326,7 +326,7 @@ Velero backs up resources for CSI snapshot data movement backup in the same way - CSI plugin checks if a data movement is required, if so it creates a `DataUpload` CR and then returns to Velero backup. - Velero now is able to back up other resources, including other PVC objects. - Velero backup controller periodically queries the data movement status from CSI plugin, the period is configurable through the Velero server parameter `--item-operation-sync-frequency`, by default it is 10s. On the call, CSI plugin turns to check the phase of the `DataUpload` CRs. -- When all the `DataUpload` CRs come to a terminal state (i.e., `Completed`, `Failed` or `Cancelled`), Velero backup perists all the necessary information and finish the backup. +- When all the `DataUpload` CRs come to a terminal state (i.e., `Completed`, `Failed` or `Cancelled`), Velero backup persists all the necessary information and finish the backup. - CSI plugin expects a data mover to handle the `DataUpload` CR. If no data mover is configured for the backup, Velero built-in data mover will handle it. - If the `DataUpload` CR does not reach to the terminal state with in the given time, the `DataUpload` CR will be cancelled. You can set the timeout value per backup through the `--item-operation-timeout` parameter, the default value is `4 hours`. diff --git a/site/content/docs/v0.5.0/faq.md b/site/content/docs/v0.5.0/faq.md index c96c6d549..b33998709 100644 --- a/site/content/docs/v0.5.0/faq.md +++ b/site/content/docs/v0.5.0/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v0.6.0/faq.md b/site/content/docs/v0.6.0/faq.md index 0fbfbbb35..07c9c9458 100644 --- a/site/content/docs/v0.6.0/faq.md +++ b/site/content/docs/v0.6.0/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v0.7.0/faq.md b/site/content/docs/v0.7.0/faq.md index 0fbfbbb35..07c9c9458 100644 --- a/site/content/docs/v0.7.0/faq.md +++ b/site/content/docs/v0.7.0/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v0.7.1/faq.md b/site/content/docs/v0.7.1/faq.md index 0fbfbbb35..07c9c9458 100644 --- a/site/content/docs/v0.7.1/faq.md +++ b/site/content/docs/v0.7.1/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v0.8.0/faq.md b/site/content/docs/v0.8.0/faq.md index 0fbfbbb35..07c9c9458 100644 --- a/site/content/docs/v0.8.0/faq.md +++ b/site/content/docs/v0.8.0/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v0.8.1/faq.md b/site/content/docs/v0.8.1/faq.md index 0fbfbbb35..07c9c9458 100644 --- a/site/content/docs/v0.8.1/faq.md +++ b/site/content/docs/v0.8.1/faq.md @@ -6,7 +6,7 @@ layout: docs ## When is it appropriate to use Ark instead of etcd's built in backup/restore? Etcd's backup/restore tooling is good for recovering from data loss in a single etcd cluster. For -example, it is a good idea to take a backup of etcd prior to upgrading etcd istelf. For more +example, it is a good idea to take a backup of etcd prior to upgrading etcd itself. For more sophisticated management of your Kubernetes cluster backups and restores, we feel that Ark is generally a better approach. It gives you the ability to throw away an unstable cluster and restore your Kubernetes resources and data into a new cluster, which you can't do easily just by backing up diff --git a/site/content/docs/v1.12/csi-snapshot-data-movement.md b/site/content/docs/v1.12/csi-snapshot-data-movement.md index 1fa402638..6705a3145 100644 --- a/site/content/docs/v1.12/csi-snapshot-data-movement.md +++ b/site/content/docs/v1.12/csi-snapshot-data-movement.md @@ -339,7 +339,7 @@ Velero backs up resources for CSI snapshot data movement backup in the same way - CSI plugin checks if a data movement is required, if so it creates a `DataUpload` CR and then returns to Velero backup. - Velero now is able to back up other resources, including other PVC objects. - Velero backup controller periodically queries the data movement status from CSI plugin, the period is configurable through the Velero server parameter `--item-operation-sync-frequency`, by default it is 10s. On the call, CSI plugin turns to check the phase of the `DataUpload` CRs. -- When all the `DataUpload` CRs come to a terminal state (i.e., `Completed`, `Failed` or `Cancelled`), Velero backup perists all the necessary information and finish the backup. +- When all the `DataUpload` CRs come to a terminal state (i.e., `Completed`, `Failed` or `Cancelled`), Velero backup persists all the necessary information and finish the backup. - CSI plugin expects a data mover to handle the `DataUpload` CR. If no data mover is configured for the backup, Velero built-in data mover will handle it. - If the `DataUpload` CR does not reach to the terminal state with in the given time, the `DataUpload` CR will be cancelled. You can set the timeout value per backup through the `--item-operation-timeout` parameter, the default value is `4 hours`. diff --git a/site/content/docs/v1.5/restic.md b/site/content/docs/v1.5/restic.md index 375994e8e..8265e0003 100644 --- a/site/content/docs/v1.5/restic.md +++ b/site/content/docs/v1.5/restic.md @@ -10,7 +10,7 @@ the supported cloud providers’ block storage offerings (Amazon EBS Volumes, Az It also provides a plugin model that enables anyone to implement additional object and block storage backends, outside the main Velero repository. -The restic intergation was added to give you an out-of-the-box solution for backing up and restoring almost any type of Kubernetes volume. This integration is an addition to Velero's capabilities, not a replacement for existing functionality. If you're running on AWS, and taking EBS snapshots as part of your regular Velero backups, there's no need to switch to using restic. However, if you need a volume snapshot plugin for your storage platform, or if you're using EFS, AzureFile, NFS, emptyDir, +The restic integration was added to give you an out-of-the-box solution for backing up and restoring almost any type of Kubernetes volume. This integration is an addition to Velero's capabilities, not a replacement for existing functionality. If you're running on AWS, and taking EBS snapshots as part of your regular Velero backups, there's no need to switch to using restic. However, if you need a volume snapshot plugin for your storage platform, or if you're using EFS, AzureFile, NFS, emptyDir, local, or any other volume type that doesn't have a native snapshot concept, restic might be for you. Restic is not tied to a specific storage platform, which means that this integration also paves the way for future work to enable diff --git a/test/e2e/backups/ttl.go b/test/e2e/backups/ttl.go index 98bc3d9a4..52069863c 100644 --- a/test/e2e/backups/ttl.go +++ b/test/e2e/backups/ttl.go @@ -171,7 +171,7 @@ func TTLTest() { Expect(t).To(Equal(test.ttl)) }) - By(fmt.Sprintf("Waiting %s minutes for removing backup ralated resources by GC", test.ttl.String()), func() { + By(fmt.Sprintf("Waiting %s minutes for removing backup related resources by GC", test.ttl.String()), func() { time.Sleep(test.ttl) }) diff --git a/test/e2e/bsl-mgmt/deletion.go b/test/e2e/bsl-mgmt/deletion.go index f0d7b414a..90b3abf57 100644 --- a/test/e2e/bsl-mgmt/deletion.go +++ b/test/e2e/bsl-mgmt/deletion.go @@ -159,7 +159,7 @@ func BslDeletionTest(useVolumeSnapshots bool) { Expect(AddLabelToPod(context.Background(), "kibishii-deployment-1", bslDeletionTestNs, label_2)).To(Succeed()) }) - By("Get all 2 PVCs of Kibishii and label them seprately ", func() { + By("Get all 2 PVCs of Kibishii and label them separately ", func() { pvc, err := GetPvcByPVCName(context.Background(), bslDeletionTestNs, podName_1) Expect(err).To(Succeed()) fmt.Println(pvc)