diff --git a/changelogs/unreleased/9072-Lyndon-Li b/changelogs/unreleased/9072-Lyndon-Li new file mode 100644 index 000000000..c7ad770a4 --- /dev/null +++ b/changelogs/unreleased/9072-Lyndon-Li @@ -0,0 +1 @@ +Fix issue #8857, support third party tolerations for data mover pods \ No newline at end of file diff --git a/pkg/controller/data_download_controller.go b/pkg/controller/data_download_controller.go index 22f811af9..6cd007138 100644 --- a/pkg/controller/data_download_controller.go +++ b/pkg/controller/data_download_controller.go @@ -840,6 +840,17 @@ func (r *DataDownloadReconciler) setupExposeParam(dd *velerov2alpha1api.DataDown } } + hostingPodTolerations := []corev1api.Toleration{} + for _, k := range util.ThirdPartyTolerations { + if v, err := nodeagent.GetToleration(context.Background(), r.kubeClient, dd.Namespace, k, nodeOS); err != nil { + if err != nodeagent.ErrNodeAgentTolerationNotFound { + log.WithError(err).Warnf("Failed to check node-agent toleration, skip adding host pod toleration %s", k) + } + } else { + hostingPodTolerations = append(hostingPodTolerations, *v) + } + } + affinity := kube.GetLoadAffinityByStorageClass(r.loadAffinity, dd.Spec.BackupStorageLocation, log) return exposer.GenericRestoreExposeParam{ @@ -847,6 +858,7 @@ func (r *DataDownloadReconciler) setupExposeParam(dd *velerov2alpha1api.DataDown TargetNamespace: dd.Spec.TargetVolume.Namespace, HostingPodLabels: hostingPodLabels, HostingPodAnnotations: hostingPodAnnotation, + HostingPodTolerations: hostingPodTolerations, Resources: r.podResources, OperationTimeout: dd.Spec.OperationTimeout.Duration, ExposeTimeout: r.preparingTimeout, diff --git a/pkg/controller/data_upload_controller.go b/pkg/controller/data_upload_controller.go index 9f9563d6b..a3d119311 100644 --- a/pkg/controller/data_upload_controller.go +++ b/pkg/controller/data_upload_controller.go @@ -917,6 +917,17 @@ func (r *DataUploadReconciler) setupExposeParam(du *velerov2alpha1api.DataUpload } } + hostingPodTolerations := []corev1api.Toleration{} + for _, k := range util.ThirdPartyTolerations { + if v, err := nodeagent.GetToleration(context.Background(), r.kubeClient, du.Namespace, k, nodeOS); err != nil { + if err != nodeagent.ErrNodeAgentTolerationNotFound { + log.WithError(err).Warnf("Failed to check node-agent toleration, skip adding host pod toleration %s", k) + } + } else { + hostingPodTolerations = append(hostingPodTolerations, *v) + } + } + affinity := kube.GetLoadAffinityByStorageClass(r.loadAffinity, du.Spec.CSISnapshot.SnapshotClass, log) return &exposer.CSISnapshotExposeParam{ @@ -925,6 +936,7 @@ func (r *DataUploadReconciler) setupExposeParam(du *velerov2alpha1api.DataUpload StorageClass: du.Spec.CSISnapshot.StorageClass, HostingPodLabels: hostingPodLabels, HostingPodAnnotations: hostingPodAnnotation, + HostingPodTolerations: hostingPodTolerations, AccessMode: accessMode, OperationTimeout: du.Spec.OperationTimeout.Duration, ExposeTimeout: r.preparingTimeout, diff --git a/pkg/controller/pod_volume_backup_controller.go b/pkg/controller/pod_volume_backup_controller.go index 344ccba97..177bfbf55 100644 --- a/pkg/controller/pod_volume_backup_controller.go +++ b/pkg/controller/pod_volume_backup_controller.go @@ -756,9 +756,14 @@ func (r *PodVolumeBackupReconciler) closeDataPath(ctx context.Context, pvbName s func (r *PodVolumeBackupReconciler) setupExposeParam(pvb *velerov1api.PodVolumeBackup) exposer.PodVolumeExposeParam { log := r.logger.WithField("PVB", pvb.Name) + nodeOS, err := kube.GetNodeOS(context.Background(), pvb.Spec.Node, r.kubeClient.CoreV1()) + if err != nil { + log.WithError(err).Warnf("Failed to get nodeOS for node %s, use linux node-agent for hosting pod labels, annotations and tolerations", pvb.Spec.Node) + } + hostingPodLabels := map[string]string{velerov1api.PVBLabel: pvb.Name} for _, k := range util.ThirdPartyLabels { - if v, err := nodeagent.GetLabelValue(context.Background(), r.kubeClient, pvb.Namespace, k, ""); err != nil { + if v, err := nodeagent.GetLabelValue(context.Background(), r.kubeClient, pvb.Namespace, k, nodeOS); err != nil { if err != nodeagent.ErrNodeAgentLabelNotFound { log.WithError(err).Warnf("Failed to check node-agent label, skip adding host pod label %s", k) } @@ -769,7 +774,7 @@ func (r *PodVolumeBackupReconciler) setupExposeParam(pvb *velerov1api.PodVolumeB hostingPodAnnotation := map[string]string{} for _, k := range util.ThirdPartyAnnotations { - if v, err := nodeagent.GetAnnotationValue(context.Background(), r.kubeClient, pvb.Namespace, k, ""); err != nil { + if v, err := nodeagent.GetAnnotationValue(context.Background(), r.kubeClient, pvb.Namespace, k, nodeOS); err != nil { if err != nodeagent.ErrNodeAgentAnnotationNotFound { log.WithError(err).Warnf("Failed to check node-agent annotation, skip adding host pod annotation %s", k) } @@ -778,6 +783,17 @@ func (r *PodVolumeBackupReconciler) setupExposeParam(pvb *velerov1api.PodVolumeB } } + hostingPodTolerations := []corev1api.Toleration{} + for _, k := range util.ThirdPartyTolerations { + if v, err := nodeagent.GetToleration(context.Background(), r.kubeClient, pvb.Namespace, k, nodeOS); err != nil { + if err != nodeagent.ErrNodeAgentTolerationNotFound { + log.WithError(err).Warnf("Failed to check node-agent toleration, skip adding host pod toleration %s", k) + } + } else { + hostingPodTolerations = append(hostingPodTolerations, *v) + } + } + return exposer.PodVolumeExposeParam{ Type: exposer.PodVolumeExposeTypeBackup, ClientNamespace: pvb.Spec.Pod.Namespace, @@ -785,6 +801,7 @@ func (r *PodVolumeBackupReconciler) setupExposeParam(pvb *velerov1api.PodVolumeB ClientPodVolume: pvb.Spec.Volume, HostingPodLabels: hostingPodLabels, HostingPodAnnotations: hostingPodAnnotation, + HostingPodTolerations: hostingPodTolerations, OperationTimeout: r.resourceTimeout, Resources: r.podResources, } diff --git a/pkg/controller/pod_volume_restore_controller.go b/pkg/controller/pod_volume_restore_controller.go index 8894bceda..41a12a7b8 100644 --- a/pkg/controller/pod_volume_restore_controller.go +++ b/pkg/controller/pod_volume_restore_controller.go @@ -820,9 +820,14 @@ func (r *PodVolumeRestoreReconciler) OnDataPathProgress(ctx context.Context, nam func (r *PodVolumeRestoreReconciler) setupExposeParam(pvr *velerov1api.PodVolumeRestore) exposer.PodVolumeExposeParam { log := r.logger.WithField("PVR", pvr.Name) + nodeOS, err := kube.GetNodeOS(context.Background(), pvr.Status.Node, r.kubeClient.CoreV1()) + if err != nil { + log.WithError(err).Warnf("Failed to get nodeOS for node %s, use linux node-agent for hosting pod labels, annotations and tolerations", pvr.Status.Node) + } + hostingPodLabels := map[string]string{velerov1api.PVRLabel: pvr.Name} for _, k := range util.ThirdPartyLabels { - if v, err := nodeagent.GetLabelValue(context.Background(), r.kubeClient, pvr.Namespace, k, ""); err != nil { + if v, err := nodeagent.GetLabelValue(context.Background(), r.kubeClient, pvr.Namespace, k, nodeOS); err != nil { if err != nodeagent.ErrNodeAgentLabelNotFound { log.WithError(err).Warnf("Failed to check node-agent label, skip adding host pod label %s", k) } @@ -833,7 +838,7 @@ func (r *PodVolumeRestoreReconciler) setupExposeParam(pvr *velerov1api.PodVolume hostingPodAnnotation := map[string]string{} for _, k := range util.ThirdPartyAnnotations { - if v, err := nodeagent.GetAnnotationValue(context.Background(), r.kubeClient, pvr.Namespace, k, ""); err != nil { + if v, err := nodeagent.GetAnnotationValue(context.Background(), r.kubeClient, pvr.Namespace, k, nodeOS); err != nil { if err != nodeagent.ErrNodeAgentAnnotationNotFound { log.WithError(err).Warnf("Failed to check node-agent annotation, skip adding host pod annotation %s", k) } @@ -842,6 +847,17 @@ func (r *PodVolumeRestoreReconciler) setupExposeParam(pvr *velerov1api.PodVolume } } + hostingPodTolerations := []corev1api.Toleration{} + for _, k := range util.ThirdPartyTolerations { + if v, err := nodeagent.GetToleration(context.Background(), r.kubeClient, pvr.Namespace, k, nodeOS); err != nil { + if err != nodeagent.ErrNodeAgentTolerationNotFound { + log.WithError(err).Warnf("Failed to check node-agent toleration, skip adding host pod toleration %s", k) + } + } else { + hostingPodTolerations = append(hostingPodTolerations, *v) + } + } + return exposer.PodVolumeExposeParam{ Type: exposer.PodVolumeExposeTypeRestore, ClientNamespace: pvr.Spec.Pod.Namespace, @@ -849,6 +865,7 @@ func (r *PodVolumeRestoreReconciler) setupExposeParam(pvr *velerov1api.PodVolume ClientPodVolume: pvr.Spec.Volume, HostingPodLabels: hostingPodLabels, HostingPodAnnotations: hostingPodAnnotation, + HostingPodTolerations: hostingPodTolerations, OperationTimeout: r.resourceTimeout, Resources: r.podResources, } diff --git a/pkg/exposer/csi_snapshot.go b/pkg/exposer/csi_snapshot.go index a946c8244..f88696faf 100644 --- a/pkg/exposer/csi_snapshot.go +++ b/pkg/exposer/csi_snapshot.go @@ -59,6 +59,9 @@ type CSISnapshotExposeParam struct { // HostingPodAnnotations is the annotations that are going to apply to the hosting pod HostingPodAnnotations map[string]string + // HostingPodTolerations is the tolerations that are going to apply to the hosting pod + HostingPodTolerations []corev1api.Toleration + // OperationTimeout specifies the time wait for resources operations in Expose OperationTimeout time.Duration @@ -215,6 +218,7 @@ func (e *csiSnapshotExposer) Expose(ctx context.Context, ownerObject corev1api.O csiExposeParam.OperationTimeout, csiExposeParam.HostingPodLabels, csiExposeParam.HostingPodAnnotations, + csiExposeParam.HostingPodTolerations, csiExposeParam.Affinity, csiExposeParam.Resources, backupPVCReadOnly, @@ -528,6 +532,7 @@ func (e *csiSnapshotExposer) createBackupPod( operationTimeout time.Duration, label map[string]string, annotation map[string]string, + toleration []corev1api.Toleration, affinity *kube.LoadAffinity, resources corev1api.ResourceRequirements, backupPVCReadOnly bool, @@ -586,7 +591,6 @@ func (e *csiSnapshotExposer) createBackupPod( var securityCtx *corev1api.PodSecurityContext nodeSelector := map[string]string{} podOS := corev1api.PodOS{} - toleration := []corev1api.Toleration{} if nodeOS == kube.NodeOSWindows { userID := "ContainerAdministrator" securityCtx = &corev1api.PodSecurityContext{ diff --git a/pkg/exposer/generic_restore.go b/pkg/exposer/generic_restore.go index 53c8e09a7..fe658008b 100644 --- a/pkg/exposer/generic_restore.go +++ b/pkg/exposer/generic_restore.go @@ -49,6 +49,9 @@ type GenericRestoreExposeParam struct { // HostingPodAnnotations is the annotations that are going to apply to the hosting pod HostingPodAnnotations map[string]string + // HostingPodTolerations is the tolerations that are going to apply to the hosting pod + HostingPodTolerations []corev1api.Toleration + // Resources defines the resource requirements of the hosting pod Resources corev1api.ResourceRequirements @@ -140,6 +143,7 @@ func (e *genericRestoreExposer) Expose(ctx context.Context, ownerObject corev1ap param.OperationTimeout, param.HostingPodLabels, param.HostingPodAnnotations, + param.HostingPodTolerations, selectedNode, param.Resources, param.NodeOS, @@ -405,6 +409,7 @@ func (e *genericRestoreExposer) createRestorePod( operationTimeout time.Duration, label map[string]string, annotation map[string]string, + toleration []corev1api.Toleration, selectedNode string, resources corev1api.ResourceRequirements, nodeOS string, @@ -467,7 +472,6 @@ func (e *genericRestoreExposer) createRestorePod( var securityCtx *corev1api.PodSecurityContext nodeSelector := map[string]string{} podOS := corev1api.PodOS{} - toleration := []corev1api.Toleration{} if nodeOS == kube.NodeOSWindows { userID := "ContainerAdministrator" securityCtx = &corev1api.PodSecurityContext{ diff --git a/pkg/exposer/generic_restore_test.go b/pkg/exposer/generic_restore_test.go index 4270e9bc3..45fe862b0 100644 --- a/pkg/exposer/generic_restore_test.go +++ b/pkg/exposer/generic_restore_test.go @@ -813,6 +813,28 @@ func TestCreateRestorePod(t *testing.T) { }, } + daemonSetWin := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "velero", + Name: "node-agent-windows", + }, + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + APIVersion: appsv1api.SchemeGroupVersion.String(), + }, + Spec: appsv1api.DaemonSetSpec{ + Template: corev1api.PodTemplateSpec{ + Spec: corev1api.PodSpec{ + Containers: []corev1api.Container{ + { + Image: "fake-image", + }, + }, + }, + }, + }, + } + targetPVCObj := &corev1api.PersistentVolumeClaim{ ObjectMeta: metav1.ObjectMeta{ Namespace: "fake-ns", @@ -828,11 +850,12 @@ func TestCreateRestorePod(t *testing.T) { kubeClientObj []runtime.Object selectedNode string affinity *kube.LoadAffinity + nodeOS string expectedPod *corev1api.Pod }{ { - name: "", - kubeClientObj: []runtime.Object{daemonSet, targetPVCObj}, + name: "linux", + kubeClientObj: []runtime.Object{daemonSet, daemonSetWin, targetPVCObj}, selectedNode: "", affinity: &kube.LoadAffinity{ NodeSelector: metav1.LabelSelector{ @@ -840,12 +863,31 @@ func TestCreateRestorePod(t *testing.T) { { Key: "kubernetes.io/os", Operator: metav1.LabelSelectorOpIn, - Values: []string{"Linux"}, + Values: []string{"linux"}, }, }, }, StorageClass: scName, }, + nodeOS: "linux", + }, + { + name: "windows", + kubeClientObj: []runtime.Object{daemonSet, daemonSetWin, targetPVCObj}, + selectedNode: "", + affinity: &kube.LoadAffinity{ + NodeSelector: metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "kubernetes.io/os", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"windows"}, + }, + }, + }, + StorageClass: scName, + }, + nodeOS: "windows", }, } @@ -866,11 +908,12 @@ func TestCreateRestorePod(t *testing.T) { }, targetPVCObj, time.Second*3, - map[string]string{}, - map[string]string{}, + nil, + nil, + nil, test.selectedNode, corev1api.ResourceRequirements{}, - "linux", + test.nodeOS, test.affinity, ) diff --git a/pkg/exposer/pod_volume.go b/pkg/exposer/pod_volume.go index b402ffecb..d34972b9d 100644 --- a/pkg/exposer/pod_volume.go +++ b/pkg/exposer/pod_volume.go @@ -59,6 +59,9 @@ type PodVolumeExposeParam struct { // HostingPodAnnotations is the annotations that are going to apply to the hosting pod HostingPodAnnotations map[string]string + // HostingPodTolerations is the tolerations that are going to apply to the hosting pod + HostingPodTolerations []corev1api.Toleration + // Resources defines the resource requirements of the hosting pod Resources corev1api.ResourceRequirements @@ -147,7 +150,7 @@ func (e *podVolumeExposer) Expose(ctx context.Context, ownerObject corev1api.Obj curLog.WithField("path", path).Infof("Host path is retrieved for pod %s, volume %s", param.ClientPodName, param.ClientPodVolume) - hostingPod, err := e.createHostingPod(ctx, ownerObject, param.Type, path.ByPath, param.OperationTimeout, param.HostingPodLabels, param.HostingPodAnnotations, pod.Spec.NodeName, param.Resources, nodeOS) + hostingPod, err := e.createHostingPod(ctx, ownerObject, param.Type, path.ByPath, param.OperationTimeout, param.HostingPodLabels, param.HostingPodAnnotations, param.HostingPodTolerations, pod.Spec.NodeName, param.Resources, nodeOS) if err != nil { return errors.Wrapf(err, "error to create hosting pod") } @@ -263,7 +266,7 @@ func (e *podVolumeExposer) CleanUp(ctx context.Context, ownerObject corev1api.Ob } func (e *podVolumeExposer) createHostingPod(ctx context.Context, ownerObject corev1api.ObjectReference, exposeType string, hostPath string, - operationTimeout time.Duration, label map[string]string, annotation map[string]string, selectedNode string, resources corev1api.ResourceRequirements, nodeOS string) (*corev1api.Pod, error) { + operationTimeout time.Duration, label map[string]string, annotation map[string]string, toleration []corev1api.Toleration, selectedNode string, resources corev1api.ResourceRequirements, nodeOS string) (*corev1api.Pod, error) { hostingPodName := ownerObject.Name containerName := string(ownerObject.UID) @@ -318,7 +321,6 @@ func (e *podVolumeExposer) createHostingPod(ctx context.Context, ownerObject cor var securityCtx *corev1api.PodSecurityContext nodeSelector := map[string]string{} podOS := corev1api.PodOS{} - toleration := []corev1api.Toleration{} if nodeOS == kube.NodeOSWindows { userID := "ContainerAdministrator" securityCtx = &corev1api.PodSecurityContext{ diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go index 3d1159085..463e5173b 100644 --- a/pkg/nodeagent/node_agent.go +++ b/pkg/nodeagent/node_agent.go @@ -53,6 +53,7 @@ var ( ErrDaemonSetNotFound = errors.New("daemonset not found") ErrNodeAgentLabelNotFound = errors.New("node-agent label not found") ErrNodeAgentAnnotationNotFound = errors.New("node-agent annotation not found") + ErrNodeAgentTolerationNotFound = errors.New("node-agent toleration not found") ) type LoadConcurrency struct { @@ -256,6 +257,26 @@ func GetAnnotationValue(ctx context.Context, kubeClient kubernetes.Interface, na return val, nil } +func GetToleration(ctx context.Context, kubeClient kubernetes.Interface, namespace string, key string, osType string) (*corev1api.Toleration, error) { + dsName := daemonSet + if osType == kube.NodeOSWindows { + dsName = daemonsetWindows + } + + ds, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, dsName, metav1.GetOptions{}) + if err != nil { + return nil, errors.Wrapf(err, "error getting %s daemonset", dsName) + } + + for i, t := range ds.Spec.Template.Spec.Tolerations { + if t.Key == key { + return &ds.Spec.Template.Spec.Tolerations[i], nil + } + } + + return nil, ErrNodeAgentTolerationNotFound +} + func GetHostPodPath(ctx context.Context, kubeClient kubernetes.Interface, namespace string, osType string) (string, error) { dsName := daemonSet if osType == kube.NodeOSWindows { diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go index f41752884..4790b94c4 100644 --- a/pkg/nodeagent/node_agent_test.go +++ b/pkg/nodeagent/node_agent_test.go @@ -592,6 +592,116 @@ func TestGetAnnotationValue(t *testing.T) { } } +func TestGetToleration(t *testing.T) { + daemonSet := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "fake-ns", + Name: "node-agent", + }, + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + }, + } + + daemonSetWithOtherToleration := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "fake-ns", + Name: "node-agent", + }, + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + }, + Spec: appsv1api.DaemonSetSpec{ + Template: corev1api.PodTemplateSpec{ + Spec: corev1api.PodSpec{ + Tolerations: []corev1api.Toleration{ + { + Key: "other-toleration-key", + }, + }, + }, + }, + }, + } + + daemonSetWithToleration := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "fake-ns", + Name: "node-agent", + }, + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + }, + Spec: appsv1api.DaemonSetSpec{ + Template: corev1api.PodTemplateSpec{ + Spec: corev1api.PodSpec{ + Tolerations: []corev1api.Toleration{ + { + Key: "fake-toleration", + Value: "true", + }, + }, + }, + }, + }, + } + + tests := []struct { + name string + kubeClientObj []runtime.Object + namespace string + expectedValue corev1api.Toleration + expectErr string + }{ + // { + // name: "ds get error", + // namespace: "fake-ns", + // expectErr: "error getting node-agent daemonset: daemonsets.apps \"node-agent\" not found", + // }, + { + name: "no toleration", + namespace: "fake-ns", + kubeClientObj: []runtime.Object{ + daemonSet, + }, + expectErr: ErrNodeAgentTolerationNotFound.Error(), + }, + { + name: "no expecting toleration", + namespace: "fake-ns", + kubeClientObj: []runtime.Object{ + daemonSetWithOtherToleration, + }, + expectErr: ErrNodeAgentTolerationNotFound.Error(), + }, + { + name: "expecting toleration", + namespace: "fake-ns", + kubeClientObj: []runtime.Object{ + daemonSetWithToleration, + }, + expectedValue: corev1api.Toleration{ + Key: "fake-toleration", + Value: "true", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + fakeKubeClient := fake.NewSimpleClientset(test.kubeClientObj...) + + value, err := GetToleration(context.TODO(), fakeKubeClient, test.namespace, "fake-toleration", kube.NodeOSLinux) + if test.expectErr == "" { + require.NoError(t, err) + assert.Equal(t, test.expectedValue, *value) + } else { + assert.EqualError(t, err, test.expectErr) + } + }) + } +} + func TestGetHostPodPath(t *testing.T) { daemonSet := &appsv1api.DaemonSet{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/util/third_party.go b/pkg/util/third_party.go index 04f30ccb0..e85dc4a24 100644 --- a/pkg/util/third_party.go +++ b/pkg/util/third_party.go @@ -23,3 +23,8 @@ var ThirdPartyLabels = []string{ var ThirdPartyAnnotations = []string{ "iam.amazonaws.com/role", } + +var ThirdPartyTolerations = []string{ + "kubernetes.azure.com/scalesetpriority", + "CriticalAddonsOnly", +}