fail fs-backup for windows nodes

Signed-off-by: Lyndon-Li <lyonghui@vmware.com>
This commit is contained in:
Lyndon-Li
2024-11-27 14:07:28 +08:00
parent 010fd1cb1d
commit a711b1067b
12 changed files with 326 additions and 31 deletions

View File

@@ -0,0 +1 @@
Make fs-backup work on linux nodes with the new Velero deployment and disable fs-backup if the source/target pod is running in non-linux node (#8424)

View File

@@ -82,6 +82,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/restore"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)
@@ -454,10 +455,20 @@ func (s *server) veleroResourcesExist() error {
func (s *server) checkNodeAgent() {
// warn if node agent does not exist
if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent")
if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes")
}
}
if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes")
}
}
}

View File

@@ -1146,7 +1146,7 @@ func Test_csiSnapshotExposer_DiagnoseExpose(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1.DefaultNamespace,
Name: "node-agent-pod-1",
Labels: map[string]string{"name": "node-agent"},
Labels: map[string]string{"role": "node-agent"},
},
Spec: corev1.PodSpec{
NodeName: "fake-node",

View File

@@ -627,7 +627,7 @@ func Test_ReastoreDiagnoseExpose(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1.DefaultNamespace,
Name: "node-agent-pod-1",
Labels: map[string]string{"name": "node-agent"},
Labels: map[string]string{"role": "node-agent"},
},
Spec: corev1.PodSpec{
NodeName: "fake-node",

View File

@@ -33,8 +33,14 @@ import (
)
const (
// daemonSet is the name of the Velero node agent daemonset.
// daemonSet is the name of the Velero node agent daemonset on linux nodes.
daemonSet = "node-agent"
// daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes.
daemonsetWindows = "node-agent-windows"
// nodeAgentRole marks pods with node-agent role on all nodes.
nodeAgentRole = "node-agent"
)
var (
@@ -89,9 +95,16 @@ type Configs struct {
PodResources *kube.PodResources `json:"podResources,omitempty"`
}
// IsRunning checks if the node agent daemonset is running properly. If not, return the error found
func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) {
func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonSet)
}
func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonsetWindows)
}
func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) {
return ErrDaemonSetNotFound
} else if err != nil {
return err
@@ -116,7 +129,7 @@ func isRunningInNode(ctx context.Context, namespace string, nodeName string, crC
}
pods := new(v1.PodList)
parsedSelector, err := labels.Parse(fmt.Sprintf("name=%s", daemonSet))
parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
if err != nil {
return errors.Wrap(err, "fail to parse selector")
}
@@ -128,7 +141,7 @@ func isRunningInNode(ctx context.Context, namespace string, nodeName string, crC
}
if err != nil {
return errors.Wrap(err, "failed to list daemonset pods")
return errors.Wrap(err, "failed to list node-agent pods")
}
for i := range pods.Items {

View File

@@ -40,7 +40,7 @@ type reactor struct {
}
func TestIsRunning(t *testing.T) {
daemonSet := &appsv1.DaemonSet{
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: "fake-ns",
Name: "node-agent",
@@ -80,7 +80,7 @@ func TestIsRunning(t *testing.T) {
name: "succeed",
namespace: "fake-ns",
kubeClientObj: []runtime.Object{
daemonSet,
ds,
},
},
}
@@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) {
fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc)
}
err := IsRunning(context.TODO(), fakeKubeClient, test.namespace)
err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet)
if test.expectErr == "" {
assert.NoError(t, err)
} else {
@@ -108,11 +108,11 @@ func TestIsRunningInNode(t *testing.T) {
corev1.AddToScheme(scheme)
nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod").Labels(map[string]string{"name": "node-agent"}).Result()
nodeAgentPodRunning1 := builder.ForPod("fake-ns", "fake-pod-1").Labels(map[string]string{"name": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning2 := builder.ForPod("fake-ns", "fake-pod-2").Labels(map[string]string{"name": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod").Labels(map[string]string{"role": "node-agent"}).Result()
nodeAgentPodRunning1 := builder.ForPod("fake-ns", "fake-pod-1").Labels(map[string]string{"role": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning2 := builder.ForPod("fake-ns", "fake-pod-2").Labels(map[string]string{"role": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning3 := builder.ForPod("fake-ns", "fake-pod-3").
Labels(map[string]string{"name": "node-agent"}).
Labels(map[string]string{"role": "node-agent"}).
Phase(corev1.PodRunning).
NodeName("fake-node").
Result()

View File

@@ -206,6 +206,12 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api.
return nil, pvcSummary, nil
}
if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil {
err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName)
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
return nil, pvcSummary, []error{err}
}
err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient)
if err != nil {
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)

View File

@@ -260,7 +260,7 @@ func createPodObj(running bool, withVolume bool, withVolumeMounted bool, volumeN
func createNodeAgentPodObj(running bool) *corev1api.Pod {
podObj := builder.ForPod(velerov1api.DefaultNamespace, "fake-node-agent").Result()
podObj.Labels = map[string]string{"name": "node-agent"}
podObj.Labels = map[string]string{"role": "node-agent"}
if running {
podObj.Status.Phase = corev1api.PodRunning
@@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string)
return pvbObj
}
func createNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
}
func createWindowsNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
}
func TestBackupPodVolumes(t *testing.T) {
scheme := runtime.NewScheme()
velerov1api.AddToScheme(scheme)
@@ -358,13 +366,32 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
},
{
name: "pod is not running on Linux node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createWindowsNodeObj(),
},
sourcePod: createPodObj(false, false, false, 2),
uploaderType: "kopia",
errs: []string{
"Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip",
},
},
{
name: "node-agent pod is not running in node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
sourcePod: createPodObj(true, false, false, 2),
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
"daemonset pod not found in running state in node fake-node-name",
@@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
mockGetRepositoryType: true,
@@ -395,6 +423,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
@@ -410,6 +439,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -427,6 +457,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -448,6 +479,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
},
@@ -471,6 +503,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, true),
@@ -482,6 +515,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "volume not mounted by pod should be skipped",
@@ -492,6 +526,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, false),
@@ -503,6 +538,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "return completed pvbs",
@@ -512,6 +548,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, true, 1),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVObj(1, false),
},
@@ -522,6 +559,7 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
pvbs: 1,
errs: []string{},
},
}
// TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes"
@@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) {
pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger())
if errs == nil {
assert.Nil(t, test.errs)
if test.errs == nil {
assert.NoError(t, err)
} else {
for i := 0; i < len(errs); i++ {
assert.EqualError(t, errs[i], test.errs[i])

View File

@@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
return nil
}
if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
return []error{errors.Wrapf(err, "error to check node agent status")}
}
@@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
} else if err != nil {
r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
} else {
if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not running in linux node")
r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName)
return
}
err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient)
if err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")

View File

@@ -33,7 +33,6 @@ import (
"k8s.io/client-go/kubernetes"
kubefake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake"
"github.com/vmware-tanzu/velero/internal/volume"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -314,6 +313,30 @@ func TestRestorePodVolumes(t *testing.T) {
},
},
},
{
name: "pod is not running on linux nodes",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createWindowsNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
},
restoredPod: createPodObj(true, true, true, 1),
sourceNamespace: "fake-ns",
bsl: "fake-bsl",
runtimeScheme: scheme,
errs: []expectError{
{
err: "restored pod fake-ns/fake-pod is not running in linux node(fake-node-name): os type windows for node fake-node-name is not linux",
},
},
},
{
name: "node-agent pod is not running",
pvbs: []*velerov1api.PodVolumeBackup{
@@ -321,6 +344,7 @@ func TestRestorePodVolumes(t *testing.T) {
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
@@ -344,6 +368,7 @@ func TestRestorePodVolumes(t *testing.T) {
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
createNodeAgentPodObj(true),
@@ -368,11 +393,6 @@ func TestRestorePodVolumes(t *testing.T) {
ctx = test.ctx
}
fakeClientBuilder := ctrlfake.NewClientBuilder()
if test.runtimeScheme != nil {
fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme)
}
objClient := append(test.ctlClientObj, test.kubeClientObj...)
objClient = append(objClient, test.veleroClientObj...)
@@ -438,7 +458,8 @@ func TestRestorePodVolumes(t *testing.T) {
for i := 0; i < len(errs); i++ {
j := 0
for ; j < len(test.errs); j++ {
if errs[i].Error() == test.errs[j].err {
err := errs[i].Error()
if err == test.errs[j].err {
break
}
}

67
pkg/util/kube/node.go Normal file
View File

@@ -0,0 +1,67 @@
/*
Copyright The Velero Contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kube
import (
"context"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
corev1api "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
// IsLinuxNode returns nil if the node named nodeName carries the
// "kubernetes.io/os: linux" label. Otherwise it returns an error describing
// why the node cannot be treated as a linux node (node not found, label
// missing, or a different os value).
func IsLinuxNode(ctx context.Context, nodeName string, client client.Client) error {
	node := &corev1api.Node{}
	if err := client.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
		return errors.Wrapf(err, "error getting node %s", nodeName)
	}

	// Guard-clause form: bail out on each failure condition in turn.
	os, found := node.Labels["kubernetes.io/os"]
	if !found {
		return errors.Errorf("no os type label for node %s", nodeName)
	}

	if os != "linux" {
		return errors.Errorf("os type %s for node %s is not linux", os, nodeName)
	}

	return nil
}
// WithLinuxNode returns true if the cluster contains at least one node whose
// "kubernetes.io/os" label is "linux". List/label problems are logged and
// treated as "not found" (false).
func WithLinuxNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
	return withOSNode(ctx, client, "linux", log)
}
// WithWindowsNode returns true if the cluster contains at least one node whose
// "kubernetes.io/os" label is "windows". List/label problems are logged and
// treated as "not found" (false).
func WithWindowsNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
	return withOSNode(ctx, client, "windows", log)
}
// withOSNode returns true if at least one node in the cluster has a
// "kubernetes.io/os" label equal to osType. Nodes missing the label are
// logged and skipped. If listing nodes fails, it logs a warning and fails
// open to false ("no such node found").
func withOSNode(ctx context.Context, client client.Client, osType string, log logrus.FieldLogger) bool {
	nodeList := new(corev1api.NodeList)
	if err := client.List(ctx, nodeList); err != nil {
		// Message previously hardcoded "windows" even when checking for linux
		// nodes; use the osType parameter so the warning matches the query.
		log.Warnf("Failed to list nodes, cannot decide existence of %s nodes", osType)
		return false
	}

	for _, node := range nodeList.Items {
		os, found := node.Labels["kubernetes.io/os"]
		if !found {
			// The original Warnf had a %s verb with no argument (go vet printf
			// error, would print "%!s(MISSING)"); supply the node name, and use
			// osType instead of a hardcoded "windows".
			log.Warnf("Node %s doesn't have os type label, cannot decide existence of %s nodes", node.Name, osType)
			continue
		}

		if os == osType {
			return true
		}
	}

	return false
}

132
pkg/util/kube/node_test.go Normal file
View File

@@ -0,0 +1,132 @@
/*
Copyright The Velero Contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kube
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"github.com/vmware-tanzu/velero/pkg/builder"
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
)
// TestIsLinuxNode covers the four outcomes of IsLinuxNode: node not found,
// node without an os label, node with a non-linux os label, and a linux node.
func TestIsLinuxNode(t *testing.T) {
	nodeNoOSLabel := builder.ForNode("fake-node").Result()
	nodeWindows := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
	nodeLinux := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()

	scheme := runtime.NewScheme()
	corev1.AddToScheme(scheme)

	cases := []struct {
		name          string
		kubeClientObj []runtime.Object
		err           string
	}{
		{
			name: "error getting node",
			err:  "error getting node fake-node: nodes \"fake-node\" not found",
		},
		{
			name: "no os label",
			kubeClientObj: []runtime.Object{
				nodeNoOSLabel,
			},
			err: "no os type label for node fake-node",
		},
		{
			name: "os label does not match",
			kubeClientObj: []runtime.Object{
				nodeWindows,
			},
			err: "os type windows for node fake-node is not linux",
		},
		{
			name: "succeed",
			kubeClientObj: []runtime.Object{
				nodeLinux,
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			fakeClient := clientFake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(tc.kubeClientObj...).
				Build()

			err := IsLinuxNode(context.TODO(), "fake-node", fakeClient)
			if err != nil {
				assert.EqualError(t, err, tc.err)
			} else {
				assert.NoError(t, err)
			}
		})
	}
}
// TestWithLinuxNode exercises withOSNode with "linux" as the target os type:
// no nodes listed, only a non-matching (windows) node, and a mixed cluster
// that contains a linux node.
func TestWithLinuxNode(t *testing.T) {
	nodeWindows := builder.ForNode("fake-node-1").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
	nodeLinux := builder.ForNode("fake-node-2").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()

	scheme := runtime.NewScheme()
	corev1.AddToScheme(scheme)

	cases := []struct {
		name          string
		kubeClientObj []runtime.Object
		result        bool
	}{
		{
			name: "error listing node",
		},
		{
			name: "with node of other type",
			kubeClientObj: []runtime.Object{
				nodeWindows,
			},
		},
		{
			name: "with node of the same type",
			kubeClientObj: []runtime.Object{
				nodeWindows,
				nodeLinux,
			},
			result: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			fakeClient := clientFake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(tc.kubeClientObj...).
				Build()

			found := withOSNode(context.TODO(), fakeClient, "linux", velerotest.NewLogger())
			assert.Equal(t, tc.result, found)
		})
	}
}