From e4e9b18b37af80399f1b69c9e89bb30c744f7cfe Mon Sep 17 00:00:00 2001
From: Lyndon-Li
Date: Wed, 4 Dec 2024 10:28:50 +0800
Subject: [PATCH] add diagnostic for data mover exposer

Signed-off-by: Lyndon-Li
---
 pkg/controller/data_download_controller.go    |  2 +
 .../data_download_controller_test.go          |  4 ++
 pkg/controller/data_upload_controller.go      |  2 +
 pkg/controller/data_upload_controller_test.go |  8 +++
 pkg/exposer/csi_snapshot.go                   | 64 ++++++++++++++++++++
 pkg/exposer/generic_restore.go                | 50 ++++++++++++++++
 ...ic_restore.go => GenericRestoreExposer.go} | 18 ++++++
 pkg/exposer/snapshot.go                       |  4 ++
 pkg/nodeagent/node_agent.go                   | 18 +++-
 pkg/util/csi/volume_snapshot.go               | 42 +++++++++++++
 pkg/util/kube/pod.go                          | 10 +++
 pkg/util/kube/pvc_pv.go                       | 17 ++++++
 pkg/util/kube/pvc_pv_test.go                  |  3 +
 13 files changed, 240 insertions(+), 2 deletions(-)
 rename pkg/exposer/mocks/{generic_restore.go => GenericRestoreExposer.go} (89%)

diff --git a/pkg/controller/data_download_controller.go b/pkg/controller/data_download_controller.go
index a691063b0..aad034b8c 100644
--- a/pkg/controller/data_download_controller.go
+++ b/pkg/controller/data_download_controller.go
@@ -677,6 +677,8 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
 		return
 	}
 
+	log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)))
+
 	r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))
 
 	log.Info("Dataupload has been cleaned up")
diff --git a/pkg/controller/data_download_controller_test.go b/pkg/controller/data_download_controller_test.go
index a675b73cd..92355384b 100644
--- a/pkg/controller/data_download_controller_test.go
+++ b/pkg/controller/data_download_controller_test.go
@@ -971,6 +971,10 @@ func (dt *ddResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
 	return nil
 }
 
+func (dt *ddResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
+	return ""
+}
+
 func (dt *ddResumeTestHelper) RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error {
 	return nil
 }
diff --git a/pkg/controller/data_upload_controller.go b/pkg/controller/data_upload_controller.go
index b0d44be5b..0c44bad66 100644
--- a/pkg/controller/data_upload_controller.go
+++ b/pkg/controller/data_upload_controller.go
@@ -755,6 +755,8 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
 		volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
 	}
 
+	log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du)))
+
 	ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)
 
 	log.Info("Dataupload has been cleaned up")
diff --git a/pkg/controller/data_upload_controller_test.go b/pkg/controller/data_upload_controller_test.go
index ac6186555..5d3e52582 100644
--- a/pkg/controller/data_upload_controller_test.go
+++ b/pkg/controller/data_upload_controller_test.go
@@ -300,6 +300,10 @@ func (f *fakeSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev
 	return f.peekErr
 }
 
+func (f *fakeSnapshotExposer) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
+	return ""
+}
+
 func (f *fakeSnapshotExposer) CleanUp(context.Context, corev1.ObjectReference, string, string) {
 }
 
@@ -1043,6 +1047,10 @@ func (dt *duResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
 	return nil
 }
 
+func (dt *duResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
+	return ""
+}
+
 func (dt *duResumeTestHelper) CleanUp(context.Context, corev1.ObjectReference, string, string) {}
 
 func (dt *duResumeTestHelper) newMicroServiceBRWatcher(kbclient.Client, kubernetes.Interface, manager.Manager, string, string, string, string, string, string,
diff --git a/pkg/exposer/csi_snapshot.go b/pkg/exposer/csi_snapshot.go
index bb421a794..85924f92f 100644
--- a/pkg/exposer/csi_snapshot.go
+++ b/pkg/exposer/csi_snapshot.go
@@ -308,6 +308,70 @@ func (e *csiSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev1
 	return nil
 }
 
+func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
+	backupPodName := ownerObject.Name
+	backupPVCName := ownerObject.Name
+	backupVSName := ownerObject.Name
+
+	diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
+
+	pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{})
+	if err != nil {
+		pod = nil
+		diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err)
+	}
+
+	pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{})
+	if err != nil {
+		pvc = nil
+		diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err)
+	}
+
+	vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{})
+	if err != nil {
+		vs = nil
+		diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err)
+	}
+
+	if pod != nil {
+		diag += kube.DiagnosePod(pod)
+
+		if pod.Spec.NodeName != "" {
+			if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
+				diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
+			}
+		}
+	}
+
+	if pvc != nil {
+		diag += kube.DiagnosePVC(pvc)
+
+		if pvc.Spec.VolumeName != "" {
+			if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
+				diag += fmt.Sprintf("error getting backup pv %s, err: %v\n", pvc.Spec.VolumeName, err)
+			} else {
+				diag += kube.DiagnosePV(pv)
+			}
+		}
+	}
+
+	if vs != nil {
+		diag += csi.DiagnoseVS(vs)
+
+		if vs.Status != nil && vs.Status.BoundVolumeSnapshotContentName != nil && *vs.Status.BoundVolumeSnapshotContentName != "" {
+			if vsc, err := e.csiSnapshotClient.VolumeSnapshotContents().Get(ctx, *vs.Status.BoundVolumeSnapshotContentName, metav1.GetOptions{}); err != nil {
+				diag += fmt.Sprintf("error getting backup vsc %s, err: %v\n", *vs.Status.BoundVolumeSnapshotContentName, err)
+			} else {
+				diag += csi.DiagnoseVSC(vsc)
+			}
+		}
+	}
+
+	diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
+
+	return diag
+}
+
 const cleanUpTimeout = time.Minute
 
 func (e *csiSnapshotExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference, vsName string, sourceNamespace string) {
diff --git a/pkg/exposer/generic_restore.go b/pkg/exposer/generic_restore.go
index 975981d49..d7d4ac235 100644
--- a/pkg/exposer/generic_restore.go
+++ b/pkg/exposer/generic_restore.go
@@ -30,6 +30,7 @@ import (
 	"k8s.io/client-go/kubernetes"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
+	"github.com/vmware-tanzu/velero/pkg/nodeagent"
 	"github.com/vmware-tanzu/velero/pkg/util/boolptr"
 	"github.com/vmware-tanzu/velero/pkg/util/kube"
 )
@@ -49,6 +50,10 @@ type GenericRestoreExposer interface {
 	// Otherwise, it returns nil immediately.
 	PeekExposed(context.Context, corev1.ObjectReference) error
 
+	// DiagnoseExpose generates the diagnostic info when the expose is not finished for a long time.
+	// If it finds any problem, it returns a string describing the problem.
+	DiagnoseExpose(context.Context, corev1.ObjectReference) string
+
 	// RebindVolume unexposes the restored PV and rebind it to the target PVC
 	RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error
 
@@ -195,6 +200,51 @@ func (e *genericRestoreExposer) PeekExposed(ctx context.Context, ownerObject cor
 	return nil
 }
 
+func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
+	restorePodName := ownerObject.Name
+	restorePVCName := ownerObject.Name
+
+	diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
+
+	pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{})
+	if err != nil {
+		pod = nil
+		diag += fmt.Sprintf("error getting restore pod %s, err: %v\n", restorePodName, err)
+	}
+
+	pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{})
+	if err != nil {
+		pvc = nil
+		diag += fmt.Sprintf("error getting restore pvc %s, err: %v\n", restorePVCName, err)
+	}
+
+	if pod != nil {
+		diag += kube.DiagnosePod(pod)
+
+		if pod.Spec.NodeName != "" {
+			if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
+				diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
+			}
+		}
+	}
+
+	if pvc != nil {
+		diag += kube.DiagnosePVC(pvc)
+
+		if pvc.Spec.VolumeName != "" {
+			if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
+				diag += fmt.Sprintf("error getting restore pv %s, err: %v\n", pvc.Spec.VolumeName, err)
+			} else {
+				diag += kube.DiagnosePV(pv)
+			}
+		}
+	}
+
+	diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
+
+	return diag
+}
+
 func (e *genericRestoreExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference) {
 	restorePodName := ownerObject.Name
 	restorePVCName := ownerObject.Name
diff --git a/pkg/exposer/mocks/generic_restore.go b/pkg/exposer/mocks/GenericRestoreExposer.go
similarity index 89%
rename from pkg/exposer/mocks/generic_restore.go
rename to pkg/exposer/mocks/GenericRestoreExposer.go
index e0b76d6e7..83a9789af 100644
--- a/pkg/exposer/mocks/generic_restore.go
+++ b/pkg/exposer/mocks/GenericRestoreExposer.go
@@ -26,6 +26,24 @@ func (_m *GenericRestoreExposer) CleanUp(_a0 context.Context, _a1 v1.ObjectRefer
 	_m.Called(_a0, _a1)
 }
 
+// DiagnoseExpose provides a mock function with given fields: _a0, _a1
+func (_m *GenericRestoreExposer) DiagnoseExpose(_a0 context.Context, _a1 v1.ObjectReference) string {
+	ret := _m.Called(_a0, _a1)
+
+	if len(ret) == 0 {
+		panic("no return value specified for DiagnoseExpose")
+	}
+
+	var r0 string
+	if rf, ok := ret.Get(0).(func(context.Context, v1.ObjectReference) string); ok {
+		r0 = rf(_a0, _a1)
+	} else {
+		r0 = ret.Get(0).(string)
+	}
+
+	return r0
+}
+
 // Expose provides a mock function with given fields: _a0, _a1, _a2, _a3, _a4, _a5, _a6
 func (_m *GenericRestoreExposer) Expose(_a0 context.Context, _a1 v1.ObjectReference, _a2 string, _a3 string, _a4 map[string]string, _a5 v1.ResourceRequirements, _a6 time.Duration) error {
 	ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5, _a6)
diff --git a/pkg/exposer/snapshot.go b/pkg/exposer/snapshot.go
index 63fee5e3a..a4a6bd7df 100644
--- a/pkg/exposer/snapshot.go
+++ b/pkg/exposer/snapshot.go
@@ -37,6 +37,10 @@ type SnapshotExposer interface {
 	// Otherwise, it returns nil immediately.
 	PeekExposed(context.Context, corev1.ObjectReference) error
 
+	// DiagnoseExpose generates the diagnostic info when the expose is not finished for a long time.
+	// If it finds any problem, it returns a string describing the problem.
+	DiagnoseExpose(context.Context, corev1.ObjectReference) string
+
 	// CleanUp cleans up any objects generated during the snapshot expose
 	CleanUp(context.Context, corev1.ObjectReference, string, string)
 }
diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go
index b83efc6f4..3fcdceeb2 100644
--- a/pkg/nodeagent/node_agent.go
+++ b/pkg/nodeagent/node_agent.go
@@ -99,8 +99,17 @@ func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace s
 	}
 }
 
-// IsRunningInNode checks if the node agent pod is running properly in a specified node. If not, return the error found
+// KbClientIsRunningInNode checks if the node agent pod is running properly in a specified node through the kube client. If not, it returns the error found
+func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName string, kubeClient kubernetes.Interface) error {
+	return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
+}
+
+// IsRunningInNode checks if the node agent pod is running properly in a specified node through the controller client. If not, it returns the error found
 func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
+	return isRunningInNode(ctx, namespace, nodeName, crClient, nil)
+}
+
+func isRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client, kubeClient kubernetes.Interface) error {
 	if nodeName == "" {
 		return errors.New("node name is empty")
 	}
@@ -111,7 +120,12 @@ func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crC
 		return errors.Wrap(err, "fail to parse selector")
 	}
 
-	err = crClient.List(ctx, pods, &ctrlclient.ListOptions{LabelSelector: parsedSelector})
+	if crClient != nil {
+		err = crClient.List(ctx, pods, &ctrlclient.ListOptions{LabelSelector: parsedSelector})
+	} else {
+		pods, err = kubeClient.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: parsedSelector.String()})
+	}
+
 	if err != nil {
 		return errors.Wrap(err, "failed to list daemonset pods")
 	}
diff --git a/pkg/util/csi/volume_snapshot.go b/pkg/util/csi/volume_snapshot.go
index 76a4d59fa..fcd683638 100644
--- a/pkg/util/csi/volume_snapshot.go
+++ b/pkg/util/csi/volume_snapshot.go
@@ -773,3 +773,45 @@ func WaitUntilVSCHandleIsReady(
 
 	return vsc, nil
 }
+
+func DiagnoseVS(vs *snapshotv1api.VolumeSnapshot) string {
+	vscName := ""
+	if vs.Status != nil && vs.Status.BoundVolumeSnapshotContentName != nil {
+		vscName = *vs.Status.BoundVolumeSnapshotContentName
+	}
+
+	readyToUse := false
+	if vs.Status != nil && vs.Status.ReadyToUse != nil {
+		readyToUse = *vs.Status.ReadyToUse
+	}
+
+	errMessage := ""
+	if vs.Status != nil && vs.Status.Error != nil && vs.Status.Error.Message != nil {
+		errMessage = *vs.Status.Error.Message
+	}
+
+	diag := fmt.Sprintf("VS %s/%s, bound to %s, readyToUse %v, errMessage %s\n", vs.Namespace, vs.Name, vscName, readyToUse, errMessage)
+
+	return diag
+}
+
+func DiagnoseVSC(vsc *snapshotv1api.VolumeSnapshotContent) string {
+	handle := ""
+	if vsc.Status != nil && vsc.Status.SnapshotHandle != nil {
+		handle = *vsc.Status.SnapshotHandle
+	}
+
+	readyToUse := false
+	if vsc.Status != nil && vsc.Status.ReadyToUse != nil {
+		readyToUse = *vsc.Status.ReadyToUse
+	}
+
+	errMessage := ""
+	if vsc.Status != nil && vsc.Status.Error != nil && vsc.Status.Error.Message != nil {
+		errMessage = *vsc.Status.Error.Message
+	}
+
+	diag := fmt.Sprintf("VSC %s, readyToUse %v, errMessage %s, handle %s\n", vsc.Name, readyToUse, errMessage, handle)
+
+	return diag
+}
diff --git a/pkg/util/kube/pod.go b/pkg/util/kube/pod.go
index 593d1541f..9f126a71a 100644
--- a/pkg/util/kube/pod.go
+++ b/pkg/util/kube/pod.go
@@ -257,3 +257,13 @@ func ToSystemAffinity(loadAffinities []*LoadAffinity) *corev1api.Affinity {
 
 	return nil
 }
+
+func DiagnosePod(pod *corev1api.Pod) string {
+	diag := fmt.Sprintf("Pod %s/%s, phase %s, node name %s\n", pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
+
+	for _, condition := range pod.Status.Conditions {
+		diag += fmt.Sprintf("Pod condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
+	}
+
+	return diag
+}
diff --git a/pkg/util/kube/pvc_pv.go b/pkg/util/kube/pvc_pv.go
index 1811a2c1d..ac7d15fbf 100644
--- a/pkg/util/kube/pvc_pv.go
+++ b/pkg/util/kube/pvc_pv.go
@@ -412,3 +412,20 @@ func GetPVCForPodVolume(vol *corev1api.Volume, pod *corev1api.Pod, crClient crcl
 
 	return pvc, nil
 }
+
+func DiagnosePVC(pvc *corev1api.PersistentVolumeClaim) string {
+	diag := fmt.Sprintf("PVC %s/%s, phase %s\n", pvc.Namespace, pvc.Name, pvc.Status.Phase)
+
+	for _, condition := range pvc.Status.Conditions {
+		diag += fmt.Sprintf("PVC condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
+	}
+
+	diag += fmt.Sprintf("PVC is bound to PV %s\n", pvc.Spec.VolumeName)
+
+	return diag
+}
+
+func DiagnosePV(pv *corev1api.PersistentVolume) string {
+	diag := fmt.Sprintf("PV %s, phase %s, reason %s, message %s\n", pv.Name, pv.Status.Phase, pv.Status.Reason, pv.Status.Message)
+	return diag
+}
diff --git a/pkg/util/kube/pvc_pv_test.go b/pkg/util/kube/pvc_pv_test.go
index 5cbe02dc0..00c3962c5 100644
--- a/pkg/util/kube/pvc_pv_test.go
+++ b/pkg/util/kube/pvc_pv_test.go
@@ -1463,3 +1463,6 @@ func TestMakePodPVCAttachment(t *testing.T) {
 		})
 	}
 }
+
+func TestDiagnosePVC(t *testing.T) {
+}