From 91ccc4adb2aac9569be6848841af171cf77b5481 Mon Sep 17 00:00:00 2001 From: Ashish Amarnath Date: Wed, 22 Jul 2020 12:07:52 -0700 Subject: [PATCH] Add metrics for restic back up operation (#2719) * add metrics for restic back up operation Signed-off-by: Ashish Amarnath * changelog Signed-off-by: Ashish Amarnath --- changelogs/unreleased/2719-ashish-amarnath | 1 + pkg/cmd/cli/restic/server.go | 29 ++++- .../pod_volume_backup_controller.go | 18 ++- .../pod_volume_backup_controller_test.go | 2 + pkg/metrics/metrics.go | 106 +++++++++++++++++- 5 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 changelogs/unreleased/2719-ashish-amarnath diff --git a/changelogs/unreleased/2719-ashish-amarnath b/changelogs/unreleased/2719-ashish-amarnath new file mode 100644 index 000000000..35329dfc3 --- /dev/null +++ b/changelogs/unreleased/2719-ashish-amarnath @@ -0,0 +1 @@ +add metrics for restic back up operation diff --git a/pkg/cmd/cli/restic/server.go b/pkg/cmd/cli/restic/server.go index 60e7ca4de..b07d6a2ac 100644 --- a/pkg/cmd/cli/restic/server.go +++ b/pkg/cmd/cli/restic/server.go @@ -18,11 +18,15 @@ package restic import ( "context" "fmt" + "net/http" "os" "strings" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/vmware-tanzu/velero/internal/util/managercontroller" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + "github.com/vmware-tanzu/velero/pkg/metrics" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -57,6 +61,11 @@ var ( scheme = runtime.NewScheme() ) +const ( + // the port where prometheus metrics are exposed + defaultMetricsAddress = ":8085" +) + func NewServerCommand(f client.Factory) *cobra.Command { logLevelFlag := logging.LogLevelFlag(logrus.InfoLevel) formatFlag := logging.NewFormatFlag() @@ -74,7 +83,7 @@ func NewServerCommand(f client.Factory) *cobra.Command { logger.Infof("Starting Velero restic server %s (%s)", buildinfo.Version, buildinfo.FormattedGitSHA()) 
f.SetBasename(fmt.Sprintf("%s-%s", c.Parent().Name(), c.Name())) - s, err := newResticServer(logger, f) + s, err := newResticServer(logger, f, defaultMetricsAddress) cmd.CheckError(err) s.run() @@ -99,9 +108,11 @@ type resticServer struct { cancelFunc context.CancelFunc fileSystem filesystem.Interface mgr manager.Manager + metrics *metrics.ServerMetrics + metricsAddress string } -func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*resticServer, error) { +func newResticServer(logger logrus.FieldLogger, factory client.Factory, metricAddress string) (*resticServer, error) { kubeClient, err := factory.KubeClient() if err != nil { @@ -169,6 +180,7 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic cancelFunc: cancelFunc, fileSystem: filesystem.NewFileSystem(), mgr: mgr, + metricsAddress: metricAddress, } if err := s.validatePodVolumesHostPath(); err != nil { @@ -181,6 +193,18 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic func (s *resticServer) run() { signals.CancelOnShutdown(s.cancelFunc, s.logger) + go func() { + metricsMux := http.NewServeMux() + metricsMux.Handle("/metrics", promhttp.Handler()) + s.logger.Infof("Starting metric server for restic at address [%s]", s.metricsAddress) + if err := http.ListenAndServe(s.metricsAddress, metricsMux); err != nil { + s.logger.Fatalf("Failed to start metric server for restic at [%s]: %v", s.metricsAddress, err) + } + }() + s.metrics = metrics.NewResticServerMetrics() + s.metrics.RegisterAllMetrics() + s.metrics.InitResticMetricsForNode(os.Getenv("NODE_NAME")) + s.logger.Info("Starting controllers") backupController := controller.NewPodVolumeBackupController( @@ -191,6 +215,7 @@ func (s *resticServer) run() { s.secretInformer, s.kubeInformerFactory.Core().V1().PersistentVolumeClaims(), s.kubeInformerFactory.Core().V1().PersistentVolumes(), + s.metrics, s.mgr.GetClient(), os.Getenv("NODE_NAME"), ) diff --git 
a/pkg/controller/pod_volume_backup_controller.go b/pkg/controller/pod_volume_backup_controller.go index 87ec35c87..76902e42e 100644 --- a/pkg/controller/pod_volume_backup_controller.go +++ b/pkg/controller/pod_volume_backup_controller.go @@ -23,6 +23,7 @@ import ( "os" "path/filepath" "strings" + "time" jsonpatch "github.com/evanphx/json-patch" "github.com/pkg/errors" @@ -40,6 +41,7 @@ import ( velerov1client "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned/typed/velero/v1" informers "github.com/vmware-tanzu/velero/pkg/generated/informers/externalversions/velero/v1" listers "github.com/vmware-tanzu/velero/pkg/generated/listers/velero/v1" + "github.com/vmware-tanzu/velero/pkg/metrics" "github.com/vmware-tanzu/velero/pkg/restic" "github.com/vmware-tanzu/velero/pkg/util/filesystem" "github.com/vmware-tanzu/velero/pkg/util/kube" @@ -58,6 +60,7 @@ type podVolumeBackupController struct { pvLister corev1listers.PersistentVolumeLister kbClient client.Client nodeName string + metrics *metrics.ServerMetrics processBackupFunc func(*velerov1api.PodVolumeBackup) error fileSystem filesystem.Interface @@ -73,6 +76,7 @@ func NewPodVolumeBackupController( secretInformer cache.SharedIndexInformer, pvcInformer corev1informers.PersistentVolumeClaimInformer, pvInformer corev1informers.PersistentVolumeInformer, + metrics *metrics.ServerMetrics, kbClient client.Client, nodeName string, ) Interface { @@ -86,6 +90,7 @@ func NewPodVolumeBackupController( pvLister: pvInformer.Lister(), kbClient: kbClient, nodeName: nodeName, + metrics: metrics, fileSystem: filesystem.NewFileSystem(), clock: &clock.RealClock{}, @@ -126,6 +131,8 @@ func (c *podVolumeBackupController) pvbHandler(obj interface{}) { return } + c.metrics.RegisterPodVolumeBackupEnqueue(c.nodeName) + log.Debug("Enqueueing") c.enqueue(obj) } @@ -174,6 +181,10 @@ func loggerForPodVolumeBackup(baseLogger logrus.FieldLogger, req *velerov1api.Po return log } +func getOwningBackup(req *velerov1api.PodVolumeBackup) string 
{ + return fmt.Sprintf("%s/%s", req.Namespace, req.OwnerReferences[0].Name) +} + func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBackup) error { log := loggerForPodVolumeBackup(c.logger, req) @@ -312,7 +323,12 @@ func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBack log.WithError(err).Error("Error setting PodVolumeBackup phase to Completed") return err } - + latencyDuration := req.Status.CompletionTimestamp.Time.Sub(req.Status.StartTimestamp.Time) + latencySeconds := float64(latencyDuration / time.Second) + backupName := getOwningBackup(req) + c.metrics.ObserveRestiOpLatency(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds) + c.metrics.RegisterResticOpLatencyGauge(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds) + c.metrics.RegisterPodVolumeBackupDequeue(c.nodeName) log.Info("Backup completed") return nil diff --git a/pkg/controller/pod_volume_backup_controller_test.go b/pkg/controller/pod_volume_backup_controller_test.go index 9b8f926bd..80f11f733 100644 --- a/pkg/controller/pod_volume_backup_controller_test.go +++ b/pkg/controller/pod_volume_backup_controller_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/require" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + "github.com/vmware-tanzu/velero/pkg/metrics" velerotest "github.com/vmware-tanzu/velero/pkg/test" ) @@ -156,6 +157,7 @@ func TestPVBHandler(t *testing.T) { c := &podVolumeBackupController{ genericController: newGenericController("pod-volume-backup", velerotest.NewLogger()), nodeName: controllerNode, + metrics: metrics.NewResticServerMetrics(), } c.pvbHandler(test.obj) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 2e18c4345..b02641799 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -28,7 +28,9 @@ type ServerMetrics struct { } const ( - metricNamespace = "velero" + metricNamespace = "velero" + resticMetricsNamespace = "restic" + //Velero 
metrics backupTarballSizeBytesGauge = "backup_tarball_size_bytes" backupTotal = "backup_total" backupAttemptTotal = "backup_attempt_total" @@ -51,8 +53,18 @@ const ( volumeSnapshotSuccessTotal = "volume_snapshot_success_total" volumeSnapshotFailureTotal = "volume_snapshot_failure_total" - scheduleLabel = "schedule" - backupNameLabel = "backupName" + // Restic metrics + podVolumeBackupEnqueueTotal = "pod_volume_backup_enqueue_count" + podVolumeBackupDequeueTotal = "pod_volume_backup_dequeue_count" + resticOperationLatencySeconds = "restic_operation_latency_seconds" + resticOperationLatencyGaugeSeconds = "restic_operation_latency_seconds_gauge" + + // Labels + nodeMetricLabel = "node" + resticOperationLabel = "operation" + pvbNameLabel = "pod_volume_backup" + scheduleLabel = "schedule" + backupNameLabel = "backupName" secondsInMinute = 60.0 ) @@ -242,6 +254,56 @@ func NewServerMetrics() *ServerMetrics { } } +func NewResticServerMetrics() *ServerMetrics { + return &ServerMetrics{ + metrics: map[string]prometheus.Collector{ + podVolumeBackupEnqueueTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: resticMetricsNamespace, + Name: podVolumeBackupEnqueueTotal, + Help: "Total number of pod_volume_backup objects enqueued", + }, + []string{nodeMetricLabel}, + ), + podVolumeBackupDequeueTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: resticMetricsNamespace, + Name: podVolumeBackupDequeueTotal, + Help: "Total number of pod_volume_backup objects dequeued", + }, + []string{nodeMetricLabel}, + ), + resticOperationLatencyGaugeSeconds: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: resticMetricsNamespace, + Name: resticOperationLatencyGaugeSeconds, + Help: "Gauge metric indicating time taken, in seconds, to perform restic operations", + }, + []string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel}, + ), + resticOperationLatencySeconds: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + 
Namespace: resticMetricsNamespace, + Name: resticOperationLatencySeconds, + Help: "Time taken to complete restic operations, in seconds", + Buckets: []float64{ + toSeconds(1 * time.Minute), + toSeconds(5 * time.Minute), + toSeconds(10 * time.Minute), + toSeconds(15 * time.Minute), + toSeconds(30 * time.Minute), + toSeconds(1 * time.Hour), + toSeconds(2 * time.Hour), + toSeconds(3 * time.Hour), + toSeconds(4 * time.Hour), + }, + }, + []string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel}, + ), + }, + } +} + // RegisterAllMetrics registers all prometheus metrics. func (m *ServerMetrics) RegisterAllMetrics() { for _, pm := range m.metrics { @@ -301,6 +363,44 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) { } } +// InitResticMetricsForNode initializes counter metrics for a node. +func (m *ServerMetrics) InitResticMetricsForNode(node string) { + if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(node).Add(0) + } + if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(node).Add(0) + } +} + +// RegisterPodVolumeBackupEnqueue records enqueuing of a PodVolumeBackup object. +func (m *ServerMetrics) RegisterPodVolumeBackupEnqueue(node string) { + if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(node).Inc() + } +} + +// RegisterPodVolumeBackupDequeue records dequeuing of a PodVolumeBackup object. +func (m *ServerMetrics) RegisterPodVolumeBackupDequeue(node string) { + if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(node).Inc() + } +} + +// ObserveRestiOpLatency records the number of seconds a restic operation took. 
+func (m *ServerMetrics) ObserveRestiOpLatency(node, pvbName, opName, backupName string, seconds float64) { + if h, ok := m.metrics[resticOperationLatencySeconds].(*prometheus.HistogramVec); ok { + h.WithLabelValues(node, opName, backupName, pvbName).Observe(seconds) + } +} + +// RegisterResticOpLatencyGauge registers the restic operation latency as a gauge metric. +func (m *ServerMetrics) RegisterResticOpLatencyGauge(node, pvbName, opName, backupName string, seconds float64) { + if g, ok := m.metrics[resticOperationLatencyGaugeSeconds].(*prometheus.GaugeVec); ok { + g.WithLabelValues(node, opName, backupName, pvbName).Set(seconds) + } +} + // SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball. func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) { if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {