mirror of
https://github.com/vmware-tanzu/velero.git
synced 2026-01-06 21:36:30 +00:00
Add metrics for restic back up operation (#2719)
* add metrics for restic back up operation Signed-off-by: Ashish Amarnath <ashisham@vmware.com> * changelog Signed-off-by: Ashish Amarnath <ashisham@vmware.com>
This commit is contained in:
1
changelogs/unreleased/2719-ashish-amarnath
Normal file
1
changelogs/unreleased/2719-ashish-amarnath
Normal file
@@ -0,0 +1 @@
|
||||
add metrics for restic back up operation
|
||||
@@ -18,11 +18,15 @@ package restic
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
|
||||
"github.com/vmware-tanzu/velero/internal/util/managercontroller"
|
||||
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
|
||||
"github.com/vmware-tanzu/velero/pkg/metrics"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sirupsen/logrus"
|
||||
@@ -57,6 +61,11 @@ var (
|
||||
scheme = runtime.NewScheme()
|
||||
)
|
||||
|
||||
const (
|
||||
// the address (host:port) on which prometheus metrics are exposed
|
||||
defaultMetricsAddress = ":8085"
|
||||
)
|
||||
|
||||
func NewServerCommand(f client.Factory) *cobra.Command {
|
||||
logLevelFlag := logging.LogLevelFlag(logrus.InfoLevel)
|
||||
formatFlag := logging.NewFormatFlag()
|
||||
@@ -74,7 +83,7 @@ func NewServerCommand(f client.Factory) *cobra.Command {
|
||||
logger.Infof("Starting Velero restic server %s (%s)", buildinfo.Version, buildinfo.FormattedGitSHA())
|
||||
|
||||
f.SetBasename(fmt.Sprintf("%s-%s", c.Parent().Name(), c.Name()))
|
||||
s, err := newResticServer(logger, f)
|
||||
s, err := newResticServer(logger, f, defaultMetricsAddress)
|
||||
cmd.CheckError(err)
|
||||
|
||||
s.run()
|
||||
@@ -99,9 +108,11 @@ type resticServer struct {
|
||||
cancelFunc context.CancelFunc
|
||||
fileSystem filesystem.Interface
|
||||
mgr manager.Manager
|
||||
metrics *metrics.ServerMetrics
|
||||
metricsAddress string
|
||||
}
|
||||
|
||||
func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*resticServer, error) {
|
||||
func newResticServer(logger logrus.FieldLogger, factory client.Factory, metricAddress string) (*resticServer, error) {
|
||||
|
||||
kubeClient, err := factory.KubeClient()
|
||||
if err != nil {
|
||||
@@ -169,6 +180,7 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic
|
||||
cancelFunc: cancelFunc,
|
||||
fileSystem: filesystem.NewFileSystem(),
|
||||
mgr: mgr,
|
||||
metricsAddress: metricAddress,
|
||||
}
|
||||
|
||||
if err := s.validatePodVolumesHostPath(); err != nil {
|
||||
@@ -181,6 +193,18 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic
|
||||
func (s *resticServer) run() {
|
||||
signals.CancelOnShutdown(s.cancelFunc, s.logger)
|
||||
|
||||
go func() {
|
||||
metricsMux := http.NewServeMux()
|
||||
metricsMux.Handle("/metrics", promhttp.Handler())
|
||||
s.logger.Infof("Starting metric server for restic at address [%s]", s.metricsAddress)
|
||||
if err := http.ListenAndServe(s.metricsAddress, metricsMux); err != nil {
|
||||
s.logger.Fatalf("Failed to start metric server for restic at [%s]: %v", s.metricsAddress, err)
|
||||
}
|
||||
}()
|
||||
s.metrics = metrics.NewResticServerMetrics()
|
||||
s.metrics.RegisterAllMetrics()
|
||||
s.metrics.InitResticMetricsForNode(os.Getenv("NODE_NAME"))
|
||||
|
||||
s.logger.Info("Starting controllers")
|
||||
|
||||
backupController := controller.NewPodVolumeBackupController(
|
||||
@@ -191,6 +215,7 @@ func (s *resticServer) run() {
|
||||
s.secretInformer,
|
||||
s.kubeInformerFactory.Core().V1().PersistentVolumeClaims(),
|
||||
s.kubeInformerFactory.Core().V1().PersistentVolumes(),
|
||||
s.metrics,
|
||||
s.mgr.GetClient(),
|
||||
os.Getenv("NODE_NAME"),
|
||||
)
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
jsonpatch "github.com/evanphx/json-patch"
|
||||
"github.com/pkg/errors"
|
||||
@@ -40,6 +41,7 @@ import (
|
||||
velerov1client "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned/typed/velero/v1"
|
||||
informers "github.com/vmware-tanzu/velero/pkg/generated/informers/externalversions/velero/v1"
|
||||
listers "github.com/vmware-tanzu/velero/pkg/generated/listers/velero/v1"
|
||||
"github.com/vmware-tanzu/velero/pkg/metrics"
|
||||
"github.com/vmware-tanzu/velero/pkg/restic"
|
||||
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
|
||||
"github.com/vmware-tanzu/velero/pkg/util/kube"
|
||||
@@ -58,6 +60,7 @@ type podVolumeBackupController struct {
|
||||
pvLister corev1listers.PersistentVolumeLister
|
||||
kbClient client.Client
|
||||
nodeName string
|
||||
metrics *metrics.ServerMetrics
|
||||
|
||||
processBackupFunc func(*velerov1api.PodVolumeBackup) error
|
||||
fileSystem filesystem.Interface
|
||||
@@ -73,6 +76,7 @@ func NewPodVolumeBackupController(
|
||||
secretInformer cache.SharedIndexInformer,
|
||||
pvcInformer corev1informers.PersistentVolumeClaimInformer,
|
||||
pvInformer corev1informers.PersistentVolumeInformer,
|
||||
metrics *metrics.ServerMetrics,
|
||||
kbClient client.Client,
|
||||
nodeName string,
|
||||
) Interface {
|
||||
@@ -86,6 +90,7 @@ func NewPodVolumeBackupController(
|
||||
pvLister: pvInformer.Lister(),
|
||||
kbClient: kbClient,
|
||||
nodeName: nodeName,
|
||||
metrics: metrics,
|
||||
|
||||
fileSystem: filesystem.NewFileSystem(),
|
||||
clock: &clock.RealClock{},
|
||||
@@ -126,6 +131,8 @@ func (c *podVolumeBackupController) pvbHandler(obj interface{}) {
|
||||
return
|
||||
}
|
||||
|
||||
c.metrics.RegisterPodVolumeBackupEnqueue(c.nodeName)
|
||||
|
||||
log.Debug("Enqueueing")
|
||||
c.enqueue(obj)
|
||||
}
|
||||
@@ -174,6 +181,10 @@ func loggerForPodVolumeBackup(baseLogger logrus.FieldLogger, req *velerov1api.Po
|
||||
return log
|
||||
}
|
||||
|
||||
func getOwningBackup(req *velerov1api.PodVolumeBackup) string {
|
||||
return fmt.Sprintf("%s/%s", req.Namespace, req.OwnerReferences[0].Name)
|
||||
}
|
||||
|
||||
func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBackup) error {
|
||||
log := loggerForPodVolumeBackup(c.logger, req)
|
||||
|
||||
@@ -312,7 +323,12 @@ func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBack
|
||||
log.WithError(err).Error("Error setting PodVolumeBackup phase to Completed")
|
||||
return err
|
||||
}
|
||||
|
||||
latencyDuration := req.Status.CompletionTimestamp.Time.Sub(req.Status.StartTimestamp.Time)
|
||||
latencySeconds := float64(latencyDuration / time.Second)
|
||||
backupName := getOwningBackup(req)
|
||||
c.metrics.ObserveRestiOpLatency(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds)
|
||||
c.metrics.RegisterResticOpLatencyGauge(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds)
|
||||
c.metrics.RegisterPodVolumeBackupDequeue(c.nodeName)
|
||||
log.Info("Backup completed")
|
||||
|
||||
return nil
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
|
||||
"github.com/vmware-tanzu/velero/pkg/metrics"
|
||||
velerotest "github.com/vmware-tanzu/velero/pkg/test"
|
||||
)
|
||||
|
||||
@@ -156,6 +157,7 @@ func TestPVBHandler(t *testing.T) {
|
||||
c := &podVolumeBackupController{
|
||||
genericController: newGenericController("pod-volume-backup", velerotest.NewLogger()),
|
||||
nodeName: controllerNode,
|
||||
metrics: metrics.NewResticServerMetrics(),
|
||||
}
|
||||
|
||||
c.pvbHandler(test.obj)
|
||||
|
||||
@@ -28,7 +28,9 @@ type ServerMetrics struct {
|
||||
}
|
||||
|
||||
const (
|
||||
metricNamespace = "velero"
|
||||
metricNamespace = "velero"
|
||||
resticMetricsNamespace = "restic"
|
||||
//Velero metrics
|
||||
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
|
||||
backupTotal = "backup_total"
|
||||
backupAttemptTotal = "backup_attempt_total"
|
||||
@@ -51,8 +53,18 @@ const (
|
||||
volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
|
||||
volumeSnapshotFailureTotal = "volume_snapshot_failure_total"
|
||||
|
||||
scheduleLabel = "schedule"
|
||||
backupNameLabel = "backupName"
|
||||
// Restic metrics
|
||||
podVolumeBackupEnqueueTotal = "pod_volume_backup_enqueue_count"
|
||||
podVolumeBackupDequeueTotal = "pod_volume_backup_dequeue_count"
|
||||
resticOperationLatencySeconds = "restic_operation_latency_seconds"
|
||||
resticOperationLatencyGaugeSeconds = "restic_operation_latency_seconds_gauge"
|
||||
|
||||
// Labels
|
||||
nodeMetricLabel = "node"
|
||||
resticOperationLabel = "operation"
|
||||
pvbNameLabel = "pod_volume_backup"
|
||||
scheduleLabel = "schedule"
|
||||
backupNameLabel = "backupName"
|
||||
|
||||
secondsInMinute = 60.0
|
||||
)
|
||||
@@ -242,6 +254,56 @@ func NewServerMetrics() *ServerMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
func NewResticServerMetrics() *ServerMetrics {
|
||||
return &ServerMetrics{
|
||||
metrics: map[string]prometheus.Collector{
|
||||
podVolumeBackupEnqueueTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: resticMetricsNamespace,
|
||||
Name: podVolumeBackupEnqueueTotal,
|
||||
Help: "Total number of pod_volume_backup objects enqueued",
|
||||
},
|
||||
[]string{nodeMetricLabel},
|
||||
),
|
||||
podVolumeBackupDequeueTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: resticMetricsNamespace,
|
||||
Name: podVolumeBackupDequeueTotal,
|
||||
Help: "Total number of pod_volume_backup objects dequeued",
|
||||
},
|
||||
[]string{nodeMetricLabel},
|
||||
),
|
||||
resticOperationLatencyGaugeSeconds: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: resticMetricsNamespace,
|
||||
Name: resticOperationLatencyGaugeSeconds,
|
||||
Help: "Gauge metric indicating time taken, in seconds, to perform restic operations",
|
||||
},
|
||||
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
|
||||
),
|
||||
resticOperationLatencySeconds: prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: resticMetricsNamespace,
|
||||
Name: resticOperationLatencySeconds,
|
||||
Help: "Time taken to complete restic operations, in seconds",
|
||||
Buckets: []float64{
|
||||
toSeconds(1 * time.Minute),
|
||||
toSeconds(5 * time.Minute),
|
||||
toSeconds(10 * time.Minute),
|
||||
toSeconds(15 * time.Minute),
|
||||
toSeconds(30 * time.Minute),
|
||||
toSeconds(1 * time.Hour),
|
||||
toSeconds(2 * time.Hour),
|
||||
toSeconds(3 * time.Hour),
|
||||
toSeconds(4 * time.Hour),
|
||||
},
|
||||
},
|
||||
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterAllMetrics registers all prometheus metrics.
|
||||
func (m *ServerMetrics) RegisterAllMetrics() {
|
||||
for _, pm := range m.metrics {
|
||||
@@ -301,6 +363,44 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
|
||||
}
|
||||
}
|
||||
|
||||
// InitResticMetricsForNode initializes counter metrics for a node.
|
||||
func (m *ServerMetrics) InitResticMetricsForNode(node string) {
|
||||
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(node).Add(0)
|
||||
}
|
||||
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(node).Add(0)
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterPodVolumeBackupEnqueue records enqueuing of a PodVolumeBackup object.
|
||||
func (m *ServerMetrics) RegisterPodVolumeBackupEnqueue(node string) {
|
||||
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(node).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterPodVolumeBackupDequeue records dequeuing of a PodVolumeBackup object.
|
||||
func (m *ServerMetrics) RegisterPodVolumeBackupDequeue(node string) {
|
||||
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(node).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// ObserveRestiOpLatency records the number of seconds a restic operation took.
|
||||
func (m *ServerMetrics) ObserveRestiOpLatency(node, pvbName, opName, backupName string, seconds float64) {
|
||||
if h, ok := m.metrics[resticOperationLatencySeconds].(*prometheus.HistogramVec); ok {
|
||||
h.WithLabelValues(node, opName, backupName, pvbName).Observe(seconds)
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterResticOpLatencyGauge registers the restic operation latency as a gauge metric.
|
||||
func (m *ServerMetrics) RegisterResticOpLatencyGauge(node, pvbName, opName, backupName string, seconds float64) {
|
||||
if g, ok := m.metrics[resticOperationLatencyGaugeSeconds].(*prometheus.GaugeVec); ok {
|
||||
g.WithLabelValues(node, opName, backupName, pvbName).Set(seconds)
|
||||
}
|
||||
}
|
||||
|
||||
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
|
||||
func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) {
|
||||
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {
|
||||
|
||||
Reference in New Issue
Block a user