Add metrics for restic back up operation (#2719)

* add metrics for restic back up operation

Signed-off-by: Ashish Amarnath <ashisham@vmware.com>

* changelog

Signed-off-by: Ashish Amarnath <ashisham@vmware.com>
This commit is contained in:
Ashish Amarnath
2020-07-22 12:07:52 -07:00
committed by GitHub
parent a63a82fcb0
commit 91ccc4adb2
5 changed files with 150 additions and 6 deletions

View File

@@ -0,0 +1 @@
add metrics for restic back up operation

View File

@@ -18,11 +18,15 @@ package restic
import (
"context"
"fmt"
"net/http"
"os"
"strings"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/vmware-tanzu/velero/internal/util/managercontroller"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@@ -57,6 +61,11 @@ var (
scheme = runtime.NewScheme()
)
const (
// the port where prometheus metrics are exposed
defaultMetricsAddress = ":8085"
)
func NewServerCommand(f client.Factory) *cobra.Command {
logLevelFlag := logging.LogLevelFlag(logrus.InfoLevel)
formatFlag := logging.NewFormatFlag()
@@ -74,7 +83,7 @@ func NewServerCommand(f client.Factory) *cobra.Command {
logger.Infof("Starting Velero restic server %s (%s)", buildinfo.Version, buildinfo.FormattedGitSHA())
f.SetBasename(fmt.Sprintf("%s-%s", c.Parent().Name(), c.Name()))
s, err := newResticServer(logger, f)
s, err := newResticServer(logger, f, defaultMetricsAddress)
cmd.CheckError(err)
s.run()
@@ -99,9 +108,11 @@ type resticServer struct {
cancelFunc context.CancelFunc
fileSystem filesystem.Interface
mgr manager.Manager
metrics *metrics.ServerMetrics
metricsAddress string
}
func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*resticServer, error) {
func newResticServer(logger logrus.FieldLogger, factory client.Factory, metricAddress string) (*resticServer, error) {
kubeClient, err := factory.KubeClient()
if err != nil {
@@ -169,6 +180,7 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic
cancelFunc: cancelFunc,
fileSystem: filesystem.NewFileSystem(),
mgr: mgr,
metricsAddress: metricAddress,
}
if err := s.validatePodVolumesHostPath(); err != nil {
@@ -181,6 +193,18 @@ func newResticServer(logger logrus.FieldLogger, factory client.Factory) (*restic
func (s *resticServer) run() {
signals.CancelOnShutdown(s.cancelFunc, s.logger)
go func() {
metricsMux := http.NewServeMux()
metricsMux.Handle("/metrics", promhttp.Handler())
s.logger.Infof("Starting metric server for restic at address [%s]", s.metricsAddress)
if err := http.ListenAndServe(s.metricsAddress, metricsMux); err != nil {
s.logger.Fatalf("Failed to start metric server for restic at [%s]: %v", s.metricsAddress, err)
}
}()
s.metrics = metrics.NewResticServerMetrics()
s.metrics.RegisterAllMetrics()
s.metrics.InitResticMetricsForNode(os.Getenv("NODE_NAME"))
s.logger.Info("Starting controllers")
backupController := controller.NewPodVolumeBackupController(
@@ -191,6 +215,7 @@ func (s *resticServer) run() {
s.secretInformer,
s.kubeInformerFactory.Core().V1().PersistentVolumeClaims(),
s.kubeInformerFactory.Core().V1().PersistentVolumes(),
s.metrics,
s.mgr.GetClient(),
os.Getenv("NODE_NAME"),
)

View File

@@ -23,6 +23,7 @@ import (
"os"
"path/filepath"
"strings"
"time"
jsonpatch "github.com/evanphx/json-patch"
"github.com/pkg/errors"
@@ -40,6 +41,7 @@ import (
velerov1client "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned/typed/velero/v1"
informers "github.com/vmware-tanzu/velero/pkg/generated/informers/externalversions/velero/v1"
listers "github.com/vmware-tanzu/velero/pkg/generated/listers/velero/v1"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/restic"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
@@ -58,6 +60,7 @@ type podVolumeBackupController struct {
pvLister corev1listers.PersistentVolumeLister
kbClient client.Client
nodeName string
metrics *metrics.ServerMetrics
processBackupFunc func(*velerov1api.PodVolumeBackup) error
fileSystem filesystem.Interface
@@ -73,6 +76,7 @@ func NewPodVolumeBackupController(
secretInformer cache.SharedIndexInformer,
pvcInformer corev1informers.PersistentVolumeClaimInformer,
pvInformer corev1informers.PersistentVolumeInformer,
metrics *metrics.ServerMetrics,
kbClient client.Client,
nodeName string,
) Interface {
@@ -86,6 +90,7 @@ func NewPodVolumeBackupController(
pvLister: pvInformer.Lister(),
kbClient: kbClient,
nodeName: nodeName,
metrics: metrics,
fileSystem: filesystem.NewFileSystem(),
clock: &clock.RealClock{},
@@ -126,6 +131,8 @@ func (c *podVolumeBackupController) pvbHandler(obj interface{}) {
return
}
c.metrics.RegisterPodVolumeBackupEnqueue(c.nodeName)
log.Debug("Enqueueing")
c.enqueue(obj)
}
@@ -174,6 +181,10 @@ func loggerForPodVolumeBackup(baseLogger logrus.FieldLogger, req *velerov1api.Po
return log
}
func getOwningBackup(req *velerov1api.PodVolumeBackup) string {
return fmt.Sprintf("%s/%s", req.Namespace, req.OwnerReferences[0].Name)
}
func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBackup) error {
log := loggerForPodVolumeBackup(c.logger, req)
@@ -312,7 +323,12 @@ func (c *podVolumeBackupController) processBackup(req *velerov1api.PodVolumeBack
log.WithError(err).Error("Error setting PodVolumeBackup phase to Completed")
return err
}
latencyDuration := req.Status.CompletionTimestamp.Time.Sub(req.Status.StartTimestamp.Time)
latencySeconds := float64(latencyDuration / time.Second)
backupName := getOwningBackup(req)
c.metrics.ObserveRestiOpLatency(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds)
c.metrics.RegisterResticOpLatencyGauge(c.nodeName, req.Name, resticCmd.Command, backupName, latencySeconds)
c.metrics.RegisterPodVolumeBackupDequeue(c.nodeName)
log.Info("Backup completed")
return nil

View File

@@ -23,6 +23,7 @@ import (
"github.com/stretchr/testify/require"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/metrics"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
)
@@ -156,6 +157,7 @@ func TestPVBHandler(t *testing.T) {
c := &podVolumeBackupController{
genericController: newGenericController("pod-volume-backup", velerotest.NewLogger()),
nodeName: controllerNode,
metrics: metrics.NewResticServerMetrics(),
}
c.pvbHandler(test.obj)

View File

@@ -28,7 +28,9 @@ type ServerMetrics struct {
}
const (
metricNamespace = "velero"
metricNamespace = "velero"
resticMetricsNamespace = "restic"
//Velero metrics
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
backupTotal = "backup_total"
backupAttemptTotal = "backup_attempt_total"
@@ -51,8 +53,18 @@ const (
volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
volumeSnapshotFailureTotal = "volume_snapshot_failure_total"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
// Restic metrics
podVolumeBackupEnqueueTotal = "pod_volume_backup_enqueue_count"
podVolumeBackupDequeueTotal = "pod_volume_backup_dequeue_count"
resticOperationLatencySeconds = "restic_operation_latency_seconds"
resticOperationLatencyGaugeSeconds = "restic_operation_latency_seconds_gauge"
// Labels
nodeMetricLabel = "node"
resticOperationLabel = "operation"
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
secondsInMinute = 60.0
)
@@ -242,6 +254,56 @@ func NewServerMetrics() *ServerMetrics {
}
}
func NewResticServerMetrics() *ServerMetrics {
return &ServerMetrics{
metrics: map[string]prometheus.Collector{
podVolumeBackupEnqueueTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: resticMetricsNamespace,
Name: podVolumeBackupEnqueueTotal,
Help: "Total number of pod_volume_backup objects enqueued",
},
[]string{nodeMetricLabel},
),
podVolumeBackupDequeueTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: resticMetricsNamespace,
Name: podVolumeBackupDequeueTotal,
Help: "Total number of pod_volume_backup objects dequeued",
},
[]string{nodeMetricLabel},
),
resticOperationLatencyGaugeSeconds: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: resticMetricsNamespace,
Name: resticOperationLatencyGaugeSeconds,
Help: "Gauge metric indicating time taken, in seconds, to perform restic operations",
},
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
),
resticOperationLatencySeconds: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: resticMetricsNamespace,
Name: resticOperationLatencySeconds,
Help: "Time taken to complete restic operations, in seconds",
Buckets: []float64{
toSeconds(1 * time.Minute),
toSeconds(5 * time.Minute),
toSeconds(10 * time.Minute),
toSeconds(15 * time.Minute),
toSeconds(30 * time.Minute),
toSeconds(1 * time.Hour),
toSeconds(2 * time.Hour),
toSeconds(3 * time.Hour),
toSeconds(4 * time.Hour),
},
},
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
),
},
}
}
// RegisterAllMetrics registers all prometheus metrics.
func (m *ServerMetrics) RegisterAllMetrics() {
for _, pm := range m.metrics {
@@ -301,6 +363,44 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
}
}
// InitSchedule initializes counter metrics for a node.
func (m *ServerMetrics) InitResticMetricsForNode(node string) {
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(node).Add(0)
}
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(node).Add(0)
}
}
// RegisterPodVolumeBackupEnqueue records enqueuing of a PodVolumeBackup object.
func (m *ServerMetrics) RegisterPodVolumeBackupEnqueue(node string) {
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(node).Inc()
}
}
// RegisterPodVolumeBackupDequeue records dequeuing of a PodVolumeBackup object.
func (m *ServerMetrics) RegisterPodVolumeBackupDequeue(node string) {
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(node).Inc()
}
}
// ObserveRestiOpLatency records the number of seconds a restic operation took.
func (m *ServerMetrics) ObserveRestiOpLatency(node, pvbName, opName, backupName string, seconds float64) {
if h, ok := m.metrics[resticOperationLatencySeconds].(*prometheus.HistogramVec); ok {
h.WithLabelValues(node, opName, backupName, pvbName).Observe(seconds)
}
}
// RegisterResticOpLatencyGauge registers the restic operation latency as a gauge metric.
func (m *ServerMetrics) RegisterResticOpLatencyGauge(node, pvbName, opName, backupName string, seconds float64) {
if g, ok := m.metrics[resticOperationLatencyGaugeSeconds].(*prometheus.GaugeVec); ok {
g.WithLabelValues(node, opName, backupName, pvbName).Set(seconds)
}
}
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) {
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {