Add CSI VolumeSnapshot related metrics.

Signed-off-by: Xun Jiang <jxun@vmware.com>
This commit is contained in:
Xun Jiang
2022-04-09 00:57:28 +08:00
parent 3b75ae8ccc
commit 368a1ddf3c
5 changed files with 143 additions and 2 deletions

View File

@@ -17,6 +17,7 @@ limitations under the License.
package v1
import (
resource "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -310,6 +311,21 @@ type BackupStatus struct {
// +optional
// +nullable
Progress *BackupProgress `json:"progress,omitempty"`
// CsiVolumeSnapshotsAttempted is the total number of attempted
// CSI VolumeSnapshots for this backup.
// +optional
CsiVolumeSnapshotsAttempted int `json:"csiVolumeSnapshotsAttempted,omitempty"`
// CsiVolumeSnapshotsCompleted is the total number of successfully
// completed CSI VolumeSnapshots for this backup.
// +optional
CsiVolumeSnapshotsCompleted int `json:"csiVolumeSnapshotsCompleted,omitempty"`
// CsiVolumeSnapshotsStorageTotal is the total storage size of created
// snapshots for this backup.
// +optional
CsiVolumeSnapshotsStorageTotal resource.Quantity `json:"csiVolumeSnapshotsStorageTotal,omitempty"`
}
// BackupProgress stores information about the progress of a Backup's execution.

View File

@@ -20,6 +20,7 @@ import (
"fmt"
"sort"
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v4/apis/volumesnapshot/v1"
"github.com/vmware-tanzu/velero/internal/hook"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/plugin/framework"
@@ -48,6 +49,7 @@ type Request struct {
VolumeSnapshots []*volume.Snapshot
PodVolumeBackups []*velerov1api.PodVolumeBackup
BackedUpItems map[itemKey]struct{}
CsiSnapshots []*snapshotv1api.VolumeSnapshot
}
// BackupResourceList returns the list of backed up resources grouped by the API

View File

@@ -636,6 +636,14 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
}
}
// Tally the CSI VolumeSnapshots taken for this backup: every entry counts as
// attempted; only those reporting ReadyToUse count as completed and
// contribute their RestoreSize to the storage total.
backup.Status.CsiVolumeSnapshotsAttempted = len(backup.CsiSnapshots)
for _, vs := range backup.CsiSnapshots {
	// Status, ReadyToUse and RestoreSize are optional pointers in the
	// VolumeSnapshot API; treat a missing value as "not completed" rather
	// than dereferencing nil and panicking the controller.
	if vs == nil || vs.Status == nil || vs.Status.ReadyToUse == nil || !*vs.Status.ReadyToUse {
		continue
	}
	backup.Status.CsiVolumeSnapshotsCompleted++
	// A ready snapshot may still lack a reported size; count it as completed
	// but add nothing to the storage total.
	if vs.Status.RestoreSize != nil {
		backup.Status.CsiVolumeSnapshotsStorageTotal.Add(*vs.Status.RestoreSize)
	}
}
backup.Status.Warnings = logCounter.GetCount(logrus.WarnLevel)
backup.Status.Errors = logCounter.GetCount(logrus.ErrorLevel)
@@ -694,6 +702,19 @@ func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, bac
serverMetrics.RegisterVolumeSnapshotAttempts(backupScheduleName, backup.Status.VolumeSnapshotsAttempted)
serverMetrics.RegisterVolumeSnapshotSuccesses(backupScheduleName, backup.Status.VolumeSnapshotsCompleted)
serverMetrics.RegisterVolumeSnapshotFailures(backupScheduleName, backup.Status.VolumeSnapshotsAttempted-backup.Status.VolumeSnapshotsCompleted)
// CSI snapshot metrics are only reported when the CSI feature flag is enabled.
if features.IsEnabled(velerov1api.CSIFeatureFlag) {
	serverMetrics.RegisterCsiSnapshotAttempts(backupScheduleName, backup.Name, backup.Status.CsiVolumeSnapshotsAttempted)
	serverMetrics.RegisterCsiSnapshotSuccesses(backupScheduleName, backup.Name, backup.Status.CsiVolumeSnapshotsCompleted)
	// Failures are derived: whatever was attempted but did not complete.
	serverMetrics.RegisterCsiSnapshotFailures(backupScheduleName, backup.Name, backup.Status.CsiVolumeSnapshotsAttempted-backup.Status.CsiVolumeSnapshotsCompleted)

	storageSize, ok := backup.Status.CsiVolumeSnapshotsStorageTotal.AsInt64()
	if !ok {
		// Actually emit the message: WithError alone only builds a log entry
		// and writes nothing. String() renders the quantity in its canonical
		// form instead of dumping the struct internals.
		log.Warnf("fail to convert CSI snapshot size: %s to int64", backup.Status.CsiVolumeSnapshotsStorageTotal.String())
		storageSize = 0
	}
	serverMetrics.RegisterCsiStorageSizeAdd(backupScheduleName, backup.Name, storageSize)
}
if backup.Status.Progress != nil {
serverMetrics.RegisterBackupItemsTotalGauge(backupScheduleName, backup.Status.Progress.TotalItems)
}

View File

@@ -28,6 +28,7 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
@@ -39,6 +40,7 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
pkgbackup "github.com/vmware-tanzu/velero/pkg/backup"
"github.com/vmware-tanzu/velero/pkg/discovery"
"github.com/vmware-tanzu/velero/pkg/features"
velerov1client "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned/typed/velero/v1"
velerov1informers "github.com/vmware-tanzu/velero/pkg/generated/informers/externalversions/velero/v1"
velerov1listers "github.com/vmware-tanzu/velero/pkg/generated/listers/velero/v1"
@@ -407,6 +409,25 @@ func (c *backupDeletionController) processRequest(req *velerov1api.DeleteBackupR
c.metrics.RegisterBackupDeletionFailed(backupScheduleName)
}
// When the CSI feature is enabled, subtract the storage held by this backup's
// CSI VolumeSnapshots from the storage gauge.
if features.IsEnabled(velerov1api.CSIFeatureFlag) {
	vss, err := backupStore.GetCSIVolumeSnapshots(backup.Name)
	if err != nil {
		// Record the failure on the request but keep going with whatever
		// snapshots were returned.
		errs = append(errs, err.Error())
	}

	var restoreSizeTotal resource.Quantity
	for _, vs := range vss {
		// Status and RestoreSize are optional pointers in the VolumeSnapshot
		// API; skip snapshots without a reported size instead of
		// dereferencing nil.
		if vs.Status == nil || vs.Status.RestoreSize == nil {
			continue
		}
		restoreSizeTotal.Add(*vs.Status.RestoreSize)
	}

	storageSize, ok := restoreSizeTotal.AsInt64()
	if !ok {
		// Emit the message (WithError alone logs nothing) and report the
		// value that actually failed to convert, i.e. the locally computed
		// total rather than the backup's stored status field.
		log.Warnf("fail to convert CSI snapshot size: %s to int64", restoreSizeTotal.String())
		storageSize = 0
	}
	c.metrics.RegisterCsiStorageSizeSub(backupScheduleName, backup.Name, storageSize)
}
// Update status to processed and record errors
req, err = c.patchDeleteBackupRequest(req, func(r *velerov1api.DeleteBackupRequest) {
r.Status.Phase = velerov1api.DeleteBackupRequestPhaseProcessed

View File

@@ -54,6 +54,10 @@ const (
volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
volumeSnapshotFailureTotal = "volume_snapshot_failure_total"
csiSnapshotAttemptTotal = "csi_snapshot_attempt_total"
csiSnapshotSuccessTotal = "csi_snapshot_success_total"
csiSnapshotFailureTotal = "csi_snapshot_failure_total"
csiSnapshotStorageTotal = "csi_snapshot_storage_total"
// Restic metrics
podVolumeBackupEnqueueTotal = "pod_volume_backup_enqueue_count"
@@ -67,8 +71,6 @@ const (
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
secondsInMinute = 60.0
)
// NewServerMetrics returns new ServerMetrics
@@ -268,6 +270,38 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
csiSnapshotAttemptTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: csiSnapshotAttemptTotal,
Help: "Total number of CSI attempted volume snapshots",
},
[]string{scheduleLabel, backupNameLabel},
),
csiSnapshotSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: csiSnapshotSuccessTotal,
Help: "Total number of CSI successful volume snapshots",
},
[]string{scheduleLabel, backupNameLabel},
),
csiSnapshotFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: csiSnapshotFailureTotal,
Help: "Total number of CSI failed volume snapshots",
},
[]string{scheduleLabel, backupNameLabel},
),
csiSnapshotStorageTotal: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricNamespace,
Name: csiSnapshotStorageTotal,
Help: "Total size of CSI volume snapshots storage size",
},
[]string{scheduleLabel, backupNameLabel},
),
},
}
}
@@ -385,6 +419,18 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
// The CSI metrics are declared with two labels (schedule and backup name), so
// both values must be supplied: WithLabelValues panics when the number of
// values does not match the number of declared labels. No backup exists yet
// at schedule-initialization time, so the backup name is left empty.
if c, ok := m.metrics[csiSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
	c.WithLabelValues(scheduleName, "").Add(0)
}
if c, ok := m.metrics[csiSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
	c.WithLabelValues(scheduleName, "").Add(0)
}
if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok {
	c.WithLabelValues(scheduleName, "").Add(0)
}
if c, ok := m.metrics[csiSnapshotStorageTotal].(*prometheus.GaugeVec); ok {
	c.WithLabelValues(scheduleName, "").Add(0)
}
}
// InitSchedule initializes counter metrics for a node.
@@ -593,3 +639,38 @@ func (m *ServerMetrics) RegisterVolumeSnapshotFailures(backupSchedule string, vo
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsFailed))
}
}
// RegisterCsiSnapshotAttempts adds csiSnapshotsAttempted to the CSI snapshot
// attempt counter for the given schedule and backup name. It does nothing if
// that metric is not registered as a CounterVec.
func (m *ServerMetrics) RegisterCsiSnapshotAttempts(backupSchedule, backupName string, csiSnapshotsAttempted int) {
	counter, found := m.metrics[csiSnapshotAttemptTotal].(*prometheus.CounterVec)
	if !found {
		return
	}
	attempts := float64(csiSnapshotsAttempted)
	counter.WithLabelValues(backupSchedule, backupName).Add(attempts)
}
// RegisterCsiSnapshotSuccesses adds csiSnapshotCompleted to the CSI snapshot
// success counter for the given schedule and backup name. It does nothing if
// that metric is not registered as a CounterVec.
func (m *ServerMetrics) RegisterCsiSnapshotSuccesses(backupSchedule, backupName string, csiSnapshotCompleted int) {
	counter, found := m.metrics[csiSnapshotSuccessTotal].(*prometheus.CounterVec)
	if !found {
		return
	}
	completed := float64(csiSnapshotCompleted)
	counter.WithLabelValues(backupSchedule, backupName).Add(completed)
}
// RegisterCsiSnapshotFailures adds csiSnapshotsFailed to the CSI snapshot
// failure counter for the given schedule and backup name. It does nothing if
// that metric is not registered as a CounterVec.
func (m *ServerMetrics) RegisterCsiSnapshotFailures(backupSchedule, backupName string, csiSnapshotsFailed int) {
	counter, found := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec)
	if !found {
		return
	}
	failed := float64(csiSnapshotsFailed)
	counter.WithLabelValues(backupSchedule, backupName).Add(failed)
}
// RegisterCsiStorageSizeAdd increases the CSI snapshot storage gauge for the
// given schedule and backup name by csiStorageSize. It does nothing if that
// metric is not registered as a GaugeVec.
func (m *ServerMetrics) RegisterCsiStorageSizeAdd(backupSchedule, backupName string, csiStorageSize int64) {
	gauge, found := m.metrics[csiSnapshotStorageTotal].(*prometheus.GaugeVec)
	if !found {
		return
	}
	delta := float64(csiStorageSize)
	gauge.WithLabelValues(backupSchedule, backupName).Add(delta)
}
// RegisterCsiStorageSizeSub decreases the CSI snapshot storage gauge for the
// given schedule and backup name by csiStorageSize. It does nothing if that
// metric is not registered as a GaugeVec.
func (m *ServerMetrics) RegisterCsiStorageSizeSub(backupSchedule, backupName string, csiStorageSize int64) {
	gauge, found := m.metrics[csiSnapshotStorageTotal].(*prometheus.GaugeVec)
	if !found {
		return
	}
	delta := float64(csiStorageSize)
	gauge.WithLabelValues(backupSchedule, backupName).Sub(delta)
}