From 3f3aaa7cc873154e54dec0935a64ebd870189d92 Mon Sep 17 00:00:00 2001 From: Lisandro Pin Date: Tue, 28 Apr 2026 21:34:02 +0200 Subject: [PATCH] Export Prometheus metrics for scrubbing operations. (#9264) This PR introduces three new metrics... - `scrub_last_time_seconds` - `scrub_volume_failures` - `scrub_shard_failures` ...capturing overall volume scrub results, and making it possible to build alerts and dashboards to monitor scrubbing progress. Note that these metrics are aggregated at the volume/EC shard level, and are not intended for fine-grained tracking of scrubbing operations. --- weed/server/volume_grpc_scrub.go | 13 +++++++++++++ weed/stats/metrics.go | 27 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/weed/server/volume_grpc_scrub.go b/weed/server/volume_grpc_scrub.go index 8c6a235b9..68acc5616 100644 --- a/weed/server/volume_grpc_scrub.go +++ b/weed/server/volume_grpc_scrub.go @@ -4,8 +4,11 @@ import ( "context" "errors" "fmt" + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/stats" "github.com/seaweedfs/seaweedfs/weed/storage" "github.com/seaweedfs/seaweedfs/weed/storage/needle" ) @@ -68,6 +71,11 @@ func (vs *VolumeServer) ScrubVolume(ctx context.Context, req *volume_server_pb.S } } } + + scrubLabels := prometheus.Labels{"mode": req.GetMode().String()} + stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix())) + stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumes))) + if len(errs) != 0 { return nil, errors.Join(errs...) 
} @@ -129,6 +137,11 @@ func (vs *VolumeServer) ScrubEcVolume(ctx context.Context, req *volume_server_pb } } + scrubLabels := prometheus.Labels{"mode": req.GetMode().String()} + stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix())) + stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumeIds))) + stats.VolumeServerScrubShardFailures.With(scrubLabels).Add(float64(len(brokenShardInfos))) + res := &volume_server_pb.ScrubEcVolumeResponse{ TotalVolumes: totalVolumes, TotalFiles: totalFiles, diff --git a/weed/stats/metrics.go b/weed/stats/metrics.go index e943f5726..a66c81e88 100644 --- a/weed/stats/metrics.go +++ b/weed/stats/metrics.go @@ -394,6 +394,30 @@ var ( Help: "Counter of overall failed file write requests from clients.", }) + VolumeServerScrubLastTimeSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "volumeServer", + Name: "scrub_last_time_seconds", + Help: "Last scrub execution time, as seconds since UNIX epoch.", + }, []string{"mode"}) + + VolumeServerScrubVolumeFailures = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "volumeServer", + Name: "scrub_volume_failures", + Help: "Counter of overall volumes with issues detected during scrubbing.", + }, []string{"mode"}) + + VolumeServerScrubShardFailures = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "volumeServer", + Name: "scrub_shard_failures", + Help: "Counter of overall EC shards with issues detected during scrubbing.", + }, []string{"mode"}) + S3RequestCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: Namespace, @@ -564,6 +588,9 @@ func init() { Gather.MustRegister(VolumeServerFileReadFailures) Gather.MustRegister(VolumeServerFileReadInvalidNeedles) Gather.MustRegister(VolumeServerFileWriteFailures) + Gather.MustRegister(VolumeServerScrubLastTimeSeconds) + 
Gather.MustRegister(VolumeServerScrubVolumeFailures) + Gather.MustRegister(VolumeServerScrubShardFailures) Gather.MustRegister(S3RequestCounter) Gather.MustRegister(S3HandlerCounter)