Export Prometheus metrics for scrubbing operations. (#9264)

This PR introduces three new metrics...

  - `scrub_last_time_seconds`
  - `scrub_volume_failures`
  - `scrub_shard_failures`

...capturing overall volume scrub results, and allowing alerts and dashboards
to be constructed to monitor scrubbing progress.

Note that these metrics are aggregated at the volume/EC shard level, and are
not intended for fine-grained tracking of scrubbing operations.
This commit is contained in:
Lisandro Pin
2026-04-28 21:34:02 +02:00
committed by GitHub
parent 294f7c3d04
commit 3f3aaa7cc8
2 changed files with 40 additions and 0 deletions

View File

@@ -4,8 +4,11 @@ import (
"context"
"errors"
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
)
@@ -68,6 +71,11 @@ func (vs *VolumeServer) ScrubVolume(ctx context.Context, req *volume_server_pb.S
}
}
}
scrubLabels := prometheus.Labels{"mode": req.GetMode().String()}
stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix()))
stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumes)))
if len(errs) != 0 {
return nil, errors.Join(errs...)
}
@@ -129,6 +137,11 @@ func (vs *VolumeServer) ScrubEcVolume(ctx context.Context, req *volume_server_pb
}
}
scrubLabels := prometheus.Labels{"mode": req.GetMode().String()}
stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix()))
stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumeIds)))
stats.VolumeServerScrubShardFailures.With(scrubLabels).Add(float64(len(brokenShardInfos)))
res := &volume_server_pb.ScrubEcVolumeResponse{
TotalVolumes: totalVolumes,
TotalFiles: totalFiles,

View File

@@ -394,6 +394,30 @@ var (
Help: "Counter of overall failed file write requests from clients.",
})
// VolumeServerScrubLastTimeSeconds is a gauge vector, partitioned by the
// "mode" label, holding the time of the last scrub execution expressed as
// seconds since the UNIX epoch.
VolumeServerScrubLastTimeSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "scrub_last_time_seconds",
Help: "Last scrub execution time, as seconds since UNIX epoch.",
}, []string{"mode"})
// VolumeServerScrubVolumeFailures is a counter vector, partitioned by the
// "mode" label, accumulating the number of volumes found to have issues
// during scrubbing.
VolumeServerScrubVolumeFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "scrub_volume_failures",
Help: "Counter of overall volumes with issues detected during scrubbing.",
}, []string{"mode"})
// VolumeServerScrubShardFailures is a counter vector, partitioned by the
// "mode" label, accumulating the number of EC shards found to have issues
// during scrubbing.
VolumeServerScrubShardFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "scrub_shard_failures",
Help: "Counter of overall EC shards with issues detected during scrubbing.",
}, []string{"mode"})
S3RequestCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
@@ -564,6 +588,9 @@ func init() {
Gather.MustRegister(VolumeServerFileReadFailures)
Gather.MustRegister(VolumeServerFileReadInvalidNeedles)
Gather.MustRegister(VolumeServerFileWriteFailures)
Gather.MustRegister(VolumeServerScrubLastTimeSeconds)
Gather.MustRegister(VolumeServerScrubVolumeFailures)
Gather.MustRegister(VolumeServerScrubShardFailures)
Gather.MustRegister(S3RequestCounter)
Gather.MustRegister(S3HandlerCounter)