mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-22 09:41:28 +00:00
Export Prometheus metrics for scrubbing operations. (#9264)
This PR introduces three new metrics: `scrub_last_time_seconds`, `scrub_volume_failures`, and `scrub_shard_failures`. They capture overall volume scrub results and make it possible to build alerts and dashboards that monitor scrubbing progress. Note that these metrics are aggregated at the volume/EC shard level and are not intended for fine-grained tracking of scrubbing operations.
This commit is contained in:
@@ -4,8 +4,11 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/stats"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
)
|
||||
@@ -68,6 +71,11 @@ func (vs *VolumeServer) ScrubVolume(ctx context.Context, req *volume_server_pb.S
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
scrubLabels := prometheus.Labels{"mode": req.GetMode().String()}
|
||||
stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix()))
|
||||
stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumes)))
|
||||
|
||||
if len(errs) != 0 {
|
||||
return nil, errors.Join(errs...)
|
||||
}
|
||||
@@ -129,6 +137,11 @@ func (vs *VolumeServer) ScrubEcVolume(ctx context.Context, req *volume_server_pb
|
||||
}
|
||||
}
|
||||
|
||||
scrubLabels := prometheus.Labels{"mode": req.GetMode().String()}
|
||||
stats.VolumeServerScrubLastTimeSeconds.With(scrubLabels).Set(float64(time.Now().Unix()))
|
||||
stats.VolumeServerScrubVolumeFailures.With(scrubLabels).Add(float64(len(brokenVolumeIds)))
|
||||
stats.VolumeServerScrubShardFailures.With(scrubLabels).Add(float64(len(brokenShardInfos)))
|
||||
|
||||
res := &volume_server_pb.ScrubEcVolumeResponse{
|
||||
TotalVolumes: totalVolumes,
|
||||
TotalFiles: totalFiles,
|
||||
|
||||
@@ -394,6 +394,30 @@ var (
|
||||
Help: "Counter of overall failed file write requests from clients.",
|
||||
})
|
||||
|
||||
// VolumeServerScrubLastTimeSeconds is a gauge vector, labelled by scrub "mode",
// that records when the most recent scrub ran, as seconds since the UNIX epoch.
// The ScrubVolume and ScrubEcVolume handlers set it to time.Now().Unix() using
// the request's mode string as the label value.
VolumeServerScrubLastTimeSeconds = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: "volumeServer",
|
||||
Name: "scrub_last_time_seconds",
|
||||
Help: "Last scrub execution time, as seconds since UNIX epoch.",
|
||||
}, []string{"mode"})
|
||||
|
||||
// VolumeServerScrubVolumeFailures is a counter vector, labelled by scrub "mode",
// that accumulates the number of volumes found broken by scrubbing.
// ScrubVolume adds len(brokenVolumes) and ScrubEcVolume adds len(brokenVolumeIds)
// per call, so regular and EC volumes share the same counter.
VolumeServerScrubVolumeFailures = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: "volumeServer",
|
||||
Name: "scrub_volume_failures",
|
||||
Help: "Counter of overall volumes with issues detected during scrubbing.",
|
||||
}, []string{"mode"})
|
||||
|
||||
// VolumeServerScrubShardFailures is a counter vector, labelled by scrub "mode",
// that accumulates the number of EC shards found broken by scrubbing.
// In the visible code it is only updated by ScrubEcVolume, which adds
// len(brokenShardInfos) per call.
VolumeServerScrubShardFailures = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: "volumeServer",
|
||||
Name: "scrub_shard_failures",
|
||||
Help: "Counter of overall EC shards with issues detected during scrubbing.",
|
||||
}, []string{"mode"})
|
||||
|
||||
S3RequestCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
@@ -564,6 +588,9 @@ func init() {
|
||||
Gather.MustRegister(VolumeServerFileReadFailures)
|
||||
Gather.MustRegister(VolumeServerFileReadInvalidNeedles)
|
||||
Gather.MustRegister(VolumeServerFileWriteFailures)
|
||||
Gather.MustRegister(VolumeServerScrubLastTimeSeconds)
|
||||
Gather.MustRegister(VolumeServerScrubVolumeFailures)
|
||||
Gather.MustRegister(VolumeServerScrubShardFailures)
|
||||
|
||||
Gather.MustRegister(S3RequestCounter)
|
||||
Gather.MustRegister(S3HandlerCounter)
|
||||
|
||||
Reference in New Issue
Block a user