From 08d3d06a0658dfb5d9043bac26bb70f4ef278e78 Mon Sep 17 00:00:00 2001 From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:16:34 +0530 Subject: [PATCH] Add drive metrics in metrics-v3 (#19452) Add the following metrics: - used_inodes - total_inodes - healing - online - reads_per_sec - reads_kb_per_sec - reads_await - writes_per_sec - writes_kb_per_sec - writes_await - perc_util To be able to calculate the `per_sec` values, we capture the IOStats-related data in the beginning (along with the time at which it was captured), and subsequently compare it against the current values. This is because dividing by the server uptime doesn't work in k8s environments. --- cmd/metrics-resource.go | 10 +-- cmd/metrics-v3-cache.go | 77 +++++++++++++++++ cmd/metrics-v3-system-drive.go | 145 ++++++++++++++++++++++++++++----- cmd/metrics-v3.go | 13 +++ docs/metrics/v3.md | 11 +++ 5 files changed, 226 insertions(+), 30 deletions(-) diff --git a/cmd/metrics-resource.go b/cmd/metrics-resource.go index f2f54a637..9392231f5 100644 --- a/cmd/metrics-resource.go +++ b/cmd/metrics-resource.go @@ -227,15 +227,7 @@ func updateDriveIOStats(currentStats madmin.DiskIOStats, latestStats madmin.Disk // too soon to update the stats return } - diffStats := madmin.DiskIOStats{ - ReadIOs: currentStats.ReadIOs - latestStats.ReadIOs, - WriteIOs: currentStats.WriteIOs - latestStats.WriteIOs, - ReadTicks: currentStats.ReadTicks - latestStats.ReadTicks, - WriteTicks: currentStats.WriteTicks - latestStats.WriteTicks, - TotalTicks: currentStats.TotalTicks - latestStats.TotalTicks, - ReadSectors: currentStats.ReadSectors - latestStats.ReadSectors, - WriteSectors: currentStats.WriteSectors - latestStats.WriteSectors, - } + diffStats := getDiffStats(latestStats, currentStats) updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/diffInSeconds, labels, false) readKib := float64(diffStats.ReadSectors*sectorSize) / kib diff --git 
a/cmd/metrics-v3-cache.go b/cmd/metrics-v3-cache.go index 1fa22396c..8b1c7fd1c 100644 --- a/cmd/metrics-v3-cache.go +++ b/cmd/metrics-v3-cache.go @@ -18,6 +18,7 @@ package cmd import ( + "sync" "time" "github.com/minio/madmin-go/v3" @@ -61,8 +62,20 @@ func newNodesUpDownCache() *cachevalue.Cache[nodesOnline] { loadNodesUpDown) } +type driveIOStatMetrics struct { + readsPerSec float64 + readsKBPerSec float64 + readsAwait float64 + writesPerSec float64 + writesKBPerSec float64 + writesAwait float64 + percUtil float64 +} + +// storageMetrics - cached storage metrics. type storageMetrics struct { storageInfo madmin.StorageInfo + ioStats map[string]driveIOStatMetrics onlineDrives, offlineDrives, totalDrives int } @@ -98,7 +111,48 @@ func newESetHealthResultCache() *cachevalue.Cache[HealthResult] { ) } +func getDiffStats(initialStats, currentStats madmin.DiskIOStats) madmin.DiskIOStats { + return madmin.DiskIOStats{ + ReadIOs: currentStats.ReadIOs - initialStats.ReadIOs, + WriteIOs: currentStats.WriteIOs - initialStats.WriteIOs, + ReadSectors: currentStats.ReadSectors - initialStats.ReadSectors, + WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors, + ReadTicks: currentStats.ReadTicks - initialStats.ReadTicks, + WriteTicks: currentStats.WriteTicks - initialStats.WriteTicks, + TotalTicks: currentStats.TotalTicks - initialStats.TotalTicks, + } +} + +func getDriveIOStatMetrics(ioStats madmin.DiskIOStats, duration time.Duration) (m driveIOStatMetrics) { + durationSecs := duration.Seconds() + + m.readsPerSec = float64(ioStats.ReadIOs) / durationSecs + m.readsKBPerSec = float64(ioStats.ReadSectors) * float64(sectorSize) / kib / durationSecs + if ioStats.ReadIOs > 0 { + m.readsAwait = float64(ioStats.ReadTicks) / float64(ioStats.ReadIOs) + } + + m.writesPerSec = float64(ioStats.WriteIOs) / durationSecs + m.writesKBPerSec = float64(ioStats.WriteSectors) * float64(sectorSize) / kib / durationSecs + if ioStats.WriteIOs > 0 { + m.writesAwait = 
float64(ioStats.WriteTicks) / float64(ioStats.WriteIOs) + } + + // TotalTicks is in milliseconds + m.percUtil = float64(ioStats.TotalTicks) * 100 / (durationSecs * 1000) + + return +} + func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] { + var ( + // prevDriveIOStats is used to calculate "per second" + // values for IOStat related disk metrics e.g. reads/sec. + prevDriveIOStats map[string]madmin.DiskIOStats + prevDriveIOStatsMu sync.RWMutex + prevDriveIOStatsRefreshedAt time.Time + ) + loadDriveMetrics := func() (v storageMetrics, err error) { objLayer := newObjectLayerFn() if objLayer == nil { @@ -108,14 +162,37 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] { storageInfo := objLayer.LocalStorageInfo(GlobalContext, true) onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks) totalDrives := onlineDrives.Merge(offlineDrives) + v = storageMetrics{ storageInfo: storageInfo, onlineDrives: onlineDrives.Sum(), offlineDrives: offlineDrives.Sum(), totalDrives: totalDrives.Sum(), + ioStats: map[string]driveIOStatMetrics{}, } + + currentStats := getCurrentDriveIOStats() + now := time.Now().UTC() + + prevDriveIOStatsMu.Lock() + if prevDriveIOStats != nil { + duration := now.Sub(prevDriveIOStatsRefreshedAt) + if duration.Seconds() > 1 { + for d, cs := range currentStats { + if ps, found := prevDriveIOStats[d]; found { + v.ioStats[d] = getDriveIOStatMetrics(getDiffStats(ps, cs), duration) + } + } + } + } + + prevDriveIOStats = currentStats + prevDriveIOStatsRefreshedAt = now + prevDriveIOStatsMu.Unlock() + return } + return cachevalue.NewFromFunc(1*time.Minute, cachevalue.Opts{ReturnLastGood: true}, loadDriveMetrics) diff --git a/cmd/metrics-v3-system-drive.go b/cmd/metrics-v3-system-drive.go index a4217b495..b231b1787 100644 --- a/cmd/metrics-v3-system-drive.go +++ b/cmd/metrics-v3-system-drive.go @@ -20,6 +20,8 @@ package cmd import ( "context" "strconv" + + "github.com/minio/madmin-go/v3" ) // label constants @@ -30,6 +32,9 
@@ const ( driveIndexL = "drive_index" apiL = "api" + + sectorSize = uint64(512) + kib = float64(1 << 10) ) var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL} @@ -38,15 +43,28 @@ const ( driveUsedBytes = "used_bytes" driveFreeBytes = "free_bytes" driveTotalBytes = "total_bytes" + driveUsedInodes = "used_inodes" driveFreeInodes = "free_inodes" + driveTotalInodes = "total_inodes" driveTimeoutErrorsTotal = "timeout_errors_total" driveAvailabilityErrorsTotal = "availability_errors_total" driveWaitingIO = "waiting_io" driveAPILatencyMicros = "api_latency_micros" + driveHealing = "healing" + driveOnline = "online" driveOfflineCount = "offline_count" driveOnlineCount = "online_count" driveCount = "count" + + // iostat related + driveReadsPerSec = "reads_per_sec" + driveReadsKBPerSec = "reads_kb_per_sec" + driveReadsAwait = "reads_await" + driveWritesPerSec = "writes_per_sec" + driveWritesKBPerSec = "writes_kb_per_sec" + driveWritesAwait = "writes_await" + drivePercUtil = "perc_util" ) var ( @@ -56,8 +74,12 @@ var ( "Total storage free on a drive in bytes", allDriveLabels...) driveTotalBytesMD = NewGaugeMD(driveTotalBytes, "Total storage available on a drive in bytes", allDriveLabels...) + driveUsedInodesMD = NewGaugeMD(driveUsedInodes, + "Total used inodes on a drive", allDriveLabels...) driveFreeInodesMD = NewGaugeMD(driveFreeInodes, "Total free inodes on a drive", allDriveLabels...) + driveTotalInodesMD = NewGaugeMD(driveTotalInodes, + "Total inodes available on a drive", allDriveLabels...) driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal, "Total timeout errors on a drive", allDriveLabels...) driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal, @@ -68,6 +90,10 @@ var ( driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros, "Average last minute latency in µs for drive API storage operations", append(allDriveLabels, apiL)...) + driveHealingMD = NewGaugeMD(driveHealing, + "Is it healing?", allDriveLabels...) 
+ driveOnlineMD = NewGaugeMD(driveOnline, + "Is it online?", allDriveLabels...) driveOfflineCountMD = NewGaugeMD(driveOfflineCount, "Count of offline drives") @@ -75,8 +101,101 @@ var ( "Count of online drives") driveCountMD = NewGaugeMD(driveCount, "Count of all drives") + + // iostat related + driveReadsPerSecMD = NewGaugeMD(driveReadsPerSec, + "Reads per second on a drive", + allDriveLabels...) + driveReadsKBPerSecMD = NewGaugeMD(driveReadsKBPerSec, + "Kilobytes read per second on a drive", + allDriveLabels...) + driveReadsAwaitMD = NewGaugeMD(driveReadsAwait, + "Average time for read requests served on a drive", + allDriveLabels...) + driveWritesPerSecMD = NewGaugeMD(driveWritesPerSec, + "Writes per second on a drive", + allDriveLabels...) + driveWritesKBPerSecMD = NewGaugeMD(driveWritesKBPerSec, + "Kilobytes written per second on a drive", + allDriveLabels...) + driveWritesAwaitMD = NewGaugeMD(driveWritesAwait, + "Average time for write requests served on a drive", + allDriveLabels...) + drivePercUtilMD = NewGaugeMD(drivePercUtil, + "Percentage of time the disk was busy", + allDriveLabels...) ) +func getCurrentDriveIOStats() map[string]madmin.DiskIOStats { + var types madmin.MetricType = madmin.MetricsDisk + driveRealtimeMetrics := collectLocalMetrics(types, collectMetricsOpts{ + hosts: map[string]struct{}{ + globalLocalNodeName: {}, + }, + }) + + stats := map[string]madmin.DiskIOStats{} + for d, m := range driveRealtimeMetrics.ByDisk { + stats[d] = m.IOStats + } + return stats +} + +func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) { + m.Set(driveUsedBytes, float64(drive.UsedSpace), labels...) + m.Set(driveFreeBytes, float64(drive.AvailableSpace), labels...) + m.Set(driveTotalBytes, float64(drive.TotalSpace), labels...) + m.Set(driveUsedInodes, float64(drive.UsedInodes), labels...) + m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...) + m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...) 
+ + var healing, online float64 + if drive.Healing { + healing = 1 + } + m.Set(driveHealing, healing, labels...) + + if drive.State == "ok" { + online = 1 + } + m.Set(driveOnline, online, labels...) +} + +func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) { + if disk.Metrics == nil { + return + } + + m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...) + m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...) + m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...) + + // Append the api label for the drive API latencies. + labels = append(labels, "api", "") + lastIdx := len(labels) - 1 + for apiName, latency := range disk.Metrics.LastMinute { + labels[lastIdx] = "storage." + apiName + m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()), + labels...) + } +} + +func (m *MetricValues) setDriveIOStatMetrics(ioStats driveIOStatMetrics, labels []string) { + m.Set(driveReadsPerSec, ioStats.readsPerSec, labels...) + m.Set(driveReadsKBPerSec, ioStats.readsKBPerSec, labels...) + if ioStats.readsPerSec > 0 { + m.Set(driveReadsAwait, ioStats.readsAwait, labels...) + } + + m.Set(driveWritesPerSec, ioStats.writesPerSec, labels...) + m.Set(driveWritesKBPerSec, ioStats.writesKBPerSec, labels...) + if ioStats.writesPerSec > 0 { + m.Set(driveWritesAwait, ioStats.writesAwait, labels...) + } + + m.Set(drivePercUtil, ioStats.percUtil, labels...) +} + // loadDriveMetrics - `MetricsLoaderFn` for node drive metrics. 
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error { driveMetrics, err := c.driveMetrics.Get() @@ -85,9 +204,7 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro return nil } - storageInfo := driveMetrics.storageInfo - - for _, disk := range storageInfo.Disks { + for _, disk := range driveMetrics.storageInfo.Disks { labels := []string{ driveL, disk.DrivePath, poolIndexL, strconv.Itoa(disk.PoolIndex), @@ -95,25 +212,11 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro driveIndexL, strconv.Itoa(disk.DiskIndex), } - m.Set(driveUsedBytes, float64(disk.UsedSpace), labels...) - m.Set(driveFreeBytes, float64(disk.AvailableSpace), labels...) - m.Set(driveTotalBytes, float64(disk.TotalSpace), labels...) - m.Set(driveFreeInodes, float64(disk.FreeInodes), labels...) - - if disk.Metrics != nil { - m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...) - m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...) - m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...) - - // Append the api label for the drive API latencies. - labels = append(labels, "api", "") - lastIdx := len(labels) - 1 - for apiName, latency := range disk.Metrics.LastMinute { - labels[lastIdx] = "storage." + apiName - m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()), - labels...) 
- } + m.setDriveBasicMetrics(disk, labels) + if dm, found := driveMetrics.ioStats[disk.DrivePath]; found { + m.setDriveIOStatMetrics(dm, labels) } + m.setDriveAPIMetrics(disk, labels) } m.Set(driveOfflineCount, float64(driveMetrics.offlineDrives)) diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index 5814f9c39..a8353882d 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -117,15 +117,28 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { driveUsedBytesMD, driveFreeBytesMD, driveTotalBytesMD, + driveUsedInodesMD, driveFreeInodesMD, + driveTotalInodesMD, driveTimeoutErrorsMD, driveAvailabilityErrorsMD, driveWaitingIOMD, driveAPILatencyMD, + driveHealingMD, + driveOnlineMD, driveOfflineCountMD, driveOnlineCountMD, driveCountMD, + + // iostat related + driveReadsPerSecMD, + driveReadsKBPerSecMD, + driveReadsAwaitMD, + driveWritesPerSecMD, + driveWritesKBPerSecMD, + driveWritesAwaitMD, + drivePercUtilMD, }, loadDriveMetrics, ) diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index e048cf44f..aebd6e32d 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -105,7 +105,9 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_drive_used_bytes` | `gauge` | Total storage used on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_free_bytes` | `gauge` | Total storage free on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_total_bytes` | `gauge` | Total storage available on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_used_inodes` | `gauge` | Total used inodes on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | 
`drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, permission denied and timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` | @@ -113,6 +115,15 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` | | `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` | | `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` | +| `minio_system_drive_healing` | `gauge` | Is it healing? | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_online` | `gauge` | Is it online? 
| `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_per_sec` | `gauge` | Writes per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_kb_per_sec` | `gauge` | Kilobytes written per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_await` | `gauge` | Average time for write requests served on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_perc_util` | `gauge` | Percentage of time the disk was busy | `drive,set_index,drive_index,pool_index,server` | ### `/system/network/internode`