From 074d70112d8832c8be5cf30dbd77b04a1f319f23 Mon Sep 17 00:00:00 2001 From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com> Date: Sun, 12 May 2024 22:53:50 +0530 Subject: [PATCH] Consolidate drive health related metrics into single metric (#19706) Instead of having "online" and "healing" as two metrics, replace with a single metric "health" which can have the following values: 0 = offline, 1 = healthy, 2 = healing --- cmd/metrics-v3-system-drive.go | 33 ++++++++++++++++++--------------- cmd/metrics-v3.go | 3 +-- docs/metrics/v3.md | 3 +-- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/cmd/metrics-v3-system-drive.go b/cmd/metrics-v3-system-drive.go index 6c3c25548..1da77bf86 100644 --- a/cmd/metrics-v3-system-drive.go +++ b/cmd/metrics-v3-system-drive.go @@ -35,6 +35,10 @@ const ( sectorSize = uint64(512) kib = float64(1 << 10) + + driveHealthOffline = float64(0) + driveHealthOnline = float64(1) + driveHealthHealing = float64(2) ) var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL} @@ -51,8 +55,7 @@ const ( driveAvailabilityErrorsTotal = "availability_errors_total" driveWaitingIO = "waiting_io" driveAPILatencyMicros = "api_latency_micros" - driveHealing = "healing" - driveOnline = "online" + driveHealth = "health" driveOfflineCount = "offline_count" driveOnlineCount = "online_count" @@ -93,10 +96,8 @@ var ( driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros, "Average last minute latency in µs for drive API storage operations", append(allDriveLabels, apiL)...) - driveHealingMD = NewGaugeMD(driveHealing, - "Is it healing?", allDriveLabels...) - driveOnlineMD = NewGaugeMD(driveOnline, - "Is it online?", allDriveLabels...) + driveHealthMD = NewGaugeMD(driveHealth, + "Drive health (0 = offline, 1 = healthy, 2 = healing)", allDriveLabels...) 
driveOfflineCountMD = NewGaugeMD(driveOfflineCount, "Count of offline drives") @@ -152,16 +153,18 @@ func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...) m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...) - var healing, online float64 - if drive.Healing { - healing = 1 + var health float64 + switch drive.Healing { + case true: + health = driveHealthHealing + case false: + if drive.State == "ok" { + health = driveHealthOnline + } else { + health = driveHealthOffline + } } - m.Set(driveHealing, healing, labels...) - - if drive.State == "ok" { - online = 1 - } - m.Set(driveOnline, online, labels...) + m.Set(driveHealth, health, labels...) } func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) { diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index 178edb31e..862ff649b 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -186,8 +186,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { driveAvailabilityErrorsMD, driveWaitingIOMD, driveAPILatencyMD, - driveHealingMD, - driveOnlineMD, + driveHealthMD, driveOfflineCountMD, driveOnlineCountMD, diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index 1ce673009..3a276fdcc 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -132,8 +132,7 @@ The standard metrics group for GoCollector is not shown below. | `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` | | `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` | | `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` | -| `minio_system_drive_healing` | `gauge` | Is it healing? | `drive,set_index,drive_index,pool_index,server` | -| `minio_system_drive_online` | `gauge` | Is it online? 
| `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_health` | `gauge` | Drive health (0 = offline, 1 = healthy, 2 = healing) | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` |