diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 53a4395ef..c4866da3d 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -28,7 +28,6 @@ import ( "time" "github.com/minio/kes-go" - "github.com/minio/madmin-go/v3" "github.com/minio/minio/internal/bucket/lifecycle" "github.com/minio/minio/internal/logger" "github.com/minio/minio/internal/mcontext" @@ -83,7 +82,6 @@ func init() { nodeGroups := []*MetricsGroup{ getNodeHealthMetrics(), - getLocalDriveStorageMetrics(), getCacheMetrics(), getHTTPMetrics(false), getNetworkMetrics(), @@ -129,7 +127,7 @@ const ( cacheSubsystem MetricSubsystem = "cache" capacityRawSubsystem MetricSubsystem = "capacity_raw" capacityUsableSubsystem MetricSubsystem = "capacity_usable" - diskSubsystem MetricSubsystem = "disk" + driveSubsystem MetricSubsystem = "drive" storageClassSubsystem MetricSubsystem = "storage_class" fileDescriptorSubsystem MetricSubsystem = "file_descriptor" goRoutines MetricSubsystem = "go_routine" @@ -379,7 +377,7 @@ func getClusterCapacityUsageFreeBytesMD() MetricDescription { func getNodeDriveAPILatencyMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: latencyMicroSec, Help: "Average last minute latency in µs for drive API storage operations", Type: gaugeMetric, @@ -389,17 +387,37 @@ func getNodeDriveAPILatencyMD() MetricDescription { func getNodeDriveUsedBytesMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: usedBytes, Help: "Total storage used on a drive", Type: gaugeMetric, } } +func getNodeDriveTimeoutErrorsMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: driveSubsystem, + Name: "errors_timeout", + Help: "Total number of timeout errors since server start", + Type: counterMetric, + } +} + +func getNodeDriveAvailablityErrorsMD() MetricDescription { + return 
MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: driveSubsystem, + Name: "errors_availability", + Help: "Total number of I/O errors, permission denied and timeouts since server start", + Type: counterMetric, + } +} + func getNodeDriveFreeBytesMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: freeBytes, Help: "Total storage available on a drive", Type: gaugeMetric, @@ -409,9 +427,9 @@ func getNodeDriveFreeBytesMD() MetricDescription { func getClusterDrivesOfflineTotalMD() MetricDescription { return MetricDescription{ Namespace: clusterMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: offlineTotal, - Help: "Total drives offline", + Help: "Total drives offline in this cluster", Type: gaugeMetric, } } @@ -419,9 +437,9 @@ func getClusterDrivesOfflineTotalMD() MetricDescription { func getClusterDrivesOnlineTotalMD() MetricDescription { return MetricDescription{ Namespace: clusterMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: onlineTotal, - Help: "Total drives online", + Help: "Total drives online in this cluster", Type: gaugeMetric, } } @@ -429,9 +447,9 @@ func getClusterDrivesOnlineTotalMD() MetricDescription { func getClusterDrivesTotalMD() MetricDescription { return MetricDescription{ Namespace: clusterMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: total, - Help: "Total drives", + Help: "Total drives in this cluster", Type: gaugeMetric, } } @@ -439,9 +457,9 @@ func getClusterDrivesTotalMD() MetricDescription { func getNodeDrivesOfflineTotalMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: offlineTotal, - Help: "Total drives offline", + Help: "Total drives offline in this node", Type: gaugeMetric, } } @@ -449,9 +467,9 @@ func getNodeDrivesOfflineTotalMD() 
MetricDescription { func getNodeDrivesOnlineTotalMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: onlineTotal, - Help: "Total drives online", + Help: "Total drives online in this node", Type: gaugeMetric, } } @@ -459,9 +477,9 @@ func getNodeDrivesOnlineTotalMD() MetricDescription { func getNodeDrivesTotalMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: total, - Help: "Total drives", + Help: "Total drives in this node", Type: gaugeMetric, } } @@ -489,7 +507,7 @@ func getNodeRRSParityMD() MetricDescription { func getNodeDrivesFreeInodes() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: freeInodes, Help: "Total free inodes", Type: gaugeMetric, @@ -499,7 +517,7 @@ func getNodeDrivesFreeInodes() MetricDescription { func getNodeDriveTotalBytesMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, - Subsystem: diskSubsystem, + Subsystem: driveSubsystem, Name: totalBytes, Help: "Total storage on a drive", Type: gaugeMetric, @@ -1888,22 +1906,22 @@ func getCacheMetrics() *MetricsGroup { metrics = append(metrics, Metric{ Description: getCacheUsagePercentMD(), Value: float64(cdStats.UsagePercent), - VariableLabels: map[string]string{"disk": cdStats.Dir}, + VariableLabels: map[string]string{"drive": cdStats.Dir}, }) metrics = append(metrics, Metric{ Description: getCacheUsageInfoMD(), Value: float64(cdStats.UsageState), - VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()}, + VariableLabels: map[string]string{"drive": cdStats.Dir, "level": cdStats.GetUsageLevelString()}, }) metrics = append(metrics, Metric{ Description: getCacheUsedBytesMD(), Value: float64(cdStats.UsageSize), - VariableLabels: map[string]string{"disk": 
cdStats.Dir}, + VariableLabels: map[string]string{"drive": cdStats.Dir}, }) metrics = append(metrics, Metric{ Description: getCacheTotalBytesMD(), Value: float64(cdStats.TotalCapacity), - VariableLabels: map[string]string{"disk": cdStats.Dir}, + VariableLabels: map[string]string{"drive": cdStats.Dir}, }) } return @@ -2560,26 +2578,48 @@ func getLocalStorageMetrics() *MetricsGroup { metrics = append(metrics, Metric{ Description: getNodeDriveUsedBytesMD(), Value: float64(disk.UsedSpace), - VariableLabels: map[string]string{"disk": disk.DrivePath}, + VariableLabels: map[string]string{"drive": disk.DrivePath}, }) metrics = append(metrics, Metric{ Description: getNodeDriveFreeBytesMD(), Value: float64(disk.AvailableSpace), - VariableLabels: map[string]string{"disk": disk.DrivePath}, + VariableLabels: map[string]string{"drive": disk.DrivePath}, }) metrics = append(metrics, Metric{ Description: getNodeDriveTotalBytesMD(), Value: float64(disk.TotalSpace), - VariableLabels: map[string]string{"disk": disk.DrivePath}, + VariableLabels: map[string]string{"drive": disk.DrivePath}, }) metrics = append(metrics, Metric{ Description: getNodeDrivesFreeInodes(), Value: float64(disk.FreeInodes), - VariableLabels: map[string]string{"disk": disk.DrivePath}, + VariableLabels: map[string]string{"drive": disk.DrivePath}, }) + + if disk.Metrics != nil { + metrics = append(metrics, Metric{ + Description: getNodeDriveTimeoutErrorsMD(), + Value: float64(disk.Metrics.TotalErrorsTimeout), + VariableLabels: map[string]string{"drive": disk.DrivePath}, + }) + + metrics = append(metrics, Metric{ + Description: getNodeDriveAvailablityErrorsMD(), + Value: float64(disk.Metrics.TotalErrorsAvailability), + VariableLabels: map[string]string{"drive": disk.DrivePath}, + }) + + for apiName, latency := range disk.Metrics.LastMinute { + metrics = append(metrics, Metric{ + Description: getNodeDriveAPILatencyMD(), + Value: float64(latency.Avg().Microseconds()), + VariableLabels: map[string]string{"drive": 
disk.DrivePath, "api": "storage." + apiName}, + }) + } + } } metrics = append(metrics, Metric{ @@ -2612,39 +2652,6 @@ func getLocalStorageMetrics() *MetricsGroup { return mg } -func getLocalDriveStorageMetrics() *MetricsGroup { - mg := &MetricsGroup{ - cacheInterval: 1 * time.Minute, - } - mg.RegisterRead(func(ctx context.Context) (metrics []Metric) { - objLayer := newObjectLayerFn() - // Service not initialized yet - if objLayer == nil { - return - } - - storageInfo := objLayer.LocalStorageInfo(ctx) - if storageInfo.Backend.Type == madmin.FS { - return - } - metrics = make([]Metric, 0, 50) - for _, disk := range storageInfo.Disks { - if disk.Metrics == nil { - continue - } - for apiName, latency := range disk.Metrics.LastMinute { - metrics = append(metrics, Metric{ - Description: getNodeDriveAPILatencyMD(), - Value: float64(latency.Avg().Microseconds()), - VariableLabels: map[string]string{"disk": disk.DrivePath, "api": "storage." + apiName}, - }) - } - } - return - }) - return mg -} - func getClusterWriteQuorumMD() MetricDescription { return MetricDescription{ Namespace: clusterMetricNamespace, diff --git a/docs/metrics/prometheus/grafana/minio-dashboard.json b/docs/metrics/prometheus/grafana/minio-dashboard.json index 593ac337e..330e99863 100644 --- a/docs/metrics/prometheus/grafana/minio-dashboard.json +++ b/docs/metrics/prometheus/grafana/minio-dashboard.json @@ -984,7 +984,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "minio_cluster_disk_online_total{job=\"$scrape_jobs\"}", + "expr": "minio_cluster_drive_online_total{job=\"$scrape_jobs\"}", "format": "table", "hide": false, "instant": true, @@ -1418,7 +1418,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "minio_cluster_disk_offline_total{job=\"$scrape_jobs\"}", + "expr": "minio_cluster_drive_offline_total{job=\"$scrape_jobs\"}", "format": "table", "hide": false, "instant": true, @@ -2389,7 +2389,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"minio_node_disk_used_bytes{job=\"$scrape_jobs\"}", + "expr": "minio_node_drive_used_bytes{job=\"$scrape_jobs\"}", "format": "time_series", "instant": false, "interval": "", @@ -2479,7 +2479,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "minio_node_disk_free_inodes{job=\"$scrape_jobs\"}", + "expr": "minio_node_drive_free_inodes{job=\"$scrape_jobs\"}", "format": "time_series", "instant": false, "interval": "", @@ -2905,4 +2905,4 @@ "uid": "TgmJnqnnk", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md index 9e067c7a4..1eeb1ef93 100644 --- a/docs/metrics/prometheus/list.md +++ b/docs/metrics/prometheus/list.md @@ -27,9 +27,9 @@ These metrics can be obtained from any MinIO server once per collection. | `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster | | `minio_cluster_usage_total_bytes` | Total cluster usage in bytes | | `minio_cluster_buckets_total` | Total number of buckets in the cluster | -| `minio_cluster_disk_offline_total` | Total drives offline. | -| `minio_cluster_disk_online_total` | Total drives online. | -| `minio_cluster_disk_total` | Total drives. | +| `minio_cluster_drive_offline_total` | Total drives offline in this cluster. | +| `minio_cluster_drive_online_total` | Total drives online in this cluster. | +| `minio_cluster_drive_total` | Total drives in this cluster. | | `minio_cluster_ilm_transitioned_bytes` | Total bytes transitioned to a tier. | | `minio_cluster_ilm_transitioned_objects` | Total number of objects transitioned to a tier. | | `minio_cluster_ilm_transitioned_versions` | Total number of versions transitioned to a tier. | @@ -51,15 +51,34 @@ These metrics can be obtained from any MinIO server once per collection. | `minio_inter_node_traffic_errors_total` | Total number of failed internode calls. 
| | `minio_inter_node_traffic_received_bytes` | Total number of bytes received from other peer nodes. | | `minio_inter_node_traffic_sent_bytes` | Total number of bytes sent to the other peer nodes. | -| `minio_minio_update_percent` | Total percentage cache usage. | -| `minio_node_disk_free_bytes` | Total storage available on a drive. | -| `minio_node_disk_free_inodes` | Total free inodes. | -| `minio_node_disk_latency_us` | Average last minute latency in µs for drive API storage operations. | -| `minio_node_disk_offline_total` | Total drives offline. | -| `minio_node_disk_online_total` | Total drives online. | -| `minio_node_disk_total` | Total drives. | -| `minio_node_disk_total_bytes` | Total storage on a drive. | -| `minio_node_disk_used_bytes` | Total storage used on a drive. | +| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. | +| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. | +| `minio_s3_requests_4xx_errors_total` | Total number of S3 requests with (4xx) errors. | +| `minio_s3_requests_5xx_errors_total` | Total number of S3 requests with (5xx) errors. | +| `minio_s3_requests_canceled_total` | Total number of S3 requests canceled by the client. | +| `minio_s3_requests_errors_total` | Total number of S3 requests with (4xx and 5xx) errors. | +| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. | +| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. | +| `minio_s3_requests_rejected_auth_total` | Total number of S3 requests rejected for auth failure. | +| `minio_s3_requests_rejected_header_total` | Total number of S3 requests rejected for invalid header. | +| `minio_s3_requests_rejected_invalid_total` | Total number of invalid S3 requests. | +| `minio_s3_requests_rejected_timestamp_total` | Total number of S3 requests rejected for invalid timestamp. | +| `minio_s3_requests_total` | Total number of S3 requests. 
| +| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. | +| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. | +| `minio_s3_traffic_received_bytes` | Total number of S3 bytes received. | +| `minio_s3_traffic_sent_bytes` | Total number of S3 bytes sent. | +| `minio_software_commit_info` | Git commit hash for the MinIO release. | +| `minio_software_version_info` | MinIO Release tag for the server. | +| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last scan activity. | +| `minio_node_drive_free_bytes` | Total storage available on a drive. | +| `minio_node_drive_free_inodes` | Total free inodes. | +| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. | +| `minio_node_drive_offline_total` | Total drives offline in this node. | +| `minio_node_drive_online_total` | Total drives online in this node. | +| `minio_node_drive_total` | Total drives in this node. | +| `minio_node_drive_total_bytes` | Total storage on a drive. | +| `minio_node_drive_used_bytes` | Total storage used on a drive. | | `minio_node_file_descriptor_limit_total` | Limit on total number of open file descriptors for the MinIO Server process. | | `minio_node_file_descriptor_open_total` | Total number of open file descriptors by the MinIO Server process. | | `minio_node_go_routine_total` | Total number of go routines running. | @@ -86,26 +105,6 @@ These metrics can be obtained from any MinIO server once per collection. | `minio_node_scanner_versions_scanned` | Total number of object versions scanned since server start. | | `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr. | | `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw. | -| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. 
| -| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. | -| `minio_s3_requests_4xx_errors_total` | Total number S3 requests with (4xx) errors. | -| `minio_s3_requests_5xx_errors_total` | Total number S3 requests with (5xx) errors. | -| `minio_s3_requests_canceled_total` | Total number S3 requests canceled by the client. | -| `minio_s3_requests_errors_total` | Total number S3 requests with (4xx and 5xx) errors. | -| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. | -| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. | -| `minio_s3_requests_rejected_auth_total` | Total number S3 requests rejected for auth failure. | -| `minio_s3_requests_rejected_header_total` | Total number S3 requests rejected for invalid header. | -| `minio_s3_requests_rejected_invalid_total` | Total number S3 invalid requests. | -| `minio_s3_requests_rejected_timestamp_total` | Total number S3 requests rejected for invalid timestamp. | -| `minio_s3_requests_total` | Total number S3 requests. | -| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. | -| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. | -| `minio_s3_traffic_received_bytes` | Total number of s3 bytes received. | -| `minio_s3_traffic_sent_bytes` | Total number of s3 bytes sent. | -| `minio_software_commit_info` | Git commit hash for the MinIO release. | -| `minio_software_version_info` | MinIO Release tag for the server. | -| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nano seconds) since last scan activity. | # List of metrics exported per bucket level