Merge pull request #805 from paulfantom/runbooks

jsonnet/kube-prometheus: change runbook urls to point to wiki
This commit is contained in:
Frederic Branczyk 2020-11-26 10:39:14 +01:00 committed by GitHub
commit 6b9502c5f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 79 additions and 77 deletions

View File

@ -146,6 +146,8 @@ local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet';
'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305',
], ],
runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"', kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
kubeStateMetricsSelector: 'job="kube-state-metrics"', kubeStateMetricsSelector: 'job="kube-state-metrics"',

View File

@ -785,7 +785,7 @@ spec:
- alert: KubeStateMetricsListErrors - alert: KubeStateMetricsListErrors
annotations: annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations. summary: kube-state-metrics is experiencing errors in list operations.
expr: | expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
@ -798,7 +798,7 @@ spec:
- alert: KubeStateMetricsWatchErrors - alert: KubeStateMetricsWatchErrors
annotations: annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations. summary: kube-state-metrics is experiencing errors in watch operations.
expr: | expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
@ -813,7 +813,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours. summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: | expr: |
( (
@ -829,7 +829,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours. summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: | expr: |
( (
@ -845,7 +845,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace - alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left. summary: Filesystem has less than 5% space left.
expr: | expr: |
( (
@ -859,7 +859,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace - alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left. summary: Filesystem has less than 3% space left.
expr: | expr: |
( (
@ -873,7 +873,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp - alert: NodeFilesystemFilesFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours. summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: | expr: |
( (
@ -889,7 +889,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp - alert: NodeFilesystemFilesFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours. summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: | expr: |
( (
@ -905,7 +905,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles - alert: NodeFilesystemAlmostOutOfFiles
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left. summary: Filesystem has less than 5% inodes left.
expr: | expr: |
( (
@ -919,7 +919,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles - alert: NodeFilesystemAlmostOutOfFiles
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left. summary: Filesystem has less than 3% inodes left.
expr: | expr: |
( (
@ -933,7 +933,7 @@ spec:
- alert: NodeNetworkReceiveErrs - alert: NodeNetworkReceiveErrs
annotations: annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors. summary: Network interface is reporting many receive errors.
expr: | expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
@ -943,7 +943,7 @@ spec:
- alert: NodeNetworkTransmitErrs - alert: NodeNetworkTransmitErrs
annotations: annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors. summary: Network interface is reporting many transmit errors.
expr: | expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
@ -953,7 +953,7 @@ spec:
- alert: NodeHighNumberConntrackEntriesUsed - alert: NodeHighNumberConntrackEntriesUsed
annotations: annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.' description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit. summary: Number of conntrack are getting close to the limit.
expr: | expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
@ -962,7 +962,7 @@ spec:
- alert: NodeTextFileCollectorScrapeError - alert: NodeTextFileCollectorScrapeError
annotations: annotations:
description: Node Exporter text file collector failed to scrape. description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape. summary: Node Exporter text file collector failed to scrape.
expr: | expr: |
node_textfile_scrape_error{job="node-exporter"} == 1 node_textfile_scrape_error{job="node-exporter"} == 1
@ -971,7 +971,7 @@ spec:
- alert: NodeClockSkewDetected - alert: NodeClockSkewDetected
annotations: annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
summary: Clock skew detected. summary: Clock skew detected.
expr: | expr: |
( (
@ -991,7 +991,7 @@ spec:
- alert: NodeClockNotSynchronising - alert: NodeClockNotSynchronising
annotations: annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
summary: Clock not synchronising. summary: Clock not synchronising.
expr: | expr: |
min_over_time(node_timex_sync_status[5m]) == 0 min_over_time(node_timex_sync_status[5m]) == 0
@ -1003,7 +1003,7 @@ spec:
- alert: NodeRAIDDegraded - alert: NodeRAIDDegraded
annotations: annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
summary: RAID Array is degraded summary: RAID Array is degraded
expr: | expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
@ -1013,7 +1013,7 @@ spec:
- alert: NodeRAIDDiskFailure - alert: NodeRAIDDiskFailure
annotations: annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
summary: Failed device in RAID array summary: Failed device in RAID array
expr: | expr: |
node_md_disks{state="fail"} > 0 node_md_disks{state="fail"} > 0
@ -1024,7 +1024,7 @@ spec:
- alert: PrometheusOperatorListErrors - alert: PrometheusOperatorListErrors
annotations: annotations:
description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller. summary: Errors while performing list operations in controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@ -1034,7 +1034,7 @@ spec:
- alert: PrometheusOperatorWatchErrors - alert: PrometheusOperatorWatchErrors
annotations: annotations:
description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller. summary: Errors while performing watch operations in controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@ -1044,7 +1044,7 @@ spec:
- alert: PrometheusOperatorSyncFailed - alert: PrometheusOperatorSyncFailed
annotations: annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed summary: Last controller reconciliation failed
expr: | expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
@ -1054,7 +1054,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors - alert: PrometheusOperatorReconcileErrors
annotations: annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller. summary: Errors while reconciling controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
@ -1064,7 +1064,7 @@ spec:
- alert: PrometheusOperatorNodeLookupErrors - alert: PrometheusOperatorNodeLookupErrors
annotations: annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus. summary: Errors while reconciling Prometheus.
expr: | expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
@ -1074,7 +1074,7 @@ spec:
- alert: PrometheusOperatorNotReady - alert: PrometheusOperatorNotReady
annotations: annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
summary: Prometheus operator not ready summary: Prometheus operator not ready
expr: | expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
@ -1084,7 +1084,7 @@ spec:
- alert: PrometheusOperatorRejectedResources - alert: PrometheusOperatorRejectedResources
annotations: annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator summary: Resources rejected by Prometheus operator
expr: | expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
@ -1096,7 +1096,7 @@ spec:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping
summary: Pod is crash looping. summary: Pod is crash looping.
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@ -1106,7 +1106,7 @@ spec:
- alert: KubePodNotReady - alert: KubePodNotReady
annotations: annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes. summary: Pod has been in a non-ready state for more than 15 minutes.
expr: | expr: |
sum by (namespace, pod) ( sum by (namespace, pod) (
@ -1122,7 +1122,7 @@ spec:
- alert: KubeDeploymentGenerationMismatch - alert: KubeDeploymentGenerationMismatch
annotations: annotations:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back summary: Deployment generation mismatch due to possible roll-back
expr: | expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"} kube_deployment_status_observed_generation{job="kube-state-metrics"}
@ -1134,7 +1134,7 @@ spec:
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
annotations: annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas. summary: Deployment has not matched the expected number of replicas.
expr: | expr: |
( (
@ -1152,7 +1152,7 @@ spec:
- alert: KubeStatefulSetReplicasMismatch - alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas. summary: Deployment has not matched the expected number of replicas.
expr: | expr: |
( (
@ -1170,7 +1170,7 @@ spec:
- alert: KubeStatefulSetGenerationMismatch - alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back summary: StatefulSet generation mismatch due to possible roll-back
expr: | expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@ -1182,7 +1182,7 @@ spec:
- alert: KubeStatefulSetUpdateNotRolledOut - alert: KubeStatefulSetUpdateNotRolledOut
annotations: annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out. summary: StatefulSet update has not been rolled out.
expr: | expr: |
( (
@ -1208,7 +1208,7 @@ spec:
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck. summary: DaemonSet rollout is stuck.
expr: | expr: |
( (
@@ -1240,7 +1240,7 @@ spec:
    - alert: KubeContainerWaiting
      annotations:
        description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting
        summary: Pod container waiting longer than 1 hour
      expr: |
        sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@@ -1250,7 +1250,7 @@ spec:
    - alert: KubeDaemonSetNotScheduled
      annotations:
        description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled
        summary: DaemonSet pods are not scheduled.
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -1262,7 +1262,7 @@ spec:
    - alert: KubeDaemonSetMisScheduled
      annotations:
        description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled
        summary: DaemonSet pods are misscheduled.
      expr: |
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -1272,7 +1272,7 @@ spec:
    - alert: KubeJobCompletion
      annotations:
        description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion
        summary: Job did not complete in time
      expr: |
        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -1282,7 +1282,7 @@ spec:
    - alert: KubeJobFailed
      annotations:
        description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed
        summary: Job failed to complete.
      expr: |
        kube_job_failed{job="kube-state-metrics"} > 0
@@ -1292,7 +1292,7 @@ spec:
    - alert: KubeHpaReplicasMismatch
      annotations:
        description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch
        summary: HPA has not matched descired number of replicas.
      expr: |
        (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@@ -1306,7 +1306,7 @@ spec:
    - alert: KubeHpaMaxedOut
      annotations:
        description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout
        summary: HPA is running at max replicas
      expr: |
        kube_hpa_status_current_replicas{job="kube-state-metrics"}
@@ -1320,7 +1320,7 @@ spec:
    - alert: KubeCPUOvercommit
      annotations:
        description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |
        sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@@ -1334,7 +1334,7 @@ spec:
    - alert: KubeMemoryOvercommit
      annotations:
        description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |
        sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@@ -1350,7 +1350,7 @@ spec:
    - alert: KubeCPUQuotaOvercommit
      annotations:
        description: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
@@ -1363,7 +1363,7 @@ spec:
    - alert: KubeMemoryQuotaOvercommit
      annotations:
        description: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
@@ -1376,7 +1376,7 @@ spec:
    - alert: KubeQuotaAlmostFull
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull
        summary: Namespace quota is going to be full.
      expr: |
        kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -1389,7 +1389,7 @@ spec:
    - alert: KubeQuotaFullyUsed
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused
        summary: Namespace quota is fully used.
      expr: |
        kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -1402,7 +1402,7 @@ spec:
    - alert: KubeQuotaExceeded
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded
        summary: Namespace quota has exceeded the limits.
      expr: |
        kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -1415,7 +1415,7 @@ spec:
    - alert: CPUThrottlingHigh
      annotations:
        description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh
        summary: Processes experience elevated CPU throttling.
      expr: |
        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@@ -1430,7 +1430,7 @@ spec:
    - alert: KubePersistentVolumeFillingUp
      annotations:
        description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |
        kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
@@ -1443,7 +1443,7 @@ spec:
    - alert: KubePersistentVolumeFillingUp
      annotations:
        description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |
        (
@@ -1459,7 +1459,7 @@ spec:
    - alert: KubePersistentVolumeErrors
      annotations:
        description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors
        summary: PersistentVolume is having issues with provisioning.
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@@ -1471,7 +1471,7 @@ spec:
    - alert: KubeVersionMismatch
      annotations:
        description: There are {{ $value }} different semantic versions of Kubernetes components running.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch
        summary: Different semantic versions of Kubernetes components running.
      expr: |
        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@@ -1481,7 +1481,7 @@ spec:
    - alert: KubeClientErrors
      annotations:
        description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors
        summary: Kubernetes API server client is experiencing errors.
      expr: |
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@@ -1496,7 +1496,7 @@ spec:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |
        sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
@@ -1510,7 +1510,7 @@ spec:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |
        sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
@@ -1524,7 +1524,7 @@ spec:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |
        sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
@@ -1538,7 +1538,7 @@ spec:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |
        sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
@@ -1554,7 +1554,7 @@ spec:
    - alert: KubeClientCertificateExpiration
      annotations:
        description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
        summary: Client certificate is about to expire.
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -1563,7 +1563,7 @@ spec:
    - alert: KubeClientCertificateExpiration
      annotations:
        description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
        summary: Client certificate is about to expire.
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -1572,7 +1572,7 @@ spec:
    - alert: AggregatedAPIErrors
      annotations:
        description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors
        summary: An aggregated API has reported errors.
      expr: |
        sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@@ -1581,7 +1581,7 @@ spec:
    - alert: AggregatedAPIDown
      annotations:
        description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown
        summary: An aggregated API is down.
      expr: |
        (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
@@ -1591,7 +1591,7 @@ spec:
    - alert: KubeAPIDown
      annotations:
        description: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown
        summary: Target disappeared from Prometheus target discovery.
      expr: |
        absent(up{job="apiserver"} == 1)
@@ -1603,7 +1603,7 @@ spec:
    - alert: KubeNodeNotReady
      annotations:
        description: '{{ $labels.node }} has been unready for more than 15 minutes.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready
        summary: Node is not ready.
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -1613,7 +1613,7 @@ spec:
    - alert: KubeNodeUnreachable
      annotations:
        description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable
        summary: Node is unreachable.
      expr: |
        (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
@@ -1623,7 +1623,7 @@ spec:
    - alert: KubeletTooManyPods
      annotations:
        description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods
        summary: Kubelet is running at capacity.
      expr: |
        count by(node) (
@@ -1639,7 +1639,7 @@ spec:
    - alert: KubeNodeReadinessFlapping
      annotations:
        description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping
        summary: Node readiness status is flapping.
      expr: |
        sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@@ -1649,7 +1649,7 @@ spec:
    - alert: KubeletPlegDurationHigh
      annotations:
        description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh
        summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
      expr: |
        node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@@ -1659,7 +1659,7 @@ spec:
    - alert: KubeletPodStartUpLatencyHigh
      annotations:
        description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh
        summary: Kubelet Pod startup latency is too high.
      expr: |
        histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
@@ -1669,7 +1669,7 @@ spec:
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: |
        kubelet_certificate_manager_client_ttl_seconds < 604800
@@ -1678,7 +1678,7 @@ spec:
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: |
        kubelet_certificate_manager_client_ttl_seconds < 86400
@@ -1687,7 +1687,7 @@ spec:
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: |
        kubelet_certificate_manager_server_ttl_seconds < 604800
@@ -1696,7 +1696,7 @@ spec:
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: |
        kubelet_certificate_manager_server_ttl_seconds < 86400
@@ -1705,7 +1705,7 @@ spec:
    - alert: KubeletClientCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors
        summary: Kubelet has failed to renew its client certificate.
      expr: |
        increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
@@ -1715,7 +1715,7 @@ spec:
    - alert: KubeletServerCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate. summary: Kubelet has failed to renew its server certificate.
expr: | expr: |
increase(kubelet_server_expiration_renew_errors[5m]) > 0 increase(kubelet_server_expiration_renew_errors[5m]) > 0
@ -1725,7 +1725,7 @@ spec:
- alert: KubeletDown - alert: KubeletDown
annotations: annotations:
description: Kubelet has disappeared from Prometheus target discovery. description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown
summary: Target disappeared from Prometheus target discovery. summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1) absent(up{job="kubelet", metrics_path="/metrics"} == 1)
@ -1737,7 +1737,7 @@ spec:
- alert: KubeSchedulerDown - alert: KubeSchedulerDown
annotations: annotations:
description: KubeScheduler has disappeared from Prometheus target discovery. description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown
summary: Target disappeared from Prometheus target discovery. summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kube-scheduler"} == 1) absent(up{job="kube-scheduler"} == 1)
@ -1749,7 +1749,7 @@ spec:
- alert: KubeControllerManagerDown - alert: KubeControllerManagerDown
annotations: annotations:
description: KubeControllerManager has disappeared from Prometheus target discovery. description: KubeControllerManager has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery. summary: Target disappeared from Prometheus target discovery.
expr: | expr: |
absent(up{job="kube-controller-manager"} == 1) absent(up{job="kube-controller-manager"} == 1)