kube-prometheus: ensure triggering alerts on down targets

Frederic Branczyk 2017-06-28 10:50:17 +02:00
parent 4c42ab4fcc
commit a5533a4f6c
7 changed files with 32 additions and 28 deletions
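Note on the pattern used throughout this commit: most of the "down" alerts below move to expressions of the form absent(up{job="..."} == 1). A minimal PromQL sketch of why that matters (the job name some-job is only a placeholder):

    # Stays empty (so the alert never fires) as long as any "up" series exists
    # for the job, even if every target reports up == 0, i.e. all are down.
    absent(up{job="some-job"})

    # The "== 1" filter drops the 0-valued series first, so absent() returns 1
    # both when the job has vanished from service discovery and when all of its
    # targets are down. That is the case these alerts are meant to catch.
    absent(up{job="some-job"} == 1)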

View File

@@ -1,14 +1,14 @@
 ### Up Alerting ###
 Alert TargetDown
-  IF 100 * (count(up == 0) / count(up)) > 3
+  IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
     summary = "Targets are down",
-    description = "More than {{ $value }}% of targets are down."
+    description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
   }
 ### Dead man's switch ###
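A hedged reading of the new TargetDown expression above (the numbers here are illustrative, not from the commit): grouping by job computes the down percentage per scrape job rather than across all targets, so a small job that is entirely down can no longer be hidden by a large healthy one, and the threshold changes from 3% of all targets to 10% per job.

    # Example: with 2 of 3 kubelet targets down, the left-hand side evaluates to
    # about 66.7 for {job="kubelet"}, the alert fires for that job, and
    # {{ $labels.job }} renders as "kubelet" in the description.
    100 * (count by(job) (up == 0) / count by(job) (up)) > 10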

View File

@@ -1,5 +1,5 @@
 ALERT K8SApiserverDown
-  IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+  IF absent(up{job="apiserver"} == 1)
   FOR 5m
   LABELS {
     severity = "critical"

View File

@@ -1,5 +1,5 @@
 ALERT K8SControllerManagerDown
-  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
+  IF absent(up{job="kube-controller-manager"} == 1)
   FOR 5m
   LABELS {
     severity = "critical",
@@ -7,4 +7,5 @@ ALERT K8SControllerManagerDown
   ANNOTATIONS {
     summary = "Controller manager is down",
     description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
+    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
   }

View File

@@ -1,5 +1,5 @@
 ALERT K8SSchedulerDown
-  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
+  IF absent(up{job="kube-scheduler"} == 1)
   FOR 5m
   LABELS {
     severity = "critical",
@@ -7,4 +7,5 @@ ALERT K8SSchedulerDown
   ANNOTATIONS {
     summary = "Scheduler is down",
     description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
   }

View File

@@ -11,24 +11,24 @@ ALERT K8SNodeNotReady
 ALERT K8SManyNodesNotReady
   IF
-    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
+    count(kube_node_status_ready{condition="true"} == 0) > 1
     AND
     (
-      count by (cluster) (kube_node_status_ready{condition="true"} == 0)
+      count(kube_node_status_ready{condition="true"} == 0)
     /
-      count by (cluster) (kube_node_status_ready{condition="true"})
+      count(kube_node_status_ready{condition="true"})
     ) > 0.2
   FOR 1m
   LABELS {
     severity = "critical",
   }
   ANNOTATIONS {
-    summary = "Many K8s nodes are Not Ready",
-    description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
+    summary = "Many Kubernetes nodes are Not Ready",
+    description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
   }
 ALERT K8SKubeletDown
-  IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
+  IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
   FOR 1h
   LABELS {
     severity = "warning",
@@ -39,7 +39,7 @@ ALERT K8SKubeletDown
   }
 ALERT K8SKubeletDown
-  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+  IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
   FOR 1h
   LABELS {
     severity = "critical",

View File

@@ -1,10 +1,10 @@
 ALERT NodeExporterDown
-  IF up{job="node-exporter"} == 0
+  IF absent(up{job="node-exporter"} == 1)
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
     summary = "node-exporter cannot be scraped",
-    description = "Prometheus could not scrape a node-exporter for more than 10m.",
+    description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
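One semantic shift worth flagging in the NodeExporterDown change above (this is a reading of the PromQL, not something stated in the commit): the old expression fired one alert per down node-exporter, while the new one fires a single alert only when no node-exporter reports up at all or the job has disappeared from discovery; individually failing exporters are left to the per-job TargetDown alert.

    # Old: one firing alert per node-exporter instance with up == 0.
    up{job="node-exporter"} == 0

    # New: fires only when no up node-exporter series exists at all, matching the
    # updated description about exporters disappearing from discovery.
    absent(up{job="node-exporter"} == 1)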

View File

@@ -225,14 +225,14 @@ data:
     ### Up Alerting ###
     Alert TargetDown
-      IF 100 * (count(up == 0) / count(up)) > 3
+      IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
       FOR 10m
       LABELS {
         severity = "warning"
       }
       ANNOTATIONS {
         summary = "Targets are down",
-        description = "More than {{ $value }}% of targets are down."
+        description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
       }
     ### Dead man's switch ###
@@ -287,7 +287,7 @@ data:
       }
   kube-apiserver.rules: |+
     ALERT K8SApiserverDown
-      IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+      IF absent(up{job="apiserver"} == 1)
       FOR 5m
       LABELS {
         severity = "critical"
@@ -316,7 +316,7 @@ data:
       }
   kube-controller-manager.rules: |+
     ALERT K8SControllerManagerDown
-      IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
+      IF absent(up{job="kube-controller-manager"} == 1)
       FOR 5m
       LABELS {
         severity = "critical",
@@ -324,6 +324,7 @@ data:
       ANNOTATIONS {
         summary = "Controller manager is down",
         description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
+        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
       }
   kubelet.rules: |+
     ALERT K8SNodeNotReady
@@ -339,24 +340,24 @@ data:
     ALERT K8SManyNodesNotReady
       IF
-        count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
+        count(kube_node_status_ready{condition="true"} == 0) > 1
         AND
         (
-          count by (cluster) (kube_node_status_ready{condition="true"} == 0)
+          count(kube_node_status_ready{condition="true"} == 0)
         /
-          count by (cluster) (kube_node_status_ready{condition="true"})
+          count(kube_node_status_ready{condition="true"})
         ) > 0.2
       FOR 1m
       LABELS {
         severity = "critical",
       }
       ANNOTATIONS {
-        summary = "Many K8s nodes are Not Ready",
-        description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
+        summary = "Many Kubernetes nodes are Not Ready",
+        description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
       }
     ALERT K8SKubeletDown
-      IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
+      IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
       FOR 1h
       LABELS {
         severity = "warning",
@@ -367,7 +368,7 @@ data:
       }
     ALERT K8SKubeletDown
-      IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+      IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
       FOR 1h
       LABELS {
         severity = "critical",
@@ -560,7 +561,7 @@ data:
       histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
   kube-scheduler.rules: |+
     ALERT K8SSchedulerDown
-      IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
+      IF absent(up{job="kube-scheduler"} == 1)
       FOR 5m
       LABELS {
         severity = "critical",
@@ -568,17 +569,18 @@ data:
       ANNOTATIONS {
         summary = "Scheduler is down",
         description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
       }
   node.rules: |+
     ALERT NodeExporterDown
-      IF up{job="node-exporter"} == 0
+      IF absent(up{job="node-exporter"} == 1)
       FOR 10m
       LABELS {
         severity = "warning"
       }
       ANNOTATIONS {
         summary = "node-exporter cannot be scraped",
-        description = "Prometheus could not scrape a node-exporter for more than 10m.",
+        description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
       }
   prometheus.rules: |+
     ALERT FailedReload