mirror of
https://github.com/prometheus-operator/kube-prometheus.git
synced 2025-10-28 14:41:02 +01:00
Merge pull request #470 from paulfantom/sync_k8s_mixins
Sync dependencies
This commit is contained in:
commit
86b0419f59
@ -1,24 +1,6 @@
|
||||
{
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node-time',
|
||||
rules: [
|
||||
{
|
||||
alert: 'ClockSkewDetected',
|
||||
annotations: {
|
||||
message: 'Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host.',
|
||||
},
|
||||
expr: |||
|
||||
abs(node_timex_offset_seconds{%(nodeExporterSelector)s}) > 0.05
|
||||
||| % $._config,
|
||||
'for': '2m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'node-network',
|
||||
rules: [
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
"subdir": "grafana"
|
||||
}
|
||||
},
|
||||
"version": "539a90dbf63c812ad0194d8078dd776868a11c81",
|
||||
"sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE="
|
||||
"version": "57b4365eacda291b82e0d55ba7eec573a8198dda",
|
||||
"sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
@ -18,7 +18,7 @@
|
||||
"subdir": "Documentation/etcd-mixin"
|
||||
}
|
||||
},
|
||||
"version": "e5c90ebf90cb3692c26240d19406de47414a2b38",
|
||||
"version": "0eee733220fc766ff0d193d61d9124aa06493986",
|
||||
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
|
||||
},
|
||||
{
|
||||
@ -38,8 +38,8 @@
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "c459106d2d2b583dd3a83f6c75eb52abee3af764",
|
||||
"sum": "CeM3LRgUCUJTolTdMnerfMPGYmhClx7gX5ajrQVEY2Y="
|
||||
"version": "815b848ade47c2f4ee866fe5efc435acd9ad799c",
|
||||
"sum": "J3Vp0EVbxTObr6KydLXsi4Rc0ssNVAEuwLc0NQ+4wqU="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
@ -48,7 +48,7 @@
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "7ac7da1a0fe165b68cdb718b2521b560d51bd1f4",
|
||||
"version": "d3c9f46e8f1ab665db6b31446fbe23e399c9f529",
|
||||
"sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
|
||||
},
|
||||
{
|
||||
@ -69,8 +69,8 @@
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "b2d7f762bd22be3ba5e7d54a1fcecfe1092f214b",
|
||||
"sum": "NqrJQnQnRDzkCbrHg7L1zX8XPAzfoE4DS2XBEj6WC8g="
|
||||
"version": "bf3064885199f90080bec6790f2d27c5ad08184d",
|
||||
"sum": "MFXrg/dNmfAHIm+H8bEGdZ957E1Y6B6aS42iFQCE1O0="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
@ -79,7 +79,7 @@
|
||||
"subdir": "lib/promgrafonnet"
|
||||
}
|
||||
},
|
||||
"version": "b2d7f762bd22be3ba5e7d54a1fcecfe1092f214b",
|
||||
"version": "bf3064885199f90080bec6790f2d27c5ad08184d",
|
||||
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
|
||||
},
|
||||
{
|
||||
@ -89,7 +89,7 @@
|
||||
"subdir": "jsonnet/kube-state-metrics"
|
||||
}
|
||||
},
|
||||
"version": "89ede10b19d7ef0145777717351cabe14b113c01",
|
||||
"version": "ab094dffe1e5c6d59663c8a2de056cba62f6cd2c",
|
||||
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
|
||||
},
|
||||
{
|
||||
@ -99,7 +99,7 @@
|
||||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
||||
}
|
||||
},
|
||||
"version": "89ede10b19d7ef0145777717351cabe14b113c01",
|
||||
"version": "ab094dffe1e5c6d59663c8a2de056cba62f6cd2c",
|
||||
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
|
||||
},
|
||||
{
|
||||
@ -109,8 +109,8 @@
|
||||
"subdir": "slo-libsonnet"
|
||||
}
|
||||
},
|
||||
"version": "437c402c5f3ad86c3c16db8471f1649284fef0ee",
|
||||
"sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q="
|
||||
"version": "5ddd7ffc39e7a54c9aca997c2c389a8046fab0ff",
|
||||
"sum": "S7/+tnAkzVh8Li7sg7Hu4aeIQAWHCtxhRQ+k1OKjoQk="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
@ -119,8 +119,8 @@
|
||||
"subdir": "docs/node-mixin"
|
||||
}
|
||||
},
|
||||
"version": "0107bc794204f50d887898da60032da890637471",
|
||||
"sum": "VKdF0zPMSCiuIuXWblSz2VOeBaXzQ7fp40vz9sxj+Bo="
|
||||
"version": "7f5a0ea5f633594e2ab4de52c5779a5a5a40f09f",
|
||||
"sum": "P2H+7fx8/JsMEvB6cMxtxYomRwxB13M4a8VuwNSlM/E="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
@ -129,8 +129,8 @@
|
||||
"subdir": "documentation/prometheus-mixin"
|
||||
}
|
||||
},
|
||||
"version": "1c321ed047ac57e34688e40a55349c9dfe2b72c8",
|
||||
"sum": "u1YS9CVuBTcw2vks0PZbLb1gtlI/7bVGDVBZsjWFLTw=",
|
||||
"version": "fac7a4a0504404fa5d4c5abb8fcc9750bd5cbda7",
|
||||
"sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc=",
|
||||
"name": "prometheus"
|
||||
},
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,7 @@ data:
|
||||
"apiVersion": 1,
|
||||
"providers": [
|
||||
{
|
||||
"folder": "",
|
||||
"folder": "Default",
|
||||
"name": "0",
|
||||
"options": {
|
||||
"path": "/grafana-dashboard-definitions/0"
|
||||
|
||||
@ -16,7 +16,8 @@ spec:
|
||||
app: grafana
|
||||
spec:
|
||||
containers:
|
||||
- image: grafana/grafana:6.6.0
|
||||
- env: []
|
||||
image: grafana/grafana:6.6.0
|
||||
name: grafana
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
@ -93,9 +94,6 @@ spec:
|
||||
- mountPath: /grafana-dashboard-definitions/0/pod-total
|
||||
name: grafana-dashboard-pod-total
|
||||
readOnly: false
|
||||
- mountPath: /grafana-dashboard-definitions/0/pods
|
||||
name: grafana-dashboard-pods
|
||||
readOnly: false
|
||||
- mountPath: /grafana-dashboard-definitions/0/prometheus-remote-write
|
||||
name: grafana-dashboard-prometheus-remote-write
|
||||
readOnly: false
|
||||
@ -180,9 +178,6 @@ spec:
|
||||
- configMap:
|
||||
name: grafana-dashboard-pod-total
|
||||
name: grafana-dashboard-pod-total
|
||||
- configMap:
|
||||
name: grafana-dashboard-pods
|
||||
name: grafana-dashboard-pods
|
||||
- configMap:
|
||||
name: grafana-dashboard-prometheus-remote-write
|
||||
name: grafana-dashboard-prometheus-remote-write
|
||||
|
||||
@ -65,25 +65,139 @@ spec:
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||
- name: kube-apiserver-error
|
||||
rules:
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[5m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[30m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate30m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[2h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate2h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[6h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate6h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1d
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[3d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate3d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate5m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate30m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate2h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate6h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate3d
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
/
|
||||
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
record: cluster:apiserver_request_duration_seconds:mean5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
@ -95,23 +209,33 @@ spec:
|
||||
- expr: |
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |
|
||||
sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
|
||||
@ -139,35 +263,39 @@ spec:
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
@ -224,7 +352,10 @@ spec:
|
||||
sum(min(kube_pod_info) by (cluster, node))
|
||||
record: ':kube_pod_info_node_count:'
|
||||
- expr: |
|
||||
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
||||
topk by(namespace, pod) (1,
|
||||
max by (node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
@ -244,6 +375,23 @@ spec:
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||
@ -457,6 +605,47 @@ spec:
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit
|
||||
expr: |
|
||||
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds < 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||
is configured on this host.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |
|
||||
min_over_time(node_timex_sync_status[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
@ -498,9 +687,15 @@ spec:
|
||||
matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||
expr: |
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@ -510,9 +705,15 @@ spec:
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||
expr: |
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics"}
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@ -656,7 +857,7 @@ spec:
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_cpu_cores)
|
||||
>
|
||||
@ -670,7 +871,7 @@ spec:
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_memory_bytes)
|
||||
>
|
||||
@ -799,10 +1000,12 @@ spec:
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kube-apiserver-error
|
||||
- name: kube-apiserver-error-alerts
|
||||
rules:
|
||||
- alert: ErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
@ -821,6 +1024,8 @@ spec:
|
||||
severity: critical
|
||||
- alert: ErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
@ -837,118 +1042,6 @@ spec:
|
||||
labels:
|
||||
job: apiserver
|
||||
severity: warning
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[5m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[30m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate30m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[2h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate2h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[6h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate6h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1d
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[3d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate3d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate5m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate30m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate2h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate6h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate3d
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
- alert: KubeAPILatencyHigh
|
||||
@ -985,30 +1078,6 @@ spec:
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
}} of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
}} of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
@ -1053,6 +1122,27 @@ spec:
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
||||
reported errors. The number of errors have increased for it in the past
|
||||
five minutes. High values indicate that the availability of the service
|
||||
changes too often.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||
expr: |
|
||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
|
||||
It has not been available at least for the past five minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||
expr: |
|
||||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
||||
@ -1088,7 +1178,37 @@ spec:
|
||||
}} of its Pod capacity.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||
expr: |
|
||||
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
|
||||
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value
|
||||
}} times in the last 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||
expr: |
|
||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
|
||||
of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||
expr: |
|
||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||
on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
@ -1253,7 +1373,8 @@ spec:
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
|
||||
{{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
|
||||
{{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue
|
||||
}}{{ else }}{{ $labels.url }}{{ end }}.
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |
|
||||
(
|
||||
@ -1273,7 +1394,8 @@ spec:
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
|
||||
is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
|
||||
is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue
|
||||
}}{{ else }}{{ $labels.url }}{{ end }}.
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
@ -1378,17 +1500,6 @@ spec:
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
- name: node-time
|
||||
rules:
|
||||
- alert: ClockSkewDetected
|
||||
annotations:
|
||||
message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
|
||||
}}. Ensure NTP is configured correctly on this host.
|
||||
expr: |
|
||||
abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: node-network
|
||||
rules:
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user