Merge pull request #882 from paulfantom/deps

This commit is contained in:
Paweł Krupa 2021-01-21 11:50:10 +01:00 committed by GitHub
commit a89a364ece
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 259 additions and 95 deletions

View File

@ -80,7 +80,7 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "release-2.23",
"version": "release-2.24",
"name": "prometheus"
},
{

View File

@ -8,8 +8,8 @@
"subdir": "grafana"
}
},
"version": "7176a6d54b3b19e0529ce574ab5ed427f1c721e9",
"sum": "IrxVMYJrTbDliaVMXX72jUKm8Ju2Za8cAbds7d26wuY="
"version": "4204279da8d3d6317116ee161ac706fadbba9193",
"sum": "VUavLhri7lTnJ2V7F9lDlL+K96NwIhqqlxMtasYBs3Q="
},
{
"source": {
@ -18,7 +18,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "ca866c02422ff3f3d1f0876898a30c33dd7bcccf",
"version": "5dcd459ae9c7948f5620002f5b0bb9cf0b8f1502",
"sum": "bLqTqEr0jky9zz5MV/7ucn6H5mph2NlXas0TVnGNB1Y="
},
{
@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "356bd73e4792ffe107725776ca8946895969c191",
"sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU="
"version": "b0d72d6ed0e9fcab83fc2dd954b3bd57113e768c",
"sum": "g2UC37YmOShdIFThAO99Uw89UO+H3sHt+y0ionv9/sA="
},
{
"source": {
@ -38,8 +38,8 @@
"subdir": "grafana-builder"
}
},
"version": "9c3fb8096e1f80e2f3a84566566906ff187f5a8c",
"sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k="
"version": "2cef89cb717c8b596443ac5de0415d1ffdb42252",
"sum": "EmHrmBY8PbnV0BKXmVWvAEmax6eglRinKSyZbTmVWuc="
},
{
"source": {
@ -59,8 +59,8 @@
"subdir": ""
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE="
"version": "4a8e078147dbca51067521e6ac59c7b54d44d3bd",
"sum": "D5XwKXhd3c0e+1D5iRgUhStB0qpcT4dSCmytuGQa3+k="
},
{
"source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"version": "4a8e078147dbca51067521e6ac59c7b54d44d3bd",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4",
"version": "72d6d3106861f992b7d6ecc0a88abe9b12ad5427",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
@ -99,7 +99,7 @@
"subdir": "jsonnet/mixin"
}
},
"version": "5555f492df250168657b72bb8cb60bec071de71f",
"version": "788d4456425eaf8c1d613582995bdf7de02154b0",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U="
},
{
@ -119,8 +119,8 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "193ebba04d1e70d971047e983a0b489112610460",
"sum": "QcftU7gjCQyj7B6M4YJeCAeaPd0kwxd4J4rolo7AnLE=",
"version": "3f46b62d75da4d68d2098388797e6a61fcc5e043",
"sum": "VP1vn/WTGLZaBgGhGMUO81qNTc/fnp5KtzVjcaxad6Q=",
"name": "alertmanager"
},
{
@ -130,7 +130,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "8b466360a35581e0301bd22918be7011cf4203c3",
"version": "cfdd9dd0c983057df5e814e067fadbf8c7781559",
"sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8="
},
{
@ -140,8 +140,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "26d89b4b0776fe4cd5a3656dfa520f119a375273",
"sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=",
"version": "e4487274853c587717006eeda8804e597d120340",
"sum": "6kUzElCBWZ5U/3cxEpHNMmoKKPubG45QxpmLu8PGg08=",
"name": "prometheus"
},
{

View File

@ -55,17 +55,31 @@ spec:
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
summary: All Alertmanager instances in a cluster failed to send notifications.
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |
min by (namespace,service) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.

View File

@ -3,6 +3,11 @@ data:
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0=
kind: Secret
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-datasources
namespace: monitoring
type: Opaque

View File

@ -1729,6 +1729,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-apiserver
namespace: monitoring
- apiVersion: v1
@ -3595,6 +3600,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-cluster-total
namespace: monitoring
- apiVersion: v1
@ -4730,6 +4740,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-controller-manager
namespace: monitoring
- apiVersion: v1
@ -7296,6 +7311,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-cluster
namespace: monitoring
- apiVersion: v1
@ -9566,6 +9586,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-namespace
namespace: monitoring
- apiVersion: v1
@ -10528,6 +10553,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-node
namespace: monitoring
- apiVersion: v1
@ -12284,6 +12314,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-pod
namespace: monitoring
- apiVersion: v1
@ -14302,6 +14337,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-workload
namespace: monitoring
- apiVersion: v1
@ -16481,6 +16521,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-k8s-resources-workloads-namespace
namespace: monitoring
- apiVersion: v1
@ -18998,6 +19043,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-kubelet
namespace: monitoring
- apiVersion: v1
@ -20446,6 +20496,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-namespace-by-pod
namespace: monitoring
- apiVersion: v1
@ -22166,6 +22221,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-namespace-by-workload
namespace: monitoring
- apiVersion: v1
@ -23114,6 +23174,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-node-cluster-rsrc-use
namespace: monitoring
- apiVersion: v1
@ -24089,6 +24154,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-node-rsrc-use
namespace: monitoring
- apiVersion: v1
@ -25070,6 +25140,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-nodes
namespace: monitoring
- apiVersion: v1
@ -25262,7 +25337,7 @@ items:
"tableColumn": "",
"targets": [
{
"expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n",
"expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
@ -25459,7 +25534,7 @@ items:
"tableColumn": "",
"targets": [
{
"expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n",
"expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
@ -25631,6 +25706,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-persistentvolumesusage
namespace: monitoring
- apiVersion: v1
@ -26843,6 +26923,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-pod-total
namespace: monitoring
- apiVersion: v1
@ -26868,7 +26953,7 @@ items:
"links": [
],
"refresh": "",
"refresh": "60s",
"rows": [
{
"collapse": false,
@ -27119,7 +27204,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n",
"expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n- \n (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@ -27704,7 +27789,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}",
"expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@ -28009,7 +28094,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@ -28102,7 +28187,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@ -28195,7 +28280,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@ -28348,7 +28433,7 @@ items:
"schemaVersion": 14,
"style": "dark",
"tags": [
"prometheus-mixin"
],
"templating": {
"list": [
@ -28492,11 +28577,16 @@ items:
]
},
"timezone": "browser",
"title": "Prometheus Remote Write",
"title": "Prometheus / Remote Write",
"version": 0
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-prometheus-remote-write
namespace: monitoring
- apiVersion: v1
@ -28515,7 +28605,7 @@ items:
"links": [
],
"refresh": "10s",
"refresh": "60s",
"rows": [
{
"collapse": false,
@ -29594,7 +29684,7 @@ items:
"schemaVersion": 14,
"style": "dark",
"tags": [
"prometheus-mixin"
],
"templating": {
"list": [
@ -29702,12 +29792,17 @@ items:
]
},
"timezone": "utc",
"title": "Prometheus Overview",
"title": "Prometheus / Overview",
"uid": "",
"version": 0
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-prometheus
namespace: monitoring
- apiVersion: v1
@ -30923,6 +31018,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-proxy
namespace: monitoring
- apiVersion: v1
@ -31981,6 +32081,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-scheduler
namespace: monitoring
- apiVersion: v1
@ -32893,6 +32998,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-statefulset
namespace: monitoring
- apiVersion: v1
@ -34315,6 +34425,11 @@ items:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboard-workload-total
namespace: monitoring
kind: ConfigMapList

View File

@ -17,5 +17,10 @@ data:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana-dashboards
namespace: monitoring

View File

@ -2,21 +2,28 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
checksum/grafana-dashboards: b02ae450c84445cbaca8c685eefaec6c
checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a
checksum/grafana-datasources: a77789e5440a1e51e204e99e2f0f480a
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
spec:
containers:
- env: []

View File

@ -2,7 +2,10 @@ apiVersion: v1
kind: Service
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.3.5
name: grafana
namespace: monitoring
spec:
@ -11,5 +14,7 @@ spec:
port: 3000
targetPort: http
selector:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
type: NodePort

View File

@ -15,11 +15,11 @@ spec:
rules:
- alert: KubePodCrashLooping
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes (averaged over the last 10 minutes).
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping
summary: Pod is crash looping.
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) * 60 * 5 > 0
for: 15m
labels:
severity: warning
@ -499,11 +499,11 @@ spec:
severity: critical
- alert: AggregatedAPIErrors
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors
summary: An aggregated API has reported errors.
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[10m])) > 4
labels:
severity: warning
- alert: AggregatedAPIDown
@ -526,6 +526,16 @@ spec:
for: 15m
labels:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests
summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
- name: kubernetes-system-kubelet
rules:
- alert: KubeNodeNotReady
@ -1102,77 +1112,80 @@ spec:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:

View File

@ -202,9 +202,9 @@ spec:
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
)
* 100
> 3