From dfe9184c9ba6996faa4b9afa50c73a8939a33962 Mon Sep 17 00:00:00 2001
From: Lili Cosic
Date: Mon, 13 Jul 2020 17:30:56 +0200
Subject: [PATCH 1/2] prometheus-operator.libsonnet: Add PrometheusOperatorWatchErrors alert

---
Reviewer note (outside the commit message): the new rule raises a
"warning" when, per controller and namespace, failed watch operations
exceed 10% of all watch operations (rates taken over 1h) and the
condition holds for 15m.

 .../alerts/prometheus-operator.libsonnet | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet
index a430c505..580aa866 100644
--- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet
+++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet
@@ -4,6 +4,19 @@
       {
         name: 'prometheus-operator',
         rules: [
+          {
+            alert: 'PrometheusOperatorWatchErrors',
+            expr: |||
+              (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
+            },
+            'for': '15m',
+          },
           {
             alert: 'PrometheusOperatorReconcileErrors',
             expr: |||

From a5b71282cd4fbcdfe2c841d313b47dff8b1ca718 Mon Sep 17 00:00:00 2001
From: Lili Cosic
Date: Mon, 13 Jul 2020 17:31:38 +0200
Subject: [PATCH 2/2] manifests/prometheus-rules.yaml: Regenerate

---
Reviewer note (outside the commit message): rendered counterpart of
patch 1/2; the selector placeholder appears here expanded to
job="prometheus-operator",namespace="monitoring".

 manifests/prometheus-rules.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index aefacdf1..94c3fe4e 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -1793,6 +1793,15 @@ spec:
         severity: warning
   - name: prometheus-operator
     rules:
+    - alert: PrometheusOperatorWatchErrors
+      annotations:
+        message: Errors while performing watch operations in controller {{$labels.controller}}
+          in {{$labels.namespace}} namespace.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.1
+      for: 15m
+      labels:
+        severity: warning
     - alert: PrometheusOperatorReconcileErrors
       annotations:
         message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace