From 5743540fbb877a08a46056e10b591d0c1b7a39cf Mon Sep 17 00:00:00 2001 From: Lili Cosic Date: Wed, 15 Jul 2020 10:24:45 +0200 Subject: [PATCH 1/2] prometheus-operator.libsonnet: Add List error alert and fix threshold to Watch error alert --- .../alerts/prometheus-operator.libsonnet | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index 580aa866..a8ddd13a 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -5,15 +5,28 @@ name: 'prometheus-operator', rules: [ { - alert: 'PrometheusOperatorWatchErrors', + alert: 'PrometheusOperatorListErrors', expr: ||| - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4 ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', + message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', + }, + 'for': '15m', + }, + { + alert: 'PrometheusOperatorWatchErrors', + expr: ||| + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', }, 'for': '15m', }, From d88cb26377c100f478890637845e70864bdb8b6e Mon Sep 17 00:00:00 2001 From: Lili Cosic Date: Wed, 15 Jul 2020 10:28:03 +0200 Subject: [PATCH 2/2] manifests/prometheus-rules.yaml: Regenerate --- manifests/prometheus-rules.yaml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 94c3fe4e..3bd6d63a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1793,12 +1793,21 @@ spec: severity: warning - name: prometheus-operator rules: - - alert: PrometheusOperatorWatchErrors + - alert: PrometheusOperatorListErrors annotations: - message: Errors while performing watch operations in controller {{$labels.controller}} + message: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + message: Errors while performing Watch operations in controller {{$labels.controller}} + in {{$labels.namespace}} namespace. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4 for: 15m labels: severity: warning