Merge pull request #1007 from ArthurSens/arthursens/missing-runbook-url-1001

Paweł Krupa 2021-03-05 15:36:04 +01:00 committed by GitHub
commit 5f13d38155
12 changed files with 80 additions and 16 deletions

View File

@@ -61,6 +61,7 @@ local defaults = {
 alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}',
 alertmanagerClusterLabels: 'namespace,service',
 alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 };
@@ -73,7 +74,8 @@ function(params) {
 assert std.isObject(am.config.resources),
 assert std.isObject(am.config.mixin._config),
-mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') {
+mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') {
 _config+:: am.config.mixin._config,
 },

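The same runbookURLPattern default recurs in every component below, so it can be overridden per component when the component function is instantiated; one example covers them all. A minimal sketch of such an override, assuming the Alertmanager component is vendored at the path shown and that its config is built as defaults + params the way the hunk above suggests (import path, wiki URL, and surrounding params are illustrative, not taken from this PR):

// example-runbook-override.jsonnet: illustrative sketch only.
// Assumes the component above is reachable at this vendored path and that
// the passed params object is merged over the defaults shown in the hunk.
local alertmanager = import 'kube-prometheus/components/alertmanager.libsonnet';

local am = alertmanager({
  name: 'main',
  namespace: 'monitoring',
  version: '0.21.0',
  image: 'quay.io/prometheus/alertmanager:v0.21.0',
  // Point generated runbook_url annotations at an internal wiki instead of
  // the default kube-prometheus wiki (hypothetical URL).
  mixin+: { _config+: { runbookURLPattern: 'https://runbooks.example.com/%s' } },
});

// The hidden mixin field is still addressable; its rendered alerts now carry
// runbook_url annotations built from the pattern above.
am.mixin.prometheusAlerts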
View File

@@ -28,6 +28,7 @@ local defaults = {
 ruleLabels: {},
 _config: {
 kubeStateMetricsSelector: 'job="' + defaults.name + '"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 };
@@ -46,7 +47,8 @@ function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-
 commonLabels:: ksm.config.commonLabels,
 podLabels:: ksm.config.selectorLabels,
-mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') {
+mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') {
 _config+:: ksm.config.mixin._config,
 },

View File

@@ -11,6 +11,7 @@ local defaults = {
 _config: {
 nodeExporterSelector: 'job="node-exporter"',
 hostNetworkInterfaceSelector: 'device!~"veth.+"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 };
@@ -21,7 +22,8 @@ function(params) {
 local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'),
-mixin:: alertsandrules {
+mixin:: alertsandrules +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') {
 _config+:: m.config.mixin._config,
 },

View File

@@ -29,6 +29,7 @@ local defaults = {
 nodeExporterSelector: 'job="' + defaults.name + '"',
 fsSpaceFillingUpCriticalThreshold: 15,
 diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 };
@@ -41,7 +42,8 @@ function(params) {
 assert std.isObject(ne.config.resources),
 assert std.isObject(ne.config.mixin._config),
-mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') {
+mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') {
 _config+:: ne.config.mixin._config,
 },

View File

@@ -30,6 +30,7 @@ local defaults = {
 },
 _config: {
 prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 };
@@ -41,7 +42,8 @@ function(params)
 prometheusOperator(config) {
 local po = self,
-mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') {
+mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') {
 _config+:: config.mixin._config,
 },

View File

@@ -34,6 +34,7 @@ local defaults = {
 prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"',
 prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
 thanosSelector: 'job="thanos-sidecar"',
+runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
 },
 },
 thanos: {},
@@ -47,7 +48,8 @@ function(params) {
 assert std.isObject(p.config.resources),
 assert std.isObject(p.config.mixin._config),
-mixin:: (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') + (
+mixin:: (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') +
+(import 'github.com/kubernetes-monitoring/kubernetes-mixin/alerts/add-runbook-links.libsonnet') + (
 if p.config.thanos != {} then
 (import 'github.com/thanos-io/thanos/mixin/alerts/sidecar.libsonnet') + {
 sidecar: {

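The remaining files are the regenerated PrometheusRule manifests: each alert picks up a runbook_url annotation whose last path segment is the lowercased alert name substituted into the %s of runbookURLPattern, which is what the URLs below show. A minimal sketch of that substitution, illustrative only (the exact lowercasing call used by add-runbook-links.libsonnet is an assumption; only the lowercased result is visible in the generated output):

// Illustration of the string handling, not the library's actual code.
// std.asciiLower is one way to express the lowercasing seen in the output.
std.format(
  'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
  std.asciiLower('AlertmanagerFailedReload')
)
// yields 'https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload'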
View File

@@ -17,6 +17,7 @@ spec:
 - alert: AlertmanagerFailedReload
 annotations:
 description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
 summary: Reloading an Alertmanager configuration has failed.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -28,6 +29,7 @@ spec:
 - alert: AlertmanagerMembersInconsistent
 annotations:
 description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
 summary: A member of an Alertmanager cluster has not found all other cluster members.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -41,6 +43,7 @@ spec:
 - alert: AlertmanagerFailedToSendAlerts
 annotations:
 description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
 summary: An Alertmanager instance failed to send notifications.
 expr: |
 (
@@ -55,6 +58,7 @@ spec:
 - alert: AlertmanagerClusterFailedToSendAlerts
 annotations:
 description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
 summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
 expr: |
 min by (namespace,service, integration) (
@@ -69,6 +73,7 @@ spec:
 - alert: AlertmanagerClusterFailedToSendAlerts
 annotations:
 description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
 summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
 expr: |
 min by (namespace,service, integration) (
@@ -83,6 +88,7 @@ spec:
 - alert: AlertmanagerConfigInconsistent
 annotations:
 description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
 summary: Alertmanager instances within the same cluster have different configurations.
 expr: |
 count by (namespace,service) (
@@ -95,6 +101,7 @@ spec:
 - alert: AlertmanagerClusterDown
 annotations:
 description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
 summary: Half or more of the Alertmanager instances within the same cluster are down.
 expr: |
 (
@@ -113,6 +120,7 @@ spec:
 - alert: AlertmanagerClusterCrashlooping
 annotations:
 description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
 summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
 expr: |
 (

View File

@@ -16,6 +16,7 @@ spec:
 - alert: TargetDown
 annotations:
 message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/targetdown
 expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
 for: 10m
 labels:
@@ -28,6 +29,7 @@ spec:
 and always fire against a receiver. There are integrations with various notification
 mechanisms that send a notification when this alert is not firing. For example the
 "DeadMansSnitch" integration in PagerDuty.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/watchdog
 expr: vector(1)
 labels:
 severity: none
@@ -36,6 +38,7 @@ spec:
 - alert: NodeNetworkInterfaceFlapping
 annotations:
 message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
 expr: |
 changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
 for: 2m

View File

@@ -17,6 +17,7 @@ spec:
 - alert: KubeStateMetricsListErrors
 annotations:
 description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
 summary: kube-state-metrics is experiencing errors in list operations.
 expr: |
 (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
@@ -29,6 +30,7 @@ spec:
 - alert: KubeStateMetricsWatchErrors
 annotations:
 description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
 summary: kube-state-metrics is experiencing errors in watch operations.
 expr: |
 (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))

View File

@@ -17,6 +17,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
 summary: Filesystem is predicted to run out of space within the next 24 hours.
 expr: |
 (
@@ -32,6 +33,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
 summary: Filesystem is predicted to run out of space within the next 4 hours.
 expr: |
 (
@@ -47,6 +49,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
 summary: Filesystem has less than 5% space left.
 expr: |
 (
@@ -60,6 +63,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
 summary: Filesystem has less than 3% space left.
 expr: |
 (
@@ -73,6 +77,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
 summary: Filesystem is predicted to run out of inodes within the next 24 hours.
 expr: |
 (
@@ -88,6 +93,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
 summary: Filesystem is predicted to run out of inodes within the next 4 hours.
 expr: |
 (
@@ -103,6 +109,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
 summary: Filesystem has less than 5% inodes left.
 expr: |
 (
@@ -116,6 +123,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
 summary: Filesystem has less than 3% inodes left.
 expr: |
 (
@@ -129,6 +137,7 @@ spec:
 - alert: NodeNetworkReceiveErrs
 annotations:
 description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
 summary: Network interface is reporting many receive errors.
 expr: |
 rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
@@ -138,6 +147,7 @@ spec:
 - alert: NodeNetworkTransmitErrs
 annotations:
 description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
 summary: Network interface is reporting many transmit errors.
 expr: |
 rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
@@ -147,6 +157,7 @@ spec:
 - alert: NodeHighNumberConntrackEntriesUsed
 annotations:
 description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
 summary: Number of conntrack are getting close to the limit.
 expr: |
 (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
@@ -155,6 +166,7 @@ spec:
 - alert: NodeTextFileCollectorScrapeError
 annotations:
 description: Node Exporter text file collector failed to scrape.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
 summary: Node Exporter text file collector failed to scrape.
 expr: |
 node_textfile_scrape_error{job="node-exporter"} == 1
@@ -163,6 +175,7 @@ spec:
 - alert: NodeClockSkewDetected
 annotations:
 message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
 summary: Clock skew detected.
 expr: |
 (
@@ -182,6 +195,7 @@ spec:
 - alert: NodeClockNotSynchronising
 annotations:
 message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |
 min_over_time(node_timex_sync_status[5m]) == 0
@@ -193,6 +207,7 @@ spec:
 - alert: NodeRAIDDegraded
 annotations:
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
 summary: RAID Array is degraded
 expr: |
 node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
@@ -202,6 +217,7 @@ spec:
 - alert: NodeRAIDDiskFailure
 annotations:
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
 summary: Failed device in RAID array
 expr: |
 node_md_disks{state="fail"} > 0

View File

@@ -17,6 +17,7 @@ spec:
 - alert: PrometheusOperatorListErrors
 annotations:
 description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
 summary: Errors while performing list operations in controller.
 expr: |
 (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -26,6 +27,7 @@ spec:
 - alert: PrometheusOperatorWatchErrors
 annotations:
 description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
 summary: Errors while performing watch operations in controller.
 expr: |
 (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -35,6 +37,7 @@ spec:
 - alert: PrometheusOperatorSyncFailed
 annotations:
 description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
 summary: Last controller reconciliation failed
 expr: |
 min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
@@ -44,6 +47,7 @@ spec:
 - alert: PrometheusOperatorReconcileErrors
 annotations:
 description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
 summary: Errors while reconciling controller.
 expr: |
 (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
@@ -53,6 +57,7 @@ spec:
 - alert: PrometheusOperatorNodeLookupErrors
 annotations:
 description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
 summary: Errors while reconciling Prometheus.
 expr: |
 rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
@@ -62,6 +67,7 @@ spec:
 - alert: PrometheusOperatorNotReady
 annotations:
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
 summary: Prometheus operator not ready
 expr: |
 min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
@@ -71,6 +77,7 @@ spec:
 - alert: PrometheusOperatorRejectedResources
 annotations:
 description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources
 summary: Resources rejected by Prometheus operator
 expr: |
 min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0

View File

@@ -17,6 +17,7 @@ spec:
 - alert: PrometheusBadConfig
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig
 summary: Failed Prometheus configuration reload.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -28,6 +29,7 @@ spec:
 - alert: PrometheusNotificationQueueRunningFull
 annotations:
 description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull
 summary: Prometheus alert notification queue predicted to run full in less than 30m.
 expr: |
 # Without min_over_time, failed scrapes could create false negatives, see
@@ -43,6 +45,7 @@ spec:
 - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
 annotations:
 description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers
 summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
 expr: |
 (
@@ -58,6 +61,7 @@ spec:
 - alert: PrometheusNotConnectedToAlertmanagers
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers
 summary: Prometheus is not connected to any Alertmanagers.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -69,6 +73,7 @@ spec:
 - alert: PrometheusTSDBReloadsFailing
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing
 summary: Prometheus has issues reloading blocks from disk.
 expr: |
 increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -78,6 +83,7 @@ spec:
 - alert: PrometheusTSDBCompactionsFailing
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing
 summary: Prometheus has issues compacting blocks.
 expr: |
 increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -87,6 +93,7 @@ spec:
 - alert: PrometheusNotIngestingSamples
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples
 summary: Prometheus is not ingesting samples.
 expr: |
 (
@@ -104,6 +111,7 @@ spec:
 - alert: PrometheusDuplicateTimestamps
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps
 summary: Prometheus is dropping samples with duplicate timestamps.
 expr: |
 rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -113,6 +121,7 @@ spec:
 - alert: PrometheusOutOfOrderTimestamps
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps
 summary: Prometheus drops samples with out-of-order timestamps.
 expr: |
 rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -122,6 +131,7 @@ spec:
 - alert: PrometheusRemoteStorageFailures
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures
 summary: Prometheus fails to send samples to remote storage.
 expr: |
 (
@@ -141,6 +151,7 @@ spec:
 - alert: PrometheusRemoteWriteBehind
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind
 summary: Prometheus remote write is behind.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -157,6 +168,7 @@ spec:
 - alert: PrometheusRemoteWriteDesiredShards
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards
 summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -172,6 +184,7 @@ spec:
 - alert: PrometheusRuleFailures
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures
 summary: Prometheus is failing rule evaluations.
 expr: |
 increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -181,6 +194,7 @@ spec:
 - alert: PrometheusMissingRuleEvaluations
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations
 summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
 expr: |
 increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -190,6 +204,7 @@ spec:
 - alert: PrometheusTargetLimitHit
 annotations:
 description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit
 summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
 expr: |
 increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -199,6 +214,7 @@ spec:
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
+runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager
 summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
 expr: |
 min without (alertmanager) (