Mirror of https://github.com/prometheus-operator/kube-prometheus.git
Merge pull request #1010 from coreos/no_ingest_alert

Add alert if samples aren't ingested

Commit 9764d157b1
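The new PrometheusNotIngestingSamples rule fires when a Prometheus instance appends no samples to its TSDB head for ten minutes. As an illustrative way to inspect that signal in the expression browser (these queries are not part of the change):

    # Ingestion rate per Prometheus instance; a healthy server stays well above zero.
    rate(prometheus_tsdb_head_samples_appended_total[5m])

    # The expression added by this change; it only returns series for instances
    # that appended no samples over the last five minutes.
    rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0

Combined with for: 10m and severity: warning, the alert fires only once the condition has held continuously for ten minutes.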
					
@@ -8,6 +8,7 @@ groups:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}

  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -16,6 +17,7 @@ groups:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}

  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -25,6 +27,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}

  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -34,6 +37,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}

  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -42,6 +46,7 @@ groups:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers

  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -51,6 +56,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk

  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -60,6 +66,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks

  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -69,3 +76,12 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted

  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"
The same changes are applied to the second file in the diff, where these rules are embedded under a data: key:

@@ -539,6 +539,7 @@ data:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}

      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
@@ -547,6 +548,7 @@ data:
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}

      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
@@ -556,6 +558,7 @@ data:
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}

      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
@@ -565,6 +568,7 @@ data:
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}

      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
@@ -573,6 +577,7 @@ data:
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers

      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
@@ -582,6 +587,7 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last four hours.'
          summary: Prometheus has issues reloading data blocks from disk

      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
@@ -591,6 +597,7 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last four hours.'
          summary: Prometheus has issues compacting sample blocks

      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
@@ -600,3 +607,12 @@ data:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted

      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"
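One way to exercise the new rule outside a cluster is promtool's rule unit testing, available in newer Prometheus releases (promtool test rules). The sketch below is illustrative only; the rules file name and the namespace/pod labels are placeholders, not part of this change:

    # Hypothetical test file, run with: promtool test rules not-ingesting-test.yaml
    rule_files:
      - prometheus.rules.yaml   # placeholder path to the rules shown above
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # A counter that never increases: the head block appends no samples.
          - series: 'prometheus_tsdb_head_samples_appended_total{namespace="monitoring",pod="prometheus-k8s-0"}'
            values: '0+0x20'
        alert_rule_test:
          - eval_time: 15m
            alertname: PrometheusNotIngestingSamples
            exp_alerts:
              - exp_labels:
                  severity: warning
                  namespace: monitoring
                  pod: prometheus-k8s-0
                exp_annotations:
                  description: "Prometheus monitoring/prometheus-k8s-0 isn't ingesting samples."
                  summary: "Prometheus isn't ingesting samples"

With a flat counter the 5m rate evaluates to zero, the condition holds past the 10m for clause, and the alert is expected to be firing by the 15 minute evaluation point.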