mirror of
				https://github.com/prometheus-operator/kube-prometheus.git
				synced 2025-10-25 22:21:19 +02:00 
			
		
		
		
	
						commit
						c0de51e66b
					
				| @ -1,5 +1,5 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./alertmanager.rules | - name: alertmanager.rules | ||||||
|   rules: |   rules: | ||||||
|   - alert: AlertmanagerConfigInconsistent |   - alert: AlertmanagerConfigInconsistent | ||||||
|     expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) |     expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) | ||||||
| @ -11,7 +11,6 @@ groups: | |||||||
|     annotations: |     annotations: | ||||||
|       description: The configuration of the instances of the Alertmanager cluster |       description: The configuration of the instances of the Alertmanager cluster | ||||||
|         `{{$labels.service}}` are out of sync. |         `{{$labels.service}}` are out of sync. | ||||||
|       summary: Alertmanager configurations are inconsistent |  | ||||||
|   - alert: AlertmanagerDownOrMissing |   - alert: AlertmanagerDownOrMissing | ||||||
|     expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", |     expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", | ||||||
|       "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 |       "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 | ||||||
| @ -21,8 +20,7 @@ groups: | |||||||
|     annotations: |     annotations: | ||||||
|       description: An unexpected number of Alertmanagers are scraped or Alertmanagers |       description: An unexpected number of Alertmanagers are scraped or Alertmanagers | ||||||
|         disappeared from discovery. |         disappeared from discovery. | ||||||
|       summary: Alertmanager down or not discovered |   - alert: AlertmanagerFailedReload | ||||||
|   - alert: FailedReload |  | ||||||
|     expr: alertmanager_config_last_reload_successful == 0 |     expr: alertmanager_config_last_reload_successful == 0 | ||||||
|     for: 10m |     for: 10m | ||||||
|     labels: |     labels: | ||||||
| @ -30,4 +28,3 @@ groups: | |||||||
|     annotations: |     annotations: | ||||||
|       description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace |       description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace | ||||||
|         }}/{{ $labels.pod}}. |         }}/{{ $labels.pod}}. | ||||||
|       summary: Alertmanager configuration reload has failed |  | ||||||
|  | |||||||
| @ -1,5 +1,5 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./general.rules | - name: general.rules | ||||||
|   rules: |   rules: | ||||||
|   - alert: TargetDown |   - alert: TargetDown | ||||||
|     expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 |     expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 | ||||||
| @ -7,7 +7,7 @@ groups: | |||||||
|     labels: |     labels: | ||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' |       description: '{{ $value }}% of {{ $labels.job }} targets are down.' | ||||||
|       summary: Targets are down |       summary: Targets are down | ||||||
|   - alert: DeadMansSwitch |   - alert: DeadMansSwitch | ||||||
|     expr: vector(1) |     expr: vector(1) | ||||||
| @ -17,32 +17,23 @@ groups: | |||||||
|       description: This is a DeadMansSwitch meant to ensure that the entire Alerting |       description: This is a DeadMansSwitch meant to ensure that the entire Alerting | ||||||
|         pipeline is functional. |         pipeline is functional. | ||||||
|       summary: Alerting DeadMansSwitch |       summary: Alerting DeadMansSwitch | ||||||
|   - alert: TooManyOpenFileDescriptors |   - record: fd_utilization | ||||||
|     expr: 100 * (process_open_fds / process_max_fds) > 95 |  | ||||||
|     for: 10m |  | ||||||
|     labels: |  | ||||||
|       severity: critical |  | ||||||
|     annotations: |  | ||||||
|       description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |  | ||||||
|         $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' |  | ||||||
|       summary: too many open file descriptors |  | ||||||
|   - record: instance:fd_utilization |  | ||||||
|     expr: process_open_fds / process_max_fds |     expr: process_open_fds / process_max_fds | ||||||
|   - alert: FdExhaustionClose |   - alert: FdExhaustionClose | ||||||
|     expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 |     expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 | ||||||
|     for: 10m |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |       description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||||
|         $labels.instance }}) instance will exhaust in file/socket descriptors soon' |         will exhaust in file/socket descriptors within the next 4 hours' | ||||||
|       summary: file descriptors soon exhausted |       summary: file descriptors soon exhausted | ||||||
|   - alert: FdExhaustionClose |   - alert: FdExhaustionClose | ||||||
|     expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 |     expr: predict_linear(fd_utilization[10m], 3600) > 1 | ||||||
|     for: 10m |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       severity: critical |       severity: critical | ||||||
|     annotations: |     annotations: | ||||||
|       description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |       description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||||
|         $labels.instance }}) instance will exhaust in file/socket descriptors soon' |         will exhaust in file/socket descriptors within the next hour' | ||||||
|       summary: file descriptors soon exhausted |       summary: file descriptors soon exhausted | ||||||
|  | |||||||
| @ -1,22 +0,0 @@ | |||||||
| groups: |  | ||||||
| - name: ./kube-apiserver.rules |  | ||||||
|   rules: |  | ||||||
|   - alert: K8SApiserverDown |  | ||||||
|     expr: absent(up{job="apiserver"} == 1) |  | ||||||
|     for: 5m |  | ||||||
|     labels: |  | ||||||
|       severity: critical |  | ||||||
|     annotations: |  | ||||||
|       description: Prometheus failed to scrape API server(s), or all API servers have |  | ||||||
|         disappeared from service discovery. |  | ||||||
|       summary: API server unreachable |  | ||||||
|   - alert: K8SApiServerLatency |  | ||||||
|     expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) |  | ||||||
|        by (le)) / 1e+06 > 1 |  | ||||||
|     for: 10m |  | ||||||
|     labels: |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: 99th percentile Latency for {{ $labels.verb }} requests to the |  | ||||||
|         kube-apiserver is higher than 1s. |  | ||||||
|       summary: Kubernetes apiserver latency is high |  | ||||||
| @ -1,5 +1,5 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./kube-controller-manager.rules | - name: kube-controller-manager.rules | ||||||
|   rules: |   rules: | ||||||
|   - alert: K8SControllerManagerDown |   - alert: K8SControllerManagerDown | ||||||
|     expr: absent(up{job="kube-controller-manager"} == 1) |     expr: absent(up{job="kube-controller-manager"} == 1) | ||||||
|  | |||||||
| @ -1,6 +1,51 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./kube-scheduler.rules | - name: kube-scheduler.rules | ||||||
|   rules: |   rules: | ||||||
|  |   - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.99" | ||||||
|  |   - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.9" | ||||||
|  |   - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.5" | ||||||
|  |   - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.99" | ||||||
|  |   - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.9" | ||||||
|  |   - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.5" | ||||||
|  |   - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.99" | ||||||
|  |   - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.9" | ||||||
|  |   - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |     expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |       BY (le, cluster)) / 1e+06 | ||||||
|  |     labels: | ||||||
|  |       quantile: "0.5" | ||||||
|   - alert: K8SSchedulerDown |   - alert: K8SSchedulerDown | ||||||
|     expr: absent(up{job="kube-scheduler"} == 1) |     expr: absent(up{job="kube-scheduler"} == 1) | ||||||
|     for: 5m |     for: 5m | ||||||
|  | |||||||
							
								
								
									
										55
									
								
								assets/prometheus/rules/kube-state-metrics.rules.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								assets/prometheus/rules/kube-state-metrics.rules.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,55 @@ | |||||||
|  | groups: | ||||||
|  | - name: kube-state-metrics.rules | ||||||
|  |   rules: | ||||||
|  |   - alert: DeploymentGenerationMismatch | ||||||
|  |     expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation | ||||||
|  |     for: 15m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Observed deployment generation does not match expected one for | ||||||
|  |         deployment {{$labels.namespace}}/{{$labels.deployment}} | ||||||
|  |   - alert: DeploymentReplicasNotUpdated | ||||||
|  |     expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) | ||||||
|  |       or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) | ||||||
|  |       unless (kube_deployment_spec_paused == 1) | ||||||
|  |     for: 15m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} | ||||||
|  |   - alert: DaemonSetRolloutStuck | ||||||
|  |     expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled | ||||||
|  |       * 100 < 100 | ||||||
|  |     for: 15m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Only {{$value}}% of desired pods scheduled and ready for daemon | ||||||
|  |         set {{$labels.namespace}}/{{$labels.daemonset}} | ||||||
|  |   - alert: K8SDaemonSetsNotScheduled | ||||||
|  |     expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled | ||||||
|  |       > 0 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: A number of daemonsets are not scheduled. | ||||||
|  |       summary: Daemonsets are not scheduled correctly | ||||||
|  |   - alert: DaemonSetsMissScheduled | ||||||
|  |     expr: kube_daemonset_status_number_misscheduled > 0 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: A number of daemonsets are running where they are not supposed | ||||||
|  |         to run. | ||||||
|  |       summary: Daemonsets are not scheduled correctly | ||||||
|  |   - alert: PodFrequentlyRestarting | ||||||
|  |     expr: increase(kube_pod_container_status_restarts[1h]) > 5 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} | ||||||
|  |         times within the last hour | ||||||
| @ -1,5 +1,5 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./kubelet.rules | - name: kubelet.rules | ||||||
|   rules: |   rules: | ||||||
|   - alert: K8SNodeNotReady |   - alert: K8SNodeNotReady | ||||||
|     expr: kube_node_status_condition{condition="Ready",status="true"} == 0 |     expr: kube_node_status_condition{condition="Ready",status="true"} == 0 | ||||||
| @ -18,20 +18,17 @@ groups: | |||||||
|     labels: |     labels: | ||||||
|       severity: critical |       severity: critical | ||||||
|     annotations: |     annotations: | ||||||
|       description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady |       description: '{{ $value }}% of Kubernetes nodes are not ready' | ||||||
|         state).' |  | ||||||
|       summary: Many Kubernetes nodes are Not Ready |  | ||||||
|   - alert: K8SKubeletDown |   - alert: K8SKubeletDown | ||||||
|     expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 |     expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 | ||||||
|     for: 1h |     for: 1h | ||||||
|     labels: |     labels: | ||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: Prometheus failed to scrape {{ $value }}% of kubelets. |       description: Prometheus failed to scrape {{ $value }}% of kubelets. | ||||||
|       summary: Many Kubelets cannot be scraped |  | ||||||
|   - alert: K8SKubeletDown |   - alert: K8SKubeletDown | ||||||
|     expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) |     expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) | ||||||
|       > 0.1 |       * 100 > 1 | ||||||
|     for: 1h |     for: 1h | ||||||
|     labels: |     labels: | ||||||
|       severity: critical |       severity: critical | ||||||
| @ -41,36 +38,10 @@ groups: | |||||||
|       summary: Many Kubelets cannot be scraped |       summary: Many Kubelets cannot be scraped | ||||||
|   - alert: K8SKubeletTooManyPods |   - alert: K8SKubeletTooManyPods | ||||||
|     expr: kubelet_running_pod_count > 100 |     expr: kubelet_running_pod_count > 100 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: Kubelet {{$labels.instance}} is running {{$value}} pods, close |       description: Kubelet {{$labels.instance}} is running {{$value}} pods, close | ||||||
|         to the limit of 110 |         to the limit of 110 | ||||||
|       summary: Kubelet is close to pod limit |       summary: Kubelet is close to pod limit | ||||||
|   - alert: K8SDaemonSetsNotScheduled |  | ||||||
|     expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled |  | ||||||
|       > 0 |  | ||||||
|     for: 10m |  | ||||||
|     labels: |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: A number of daemonsets are not scheduled. |  | ||||||
|       summary: Daemonsets are not scheduled correctly |  | ||||||
|   - alert: K8SDaemonSetsNotRunning |  | ||||||
|     expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready |  | ||||||
|       > 0 |  | ||||||
|     for: 10m |  | ||||||
|     labels: |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: A number of daemonsets are not ready. |  | ||||||
|       summary: Daemonsets are not ready |  | ||||||
|   - alert: K8SDaemonSetsMissScheduled |  | ||||||
|     expr: kube_daemonset_status_number_misscheduled > 0 |  | ||||||
|     for: 10m |  | ||||||
|     labels: |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: A number of daemonsets are running where they are not supposed |  | ||||||
|         to run. |  | ||||||
|       summary: Daemonsets are not scheduled correctly |  | ||||||
|  | |||||||
| @ -1,115 +1,86 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./kubernetes.rules | - name: kubernetes.rules | ||||||
|   rules: |   rules: | ||||||
|   - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes |   - record: pod_name:container_memory_usage_bytes:sum | ||||||
|     expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, |     expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||||
|       "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |       (pod_name) | ||||||
|       controller, pod_name, container_name) |   - record: pod_name:container_spec_cpu_shares:sum | ||||||
|   - record: cluster_namespace_controller_pod_container:spec_cpu_shares |     expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||||
|     expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", |   - record: pod_name:container_cpu_usage:sum | ||||||
|       "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |     expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) | ||||||
|       container_name) |       BY (pod_name) | ||||||
|   - record: cluster_namespace_controller_pod_container:cpu_usage:rate |   - record: pod_name:container_fs_usage_bytes:sum | ||||||
|     expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), |     expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||||
|       "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |   - record: namespace:container_memory_usage_bytes:sum | ||||||
|       controller, pod_name, container_name) |     expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_usage:bytes |   - record: namespace:container_spec_cpu_shares:sum | ||||||
|     expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", |     expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) | ||||||
|       "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |   - record: namespace:container_cpu_usage:sum | ||||||
|       container_name) |     expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_working_set:bytes |       BY (namespace) | ||||||
|     expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, |   - record: cluster:memory_usage:ratio | ||||||
|       "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |     expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||||
|       controller, pod_name, container_name) |       (cluster) / sum(machine_memory_bytes) BY (cluster) | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_rss:bytes |   - record: cluster:container_spec_cpu_shares:ratio | ||||||
|     expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", |     expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 | ||||||
|       "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |       / sum(machine_cpu_cores) | ||||||
|       container_name) |   - record: cluster:container_cpu_usage:ratio | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_cache:bytes |     expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) | ||||||
|     expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", |       / sum(machine_cpu_cores) | ||||||
|       "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |   - record: apiserver_latency_seconds:quantile | ||||||
|       container_name) |     expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|   - record: cluster_namespace_controller_pod_container:disk_usage:bytes |       1e+06 | ||||||
|     expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", |  | ||||||
|       "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |  | ||||||
|       container_name) |  | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate |  | ||||||
|     expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), |  | ||||||
|       "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |  | ||||||
|       controller, pod_name, container_name, scope, type) |  | ||||||
|   - record: cluster_namespace_controller_pod_container:memory_oom:rate |  | ||||||
|     expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), |  | ||||||
|       "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |  | ||||||
|       controller, pod_name, container_name, scope, type) |  | ||||||
|   - record: cluster:memory_allocation:percent |  | ||||||
|     expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) |  | ||||||
|       / sum(machine_memory_bytes) BY (cluster) |  | ||||||
|   - record: cluster:memory_used:percent |  | ||||||
|     expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) |  | ||||||
|       BY (cluster) |  | ||||||
|   - record: cluster:cpu_allocation:percent |  | ||||||
|     expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} |  | ||||||
|       * ON(cluster, instance) machine_cpu_cores) BY (cluster) |  | ||||||
|   - record: cluster:node_cpu_use:percent |  | ||||||
|     expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) |  | ||||||
|       BY (cluster) |  | ||||||
|   - record: cluster_resource_verb:apiserver_latency:quantile_seconds |  | ||||||
|     expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, |  | ||||||
|       cluster, job, resource, verb)) / 1e+06 |  | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.99" |       quantile: "0.99" | ||||||
|   - record: cluster_resource_verb:apiserver_latency:quantile_seconds |   - record: apiserver_latency_seconds:quantile | ||||||
|     expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, |     expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|       cluster, job, resource, verb)) / 1e+06 |       1e+06 | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.9" |       quantile: "0.9" | ||||||
|   - record: cluster_resource_verb:apiserver_latency:quantile_seconds |   - record: apiserver_latency_seconds:quantile | ||||||
|     expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, |     expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|       cluster, job, resource, verb)) / 1e+06 |       1e+06 | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.5" |       quantile: "0.5" | ||||||
|   - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |   - alert: APIServerLatencyHigh | ||||||
|     expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |     expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||||
|       BY (le, cluster)) / 1e+06 |       > 1 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.99" |       severity: warning | ||||||
|   - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |     annotations: | ||||||
|     expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |       description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||||
|       BY (le, cluster)) / 1e+06 |         for {{$labels.verb}} {{$labels.resource}} | ||||||
|  |   - alert: APIServerLatencyHigh | ||||||
|  |     expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||||
|  |       > 4 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.9" |       severity: critical | ||||||
|   - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |     annotations: | ||||||
|     expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |       description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||||
|       BY (le, cluster)) / 1e+06 |         for {{$labels.verb}} {{$labels.resource}} | ||||||
|  |   - alert: APIServerErrorsHigh | ||||||
|  |     expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||||
|  |       * 100 > 2 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.5" |       severity: warning | ||||||
|   - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |     annotations: | ||||||
|     expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |       description: API server returns errors for {{ $value }}% of requests | ||||||
|       BY (le, cluster)) / 1e+06 |   - alert: APIServerErrorsHigh | ||||||
|  |     expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||||
|  |       * 100 > 5 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.99" |       severity: critical | ||||||
|   - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |     annotations: | ||||||
|     expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |       description: API server returns errors for {{ $value }}% of requests | ||||||
|       BY (le, cluster)) / 1e+06 |   - alert: K8SApiserverDown | ||||||
|  |     expr: absent(up{job="apiserver"} == 1) | ||||||
|  |     for: 20m | ||||||
|     labels: |     labels: | ||||||
|       quantile: "0.9" |       severity: critical | ||||||
|   - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |     annotations: | ||||||
|     expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |       description: No API servers are reachable or all have disappeared from service | ||||||
|       BY (le, cluster)) / 1e+06 |         discovery | ||||||
|     labels: |  | ||||||
|       quantile: "0.5" |  | ||||||
|   - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|     expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|       BY (le, cluster)) / 1e+06 |  | ||||||
|     labels: |  | ||||||
|       quantile: "0.99" |  | ||||||
|   - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|     expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|       BY (le, cluster)) / 1e+06 |  | ||||||
|     labels: |  | ||||||
|       quantile: "0.9" |  | ||||||
|   - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|     expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|       BY (le, cluster)) / 1e+06 |  | ||||||
|     labels: |  | ||||||
|       quantile: "0.5" |  | ||||||
|  | |||||||
| @ -1,6 +1,23 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./node.rules | - name: node.rules | ||||||
|   rules: |   rules: | ||||||
|  |   - record: instance:node_cpu:rate:sum | ||||||
|  |     expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) | ||||||
|  |       BY (instance) | ||||||
|  |   - record: instance:node_filesystem_usage:sum | ||||||
|  |     expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) | ||||||
|  |       BY (instance) | ||||||
|  |   - record: instance:node_network_receive_bytes:rate:sum | ||||||
|  |     expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) | ||||||
|  |   - record: instance:node_network_transmit_bytes:rate:sum | ||||||
|  |     expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) | ||||||
|  |   - record: instance:node_cpu:ratio | ||||||
|  |     expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) | ||||||
|  |       GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) | ||||||
|  |   - record: cluster:node_cpu:sum_rate5m | ||||||
|  |     expr: sum(rate(node_cpu{mode!="idle"}[5m])) | ||||||
|  |   - record: cluster:node_cpu:ratio | ||||||
|  |     expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) | ||||||
|   - alert: NodeExporterDown |   - alert: NodeExporterDown | ||||||
|     expr: absent(up{job="node-exporter"} == 1) |     expr: absent(up{job="node-exporter"} == 1) | ||||||
|     for: 10m |     for: 10m | ||||||
| @ -8,30 +25,20 @@ groups: | |||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: Prometheus could not scrape a node-exporter for more than 10m, |       description: Prometheus could not scrape a node-exporter for more than 10m, | ||||||
|         or node-exporters have disappeared from discovery. |         or node-exporters have disappeared from discovery | ||||||
|       summary: node-exporter cannot be scraped |   - alert: NodeDiskRunningFull | ||||||
|   - alert: K8SNodeOutOfDisk |     expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 | ||||||
|     expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 |     for: 30m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||||
|  |         full within the next 24 hours (mounted at {{$labels.mountpoint}}) | ||||||
|  |   - alert: NodeDiskRunningFull | ||||||
|  |     expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 | ||||||
|  |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       service: k8s |  | ||||||
|       severity: critical |       severity: critical | ||||||
|     annotations: |     annotations: | ||||||
|       description: '{{ $labels.node }} has run out of disk space.' |       description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||||
|       summary: Node ran out of disk space. |         full within the next 2 hours (mounted at {{$labels.mountpoint}}) | ||||||
|   - alert: K8SNodeMemoryPressure |  | ||||||
|     expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == |  | ||||||
|       1 |  | ||||||
|     labels: |  | ||||||
|       service: k8s |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: '{{ $labels.node }} is under memory pressure.' |  | ||||||
|       summary: Node is under memory pressure. |  | ||||||
|   - alert: K8SNodeDiskPressure |  | ||||||
|     expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 |  | ||||||
|     labels: |  | ||||||
|       service: k8s |  | ||||||
|       severity: warning |  | ||||||
|     annotations: |  | ||||||
|       description: '{{ $labels.node }} is under disk pressure.' |  | ||||||
|       summary: Node is under disk pressure. |  | ||||||
|  | |||||||
| @ -1,12 +1,44 @@ | |||||||
| groups: | groups: | ||||||
| - name: ./prometheus.rules | - name: prometheus.rules | ||||||
|   rules: |   rules: | ||||||
|   - alert: FailedReload |   - alert: PrometheusConfigReloadFailed | ||||||
|     expr: prometheus_config_last_reload_successful == 0 |     expr: prometheus_config_last_reload_successful == 0 | ||||||
|     for: 10m |     for: 10m | ||||||
|     labels: |     labels: | ||||||
|       severity: warning |       severity: warning | ||||||
|     annotations: |     annotations: | ||||||
|       description: Reloading Prometheus' configuration has failed for {{ $labels.namespace |       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} | ||||||
|         }}/{{ $labels.pod}}. |   - alert: PrometheusNotificationQueueRunningFull | ||||||
|       summary: Prometheus configuration reload has failed |     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ | ||||||
|  |         $labels.pod}} | ||||||
|  |   - alert: PrometheusErrorSendingAlerts | ||||||
|  |     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||||
|  |       > 0.01 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||||
|  |         $labels.pod}} to Alertmanager {{$labels.Alertmanager}} | ||||||
|  |   - alert: PrometheusErrorSendingAlerts | ||||||
|  |     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||||
|  |       > 0.03 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: critical | ||||||
|  |     annotations: | ||||||
|  |       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||||
|  |         $labels.pod}} to Alertmanager {{$labels.Alertmanager}} | ||||||
|  |   - alert: PrometheusNotConnectedToAlertmanagers | ||||||
|  |     expr: prometheus_notifications_alertmanagers_discovered < 1 | ||||||
|  |     for: 10m | ||||||
|  |     labels: | ||||||
|  |       severity: warning | ||||||
|  |     annotations: | ||||||
|  |       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected | ||||||
|  |         to any Alertmanagers | ||||||
|  | |||||||
| @ -8,7 +8,7 @@ metadata: | |||||||
| data: | data: | ||||||
|   alertmanager.rules.yaml: |+ |   alertmanager.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./alertmanager.rules |     - name: alertmanager.rules | ||||||
|       rules: |       rules: | ||||||
|       - alert: AlertmanagerConfigInconsistent |       - alert: AlertmanagerConfigInconsistent | ||||||
|         expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) |         expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) | ||||||
| @ -20,7 +20,6 @@ data: | |||||||
|         annotations: |         annotations: | ||||||
|           description: The configuration of the instances of the Alertmanager cluster |           description: The configuration of the instances of the Alertmanager cluster | ||||||
|             `{{$labels.service}}` are out of sync. |             `{{$labels.service}}` are out of sync. | ||||||
|           summary: Alertmanager configurations are inconsistent |  | ||||||
|       - alert: AlertmanagerDownOrMissing |       - alert: AlertmanagerDownOrMissing | ||||||
|         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", |         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", | ||||||
|           "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 |           "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 | ||||||
| @ -30,8 +29,7 @@ data: | |||||||
|         annotations: |         annotations: | ||||||
|           description: An unexpected number of Alertmanagers are scraped or Alertmanagers |           description: An unexpected number of Alertmanagers are scraped or Alertmanagers | ||||||
|             disappeared from discovery. |             disappeared from discovery. | ||||||
|           summary: Alertmanager down or not discovered |       - alert: AlertmanagerFailedReload | ||||||
|       - alert: FailedReload |  | ||||||
|         expr: alertmanager_config_last_reload_successful == 0 |         expr: alertmanager_config_last_reload_successful == 0 | ||||||
|         for: 10m |         for: 10m | ||||||
|         labels: |         labels: | ||||||
| @ -39,7 +37,6 @@ data: | |||||||
|         annotations: |         annotations: | ||||||
|           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace |           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace | ||||||
|             }}/{{ $labels.pod}}. |             }}/{{ $labels.pod}}. | ||||||
|           summary: Alertmanager configuration reload has failed |  | ||||||
|   etcd3.rules.yaml: |+ |   etcd3.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./etcd3.rules |     - name: ./etcd3.rules | ||||||
| @ -166,7 +163,7 @@ data: | |||||||
|           summary: high commit durations |           summary: high commit durations | ||||||
|   general.rules.yaml: |+ |   general.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./general.rules |     - name: general.rules | ||||||
|       rules: |       rules: | ||||||
|       - alert: TargetDown |       - alert: TargetDown | ||||||
|         expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 |         expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 | ||||||
| @ -174,7 +171,7 @@ data: | |||||||
|         labels: |         labels: | ||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' |           description: '{{ $value }}% of {{ $labels.job }} targets are down.' | ||||||
|           summary: Targets are down |           summary: Targets are down | ||||||
|       - alert: DeadMansSwitch |       - alert: DeadMansSwitch | ||||||
|         expr: vector(1) |         expr: vector(1) | ||||||
| @ -184,61 +181,29 @@ data: | |||||||
|           description: This is a DeadMansSwitch meant to ensure that the entire Alerting |           description: This is a DeadMansSwitch meant to ensure that the entire Alerting | ||||||
|             pipeline is functional. |             pipeline is functional. | ||||||
|           summary: Alerting DeadMansSwitch |           summary: Alerting DeadMansSwitch | ||||||
|       - alert: TooManyOpenFileDescriptors |       - record: fd_utilization | ||||||
|         expr: 100 * (process_open_fds / process_max_fds) > 95 |  | ||||||
|         for: 10m |  | ||||||
|         labels: |  | ||||||
|           severity: critical |  | ||||||
|         annotations: |  | ||||||
|           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |  | ||||||
|             $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' |  | ||||||
|           summary: too many open file descriptors |  | ||||||
|       - record: instance:fd_utilization |  | ||||||
|         expr: process_open_fds / process_max_fds |         expr: process_open_fds / process_max_fds | ||||||
|       - alert: FdExhaustionClose |       - alert: FdExhaustionClose | ||||||
|         expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 |         expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 | ||||||
|         for: 10m |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||||
|             $labels.instance }}) instance will exhaust in file/socket descriptors soon' |             will exhaust in file/socket descriptors within the next 4 hours' | ||||||
|           summary: file descriptors soon exhausted |           summary: file descriptors soon exhausted | ||||||
|       - alert: FdExhaustionClose |       - alert: FdExhaustionClose | ||||||
|         expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 |         expr: predict_linear(fd_utilization[10m], 3600) > 1 | ||||||
|         for: 10m |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           severity: critical |           severity: critical | ||||||
|         annotations: |         annotations: | ||||||
|           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ |           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||||
|             $labels.instance }}) instance will exhaust in file/socket descriptors soon' |             will exhaust in file/socket descriptors within the next hour' | ||||||
|           summary: file descriptors soon exhausted |           summary: file descriptors soon exhausted | ||||||
|   kube-apiserver.rules.yaml: |+ |  | ||||||
|     groups: |  | ||||||
|     - name: ./kube-apiserver.rules |  | ||||||
|       rules: |  | ||||||
|       - alert: K8SApiserverDown |  | ||||||
|         expr: absent(up{job="apiserver"} == 1) |  | ||||||
|         for: 5m |  | ||||||
|         labels: |  | ||||||
|           severity: critical |  | ||||||
|         annotations: |  | ||||||
|           description: Prometheus failed to scrape API server(s), or all API servers have |  | ||||||
|             disappeared from service discovery. |  | ||||||
|           summary: API server unreachable |  | ||||||
|       - alert: K8SApiServerLatency |  | ||||||
|         expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) |  | ||||||
|            by (le)) / 1e+06 > 1 |  | ||||||
|         for: 10m |  | ||||||
|         labels: |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: 99th percentile Latency for {{ $labels.verb }} requests to the |  | ||||||
|             kube-apiserver is higher than 1s. |  | ||||||
|           summary: Kubernetes apiserver latency is high |  | ||||||
|   kube-controller-manager.rules.yaml: |+ |   kube-controller-manager.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./kube-controller-manager.rules |     - name: kube-controller-manager.rules | ||||||
|       rules: |       rules: | ||||||
|       - alert: K8SControllerManagerDown |       - alert: K8SControllerManagerDown | ||||||
|         expr: absent(up{job="kube-controller-manager"} == 1) |         expr: absent(up{job="kube-controller-manager"} == 1) | ||||||
| @ -252,8 +217,53 @@ data: | |||||||
|           summary: Controller manager is down |           summary: Controller manager is down | ||||||
|   kube-scheduler.rules.yaml: |+ |   kube-scheduler.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./kube-scheduler.rules |     - name: kube-scheduler.rules | ||||||
|       rules: |       rules: | ||||||
|  |       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.99" | ||||||
|  |       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.9" | ||||||
|  |       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.5" | ||||||
|  |       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.99" | ||||||
|  |       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.9" | ||||||
|  |       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.5" | ||||||
|  |       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.99" | ||||||
|  |       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.9" | ||||||
|  |       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||||
|  |         expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) | ||||||
|  |           BY (le, cluster)) / 1e+06 | ||||||
|  |         labels: | ||||||
|  |           quantile: "0.5" | ||||||
|       - alert: K8SSchedulerDown |       - alert: K8SSchedulerDown | ||||||
|         expr: absent(up{job="kube-scheduler"} == 1) |         expr: absent(up{job="kube-scheduler"} == 1) | ||||||
|         for: 5m |         for: 5m | ||||||
| @ -264,9 +274,65 @@ data: | |||||||
|             to nodes. |             to nodes. | ||||||
|           runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler |           runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler | ||||||
|           summary: Scheduler is down |           summary: Scheduler is down | ||||||
|  |   kube-state-metrics.rules.yaml: |+ | ||||||
|  |     groups: | ||||||
|  |     - name: kube-state-metrics.rules | ||||||
|  |       rules: | ||||||
|  |       - alert: DeploymentGenerationMismatch | ||||||
|  |         expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation | ||||||
|  |         for: 15m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Observed deployment generation does not match expected one for | ||||||
|  |             deployment {{$labels.namespaces}}{{$labels.deployment}} | ||||||
|  |       - alert: DeploymentReplicasNotUpdated | ||||||
|  |         expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) | ||||||
|  |           or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) | ||||||
|  |           unless (kube_deployment_spec_paused == 1) | ||||||
|  |         for: 15m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} | ||||||
|  |       - alert: DaemonSetRolloutStuck | ||||||
|  |         expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled | ||||||
|  |           * 100 < 100 | ||||||
|  |         for: 15m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Only {{$value}}% of desired pods scheduled and ready for daemon | ||||||
|  |             set {{$labels.namespaces}}/{{$labels.daemonset}} | ||||||
|  |       - alert: K8SDaemonSetsNotScheduled | ||||||
|  |         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled | ||||||
|  |           > 0 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: A number of daemonsets are not scheduled. | ||||||
|  |           summary: Daemonsets are not scheduled correctly | ||||||
|  |       - alert: DaemonSetsMissScheduled | ||||||
|  |         expr: kube_daemonset_status_number_misscheduled > 0 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: A number of daemonsets are running where they are not supposed | ||||||
|  |             to run. | ||||||
|  |           summary: Daemonsets are not scheduled correctly | ||||||
|  |       - alert: PodFrequentlyRestarting | ||||||
|  |         expr: increase(kube_pod_container_status_restarts[1h]) > 5 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} | ||||||
|  |             times within the last hour | ||||||
|   kubelet.rules.yaml: |+ |   kubelet.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./kubelet.rules |     - name: kubelet.rules | ||||||
|       rules: |       rules: | ||||||
|       - alert: K8SNodeNotReady |       - alert: K8SNodeNotReady | ||||||
|         expr: kube_node_status_condition{condition="Ready",status="true"} == 0 |         expr: kube_node_status_condition{condition="Ready",status="true"} == 0 | ||||||
| @ -285,20 +351,17 @@ data: | |||||||
|         labels: |         labels: | ||||||
|           severity: critical |           severity: critical | ||||||
|         annotations: |         annotations: | ||||||
|           description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady |           description: '{{ $value }}% of Kubernetes nodes are not ready' | ||||||
|             state).' |  | ||||||
|           summary: Many Kubernetes nodes are Not Ready |  | ||||||
|       - alert: K8SKubeletDown |       - alert: K8SKubeletDown | ||||||
|         expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 |         expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 | ||||||
|         for: 1h |         for: 1h | ||||||
|         labels: |         labels: | ||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: Prometheus failed to scrape {{ $value }}% of kubelets. |           description: Prometheus failed to scrape {{ $value }}% of kubelets. | ||||||
|           summary: Many Kubelets cannot be scraped |  | ||||||
|       - alert: K8SKubeletDown |       - alert: K8SKubeletDown | ||||||
|         expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) |         expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) | ||||||
|           > 0.1 |           * 100 > 1 | ||||||
|         for: 1h |         for: 1h | ||||||
|         labels: |         labels: | ||||||
|           severity: critical |           severity: critical | ||||||
| @ -308,159 +371,121 @@ data: | |||||||
|           summary: Many Kubelets cannot be scraped |           summary: Many Kubelets cannot be scraped | ||||||
|       - alert: K8SKubeletTooManyPods |       - alert: K8SKubeletTooManyPods | ||||||
|         expr: kubelet_running_pod_count > 100 |         expr: kubelet_running_pod_count > 100 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close |           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close | ||||||
|             to the limit of 110 |             to the limit of 110 | ||||||
|           summary: Kubelet is close to pod limit |           summary: Kubelet is close to pod limit | ||||||
|       - alert: K8SDaemonSetsNotScheduled |  | ||||||
|         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled |  | ||||||
|           > 0 |  | ||||||
|         for: 10m |  | ||||||
|         labels: |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: A number of daemonsets are not scheduled. |  | ||||||
|           summary: Daemonsets are not scheduled correctly |  | ||||||
|       - alert: K8SDaemonSetsNotRunning |  | ||||||
|         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready |  | ||||||
|           > 0 |  | ||||||
|         for: 10m |  | ||||||
|         labels: |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: A number of daemonsets are not ready. |  | ||||||
|           summary: Daemonsets are not ready |  | ||||||
|       - alert: K8SDaemonSetsMissScheduled |  | ||||||
|         expr: kube_daemonset_status_number_misscheduled > 0 |  | ||||||
|         for: 10m |  | ||||||
|         labels: |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: A number of daemonsets are running where they are not supposed |  | ||||||
|             to run. |  | ||||||
|           summary: Daemonsets are not scheduled correctly |  | ||||||
|   kubernetes.rules.yaml: |+ |   kubernetes.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./kubernetes.rules |     - name: kubernetes.rules | ||||||
|       rules: |       rules: | ||||||
|       - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes |       - record: pod_name:container_memory_usage_bytes:sum | ||||||
|         expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, |         expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||||
|           "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |           (pod_name) | ||||||
|           controller, pod_name, container_name) |       - record: pod_name:container_spec_cpu_shares:sum | ||||||
|       - record: cluster_namespace_controller_pod_container:spec_cpu_shares |         expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||||
|         expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", |       - record: pod_name:container_cpu_usage:sum | ||||||
|           "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |         expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) | ||||||
|           container_name) |           BY (pod_name) | ||||||
|       - record: cluster_namespace_controller_pod_container:cpu_usage:rate |       - record: pod_name:container_fs_usage_bytes:sum | ||||||
|         expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), |         expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||||
|           "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |       - record: namespace:container_memory_usage_bytes:sum | ||||||
|           controller, pod_name, container_name) |         expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_usage:bytes |       - record: namespace:container_spec_cpu_shares:sum | ||||||
|         expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", |         expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) | ||||||
|           "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |       - record: namespace:container_cpu_usage:sum | ||||||
|           container_name) |         expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_working_set:bytes |           BY (namespace) | ||||||
|         expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, |       - record: cluster:memory_usage:ratio | ||||||
|           "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |         expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||||
|           controller, pod_name, container_name) |           (cluster) / sum(machine_memory_bytes) BY (cluster) | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_rss:bytes |       - record: cluster:container_spec_cpu_shares:ratio | ||||||
|         expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", |         expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 | ||||||
|           "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |           / sum(machine_cpu_cores) | ||||||
|           container_name) |       - record: cluster:container_cpu_usage:ratio | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_cache:bytes |         expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) | ||||||
|         expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", |           / sum(machine_cpu_cores) | ||||||
|           "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |       - record: apiserver_latency_seconds:quantile | ||||||
|           container_name) |         expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|       - record: cluster_namespace_controller_pod_container:disk_usage:bytes |           1e+06 | ||||||
|         expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", |  | ||||||
|           "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, |  | ||||||
|           container_name) |  | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate |  | ||||||
|         expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), |  | ||||||
|           "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |  | ||||||
|           controller, pod_name, container_name, scope, type) |  | ||||||
|       - record: cluster_namespace_controller_pod_container:memory_oom:rate |  | ||||||
|         expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), |  | ||||||
|           "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, |  | ||||||
|           controller, pod_name, container_name, scope, type) |  | ||||||
|       - record: cluster:memory_allocation:percent |  | ||||||
|         expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) |  | ||||||
|           / sum(machine_memory_bytes) BY (cluster) |  | ||||||
|       - record: cluster:memory_used:percent |  | ||||||
|         expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) |  | ||||||
|           BY (cluster) |  | ||||||
|       - record: cluster:cpu_allocation:percent |  | ||||||
|         expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} |  | ||||||
|           * ON(cluster, instance) machine_cpu_cores) BY (cluster) |  | ||||||
|       - record: cluster:node_cpu_use:percent |  | ||||||
|         expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) |  | ||||||
|           BY (cluster) |  | ||||||
|       - record: cluster_resource_verb:apiserver_latency:quantile_seconds |  | ||||||
|         expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, |  | ||||||
|           cluster, job, resource, verb)) / 1e+06 |  | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.99" |           quantile: "0.99" | ||||||
|       - record: cluster_resource_verb:apiserver_latency:quantile_seconds |       - record: apiserver_latency_seconds:quantile | ||||||
|         expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, |         expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|           cluster, job, resource, verb)) / 1e+06 |           1e+06 | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.9" |           quantile: "0.9" | ||||||
|       - record: cluster_resource_verb:apiserver_latency:quantile_seconds |       - record: apiserver_latency_seconds:quantile | ||||||
|         expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, |         expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / | ||||||
|           cluster, job, resource, verb)) / 1e+06 |           1e+06 | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.5" |           quantile: "0.5" | ||||||
|       - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |       - alert: APIServerLatencyHigh | ||||||
|         expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |         expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||||
|           BY (le, cluster)) / 1e+06 |           > 1 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.99" |           severity: warning | ||||||
|       - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |         annotations: | ||||||
|         expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |           description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||||
|           BY (le, cluster)) / 1e+06 |             for {{$labels.verb}} {{$labels.resource}} | ||||||
|  |       - alert: APIServerLatencyHigh | ||||||
|  |         expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||||
|  |           > 4 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.9" |           severity: critical | ||||||
|       - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds |         annotations: | ||||||
|         expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) |           description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||||
|           BY (le, cluster)) / 1e+06 |             for {{$labels.verb}} {{$labels.resource}} | ||||||
|  |       - alert: APIServerErrorsHigh | ||||||
|  |         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||||
|  |           * 100 > 2 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.5" |           severity: warning | ||||||
|       - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |         annotations: | ||||||
|         expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |           description: API server returns errors for {{ $value }}% of requests | ||||||
|           BY (le, cluster)) / 1e+06 |       - alert: APIServerErrorsHigh | ||||||
|  |         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||||
|  |           * 100 > 5 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.99" |           severity: critical | ||||||
|       - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |         annotations: | ||||||
|         expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |           description: API server returns errors for {{ $value }}% of requests | ||||||
|           BY (le, cluster)) / 1e+06 |       - alert: K8SApiserverDown | ||||||
|  |         expr: absent(up{job="apiserver"} == 1) | ||||||
|  |         for: 20m | ||||||
|         labels: |         labels: | ||||||
|           quantile: "0.9" |           severity: critical | ||||||
|       - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds |         annotations: | ||||||
|         expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) |           description: No API servers are reachable or all have disappeared from service | ||||||
|           BY (le, cluster)) / 1e+06 |             discovery | ||||||
|         labels: |  | ||||||
|           quantile: "0.5" |  | ||||||
|       - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|         expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|           BY (le, cluster)) / 1e+06 |  | ||||||
|         labels: |  | ||||||
|           quantile: "0.99" |  | ||||||
|       - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|         expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|           BY (le, cluster)) / 1e+06 |  | ||||||
|         labels: |  | ||||||
|           quantile: "0.9" |  | ||||||
|       - record: cluster:scheduler_binding_latency:quantile_seconds |  | ||||||
|         expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) |  | ||||||
|           BY (le, cluster)) / 1e+06 |  | ||||||
|         labels: |  | ||||||
|           quantile: "0.5" |  | ||||||
|   node.rules.yaml: |+ |   node.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./node.rules |     - name: node.rules | ||||||
|       rules: |       rules: | ||||||
|  |       - record: instance:node_cpu:rate:sum | ||||||
|  |         expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) | ||||||
|  |           BY (instance) | ||||||
|  |       - record: instance:node_filesystem_usage:sum | ||||||
|  |         expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) | ||||||
|  |           BY (instance) | ||||||
|  |       - record: instance:node_network_receive_bytes:rate:sum | ||||||
|  |         expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) | ||||||
|  |       - record: instance:node_network_transmit_bytes:rate:sum | ||||||
|  |         expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) | ||||||
|  |       - record: instance:node_cpu:ratio | ||||||
|  |         expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) | ||||||
|  |           GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) | ||||||
|  |       - record: cluster:node_cpu:sum_rate5m | ||||||
|  |         expr: sum(rate(node_cpu{mode!="idle"}[5m])) | ||||||
|  |       - record: cluster:node_cpu:ratio | ||||||
|  |         expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) | ||||||
|       - alert: NodeExporterDown |       - alert: NodeExporterDown | ||||||
|         expr: absent(up{job="node-exporter"} == 1) |         expr: absent(up{job="node-exporter"} == 1) | ||||||
|         for: 10m |         for: 10m | ||||||
| @ -468,43 +493,65 @@ data: | |||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: Prometheus could not scrape a node-exporter for more than 10m, |           description: Prometheus could not scrape a node-exporter for more than 10m, | ||||||
|             or node-exporters have disappeared from discovery. |             or node-exporters have disappeared from discovery | ||||||
|           summary: node-exporter cannot be scraped |       - alert: NodeDiskRunningFull | ||||||
|       - alert: K8SNodeOutOfDisk |         expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 | ||||||
|         expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 |         for: 30m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||||
|  |             full within the next 24 hours (mounted at {{$labels.mountpoint}}) | ||||||
|  |       - alert: NodeDiskRunningFull | ||||||
|  |         expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 | ||||||
|  |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           service: k8s |  | ||||||
|           severity: critical |           severity: critical | ||||||
|         annotations: |         annotations: | ||||||
|           description: '{{ $labels.node }} has run out of disk space.' |           description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||||
|           summary: Node ran out of disk space. |             full within the next 2 hours (mounted at {{$labels.mountpoint}}) | ||||||
|       - alert: K8SNodeMemoryPressure |  | ||||||
|         expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == |  | ||||||
|           1 |  | ||||||
|         labels: |  | ||||||
|           service: k8s |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: '{{ $labels.node }} is under memory pressure.' |  | ||||||
|           summary: Node is under memory pressure. |  | ||||||
|       - alert: K8SNodeDiskPressure |  | ||||||
|         expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 |  | ||||||
|         labels: |  | ||||||
|           service: k8s |  | ||||||
|           severity: warning |  | ||||||
|         annotations: |  | ||||||
|           description: '{{ $labels.node }} is under disk pressure.' |  | ||||||
|           summary: Node is under disk pressure. |  | ||||||
|   prometheus.rules.yaml: |+ |   prometheus.rules.yaml: |+ | ||||||
|     groups: |     groups: | ||||||
|     - name: ./prometheus.rules |     - name: prometheus.rules | ||||||
|       rules: |       rules: | ||||||
|       - alert: FailedReload |       - alert: PrometheusConfigReloadFailed | ||||||
|         expr: prometheus_config_last_reload_successful == 0 |         expr: prometheus_config_last_reload_successful == 0 | ||||||
|         for: 10m |         for: 10m | ||||||
|         labels: |         labels: | ||||||
|           severity: warning |           severity: warning | ||||||
|         annotations: |         annotations: | ||||||
|           description: Reloading Prometheus' configuration has failed for {{ $labels.namespace |           description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} | ||||||
|             }}/{{ $labels.pod}}. |       - alert: PrometheusNotificationQueueRunningFull | ||||||
|           summary: Prometheus configuration reload has failed |         expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ | ||||||
|  |             $labels.pod}} | ||||||
|  |       - alert: PrometheusErrorSendingAlerts | ||||||
|  |         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||||
|  |           > 0.01 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||||
|  |             $labels.pod}} to Alertmanager {{$labels.alertmanager}} | ||||||
|  |       - alert: PrometheusErrorSendingAlerts | ||||||
|  |         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||||
|  |           > 0.03 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: critical | ||||||
|  |         annotations: | ||||||
|  |           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||||
|  |             $labels.pod}} to Alertmanager {{$labels.alertmanager}} | ||||||
|  |       - alert: PrometheusNotConnectedToAlertmanagers | ||||||
|  |         expr: prometheus_notifications_alertmanagers_discovered < 1 | ||||||
|  |         for: 10m | ||||||
|  |         labels: | ||||||
|  |           severity: warning | ||||||
|  |         annotations: | ||||||
|  |           description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected | ||||||
|  |             to any Alertmanagers | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user