mirror of
				https://github.com/prometheus-operator/kube-prometheus.git
				synced 2025-11-04 10:01:03 +01:00 
			
		
		
		
	Merge pull request #272 from karancode/aws_eks_cni
AWS EKS CNI Monitoring Support
This commit is contained in:
		
						commit
						6a6a43e227
					
				@ -653,6 +653,7 @@ As described in the [Prerequisites](#prerequisites) section, in order to retriev
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
 | 
					If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md).
 | 
				
			||||||
#### Authentication problem
 | 
					#### Authentication problem
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
 | 
					The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										42
									
								
								docs/EKS-cni-support.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								docs/EKS-cni-support.md
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,42 @@
 | 
				
			|||||||
 | 
					# CNI monitoring special configuration updates for EKS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AWS EKS uses the [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin for pod networking in Kubernetes, using Elastic Network Interfaces on AWS.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					One fatal issue that can occur is running out of IP addresses in your EKS cluster. (This generally happens due to misconfigurations where pods keep getting scheduled.)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You can monitor `awscni` using kube-prometheus with:
 | 
				
			||||||
 | 
					[embedmd]:# (../examples/eks-cni-example.jsonnet)
 | 
				
			||||||
 | 
					```jsonnet
 | 
				
			||||||
 | 
					local kp =  (import 'kube-prometheus/kube-prometheus.libsonnet') +
 | 
				
			||||||
 | 
					            (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
 | 
				
			||||||
 | 
					  _config+:: {
 | 
				
			||||||
 | 
					    namespace: 'monitoring',
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  prometheusRules+:: {
 | 
				
			||||||
 | 
					    groups+: [
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        name: 'example-group',
 | 
				
			||||||
 | 
					        rules: [
 | 
				
			||||||
 | 
					          {
 | 
				
			||||||
 | 
					            record: 'aws_eks_available_ip',
 | 
				
			||||||
 | 
					            expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					      },
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
 | 
				
			||||||
 | 
					{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
 | 
				
			||||||
 | 
					{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 | 
				
			||||||
 | 
					{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 | 
				
			||||||
 | 
					{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 | 
				
			||||||
 | 
					{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } 
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					After you have the required YAML file, please run:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					kubectl apply -f manifests/prometheus-serviceMonitorAwsEksCNI.yaml
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
							
								
								
									
										26
									
								
								examples/eks-cni-example.jsonnet
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								examples/eks-cni-example.jsonnet
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,26 @@
 | 
				
			|||||||
 | 
					local kp =  (import 'kube-prometheus/kube-prometheus.libsonnet') +
 | 
				
			||||||
 | 
					            (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
 | 
				
			||||||
 | 
					  _config+:: {
 | 
				
			||||||
 | 
					    namespace: 'monitoring',
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  prometheusRules+:: {
 | 
				
			||||||
 | 
					    groups+: [
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        name: 'example-group',
 | 
				
			||||||
 | 
					        rules: [
 | 
				
			||||||
 | 
					          {
 | 
				
			||||||
 | 
					            record: 'aws_eks_available_ip',
 | 
				
			||||||
 | 
					            expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					      },
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
 | 
				
			||||||
 | 
					{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
 | 
				
			||||||
 | 
					{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 | 
				
			||||||
 | 
					{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 | 
				
			||||||
 | 
					{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 | 
				
			||||||
 | 
					{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } 
 | 
				
			||||||
							
								
								
									
										65
									
								
								jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,65 @@
 | 
				
			|||||||
 | 
					local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
 | 
				
			||||||
 | 
					local service = k.core.v1.service;
 | 
				
			||||||
 | 
					local servicePort = k.core.v1.service.mixin.spec.portsType;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  prometheus+: {
 | 
				
			||||||
 | 
					    AwsEksCniMetricService:
 | 
				
			||||||
 | 
					        service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
 | 
				
			||||||
 | 
					        service.mixin.metadata.withNamespace('kube-system') +
 | 
				
			||||||
 | 
					        service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) +
 | 
				
			||||||
 | 
					        service.mixin.spec.withClusterIp('None'),
 | 
				
			||||||
 | 
					    serviceMonitorAwsEksCNI:
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        apiVersion: 'monitoring.coreos.com/v1',
 | 
				
			||||||
 | 
					        kind: 'ServiceMonitor',
 | 
				
			||||||
 | 
					        metadata: {
 | 
				
			||||||
 | 
					          name: 'awsekscni',
 | 
				
			||||||
 | 
					          namespace: $._config.namespace,
 | 
				
			||||||
 | 
					          labels: {
 | 
				
			||||||
 | 
					            'k8s-app': 'eks-cni',
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        spec: {
 | 
				
			||||||
 | 
					          jobLabel: 'k8s-app',
 | 
				
			||||||
 | 
					          selector: {
 | 
				
			||||||
 | 
					            matchLabels: {
 | 
				
			||||||
 | 
					              'k8s-app': 'aws-node',
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					          namespaceSelector: {
 | 
				
			||||||
 | 
					            matchNames: [
 | 
				
			||||||
 | 
					              'kube-system',
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					          endpoints: [
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					              port: 'cni-metrics-port',
 | 
				
			||||||
 | 
					              interval: '30s',
 | 
				
			||||||
 | 
					              path: '/metrics',
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					          ],
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					      },
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  prometheusRules+: {
 | 
				
			||||||
 | 
					    groups+: [
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        name: 'kube-prometheus-eks.rules',
 | 
				
			||||||
 | 
					        rules: [
 | 
				
			||||||
 | 
					          {
 | 
				
			||||||
 | 
					            expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
 | 
				
			||||||
 | 
					            labels: {
 | 
				
			||||||
 | 
					              severity: 'critical',
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            annotations: {
 | 
				
			||||||
 | 
					              message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            'for': '10m',
 | 
				
			||||||
 | 
					            alert: 'EksAvailableIPs'
 | 
				
			||||||
 | 
					          },
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					      },
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@ -8,7 +8,7 @@
 | 
				
			|||||||
          "subdir": "Documentation/etcd-mixin"
 | 
					          "subdir": "Documentation/etcd-mixin"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      },
 | 
					      },
 | 
				
			||||||
      "version": "fa972cf29666e821c44195c51df15b6e28ed29c4",
 | 
					      "version": "cbc1340af53f50728181f97f6bce442ac33d8993",
 | 
				
			||||||
      "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
 | 
					      "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@ -30,7 +30,7 @@
 | 
				
			|||||||
          "subdir": "grafana-builder"
 | 
					          "subdir": "grafana-builder"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      },
 | 
					      },
 | 
				
			||||||
      "version": "1f273dd3c7a619bcd05c3e1c2650204104a273d8",
 | 
					      "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
 | 
				
			||||||
      "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
 | 
					      "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@ -83,8 +83,8 @@
 | 
				
			|||||||
          "subdir": "docs/node-mixin"
 | 
					          "subdir": "docs/node-mixin"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      },
 | 
					      },
 | 
				
			||||||
      "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b",
 | 
					      "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
 | 
				
			||||||
      "sum": "3N77msMjqClzQHbZOxn4GTlV+FZpU+y1gCekvCvxwy0="
 | 
					      "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      "name": "prometheus",
 | 
					      "name": "prometheus",
 | 
				
			||||||
@ -94,7 +94,7 @@
 | 
				
			|||||||
          "subdir": "documentation/prometheus-mixin"
 | 
					          "subdir": "documentation/prometheus-mixin"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      },
 | 
					      },
 | 
				
			||||||
      "version": "74726367cf7a7e8d0332238defd2e7f4169030bd",
 | 
					      "version": "e94503ff5c412590ce7616accdd3c62a2189bcd3",
 | 
				
			||||||
      "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
 | 
					      "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
 | 
				
			|||||||
@ -20280,7 +20280,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n  instance:node_num_cpu:sum{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n  sum without (instance) (instance:node_num_cpu:sum{job=\"node-exporter\"})\n)\n",
 | 
					                                  "expr": "(\n  instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n  instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}}",
 | 
					                                  "legendFormat": "{{instance}}",
 | 
				
			||||||
@ -20366,7 +20366,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n  count without (instance) (instance:node_load1_per_cpu:ratio{job=\"node-exporter\"})\n)\n",
 | 
					                                  "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}}",
 | 
					                                  "legendFormat": "{{instance}}",
 | 
				
			||||||
@ -20464,7 +20464,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n  count without (instance) (instance:node_memory_utilisation:ratio{job=\"node-exporter\"})\n)\n",
 | 
					                                  "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}}",
 | 
					                                  "legendFormat": "{{instance}}",
 | 
				
			||||||
@ -20864,7 +20864,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n  count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"})\n)\n",
 | 
					                                  "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}} {{device}}",
 | 
					                                  "legendFormat": "{{instance}} {{device}}",
 | 
				
			||||||
@ -20950,7 +20950,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n  count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"})\n)\n",
 | 
					                                  "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}} {{device}}",
 | 
					                                  "legendFormat": "{{instance}} {{device}}",
 | 
				
			||||||
@ -21048,7 +21048,7 @@ items:
 | 
				
			|||||||
                          "steppedLine": false,
 | 
					                          "steppedLine": false,
 | 
				
			||||||
                          "targets": [
 | 
					                          "targets": [
 | 
				
			||||||
                              {
 | 
					                              {
 | 
				
			||||||
                                  "expr": "(\n  sum without (device) (\n    max without (fstype, mountpoint) (\n      node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n    )\n  ) \n/ ignoring (instance) group_left\n  sum without (instance, device) (\n    max without (fstype, mountpoint) (\n      node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"}\n    )\n  )\n)  \n",
 | 
					                                  "expr": "sum without (device) (\n  max without (fstype, mountpoint) (\n    node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n  )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n",
 | 
				
			||||||
                                  "format": "time_series",
 | 
					                                  "format": "time_series",
 | 
				
			||||||
                                  "intervalFactor": 2,
 | 
					                                  "intervalFactor": 2,
 | 
				
			||||||
                                  "legendFormat": "{{instance}}",
 | 
					                                  "legendFormat": "{{instance}}",
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user