Merge pull request #272 from karancode/aws_eks_cni

AWS EKS CNI Monitoring Support
This commit is contained in:
Frederic Branczyk 2019-11-05 15:53:46 +01:00 committed by GitHub
commit 6a6a43e227
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 145 additions and 11 deletions

View File

@ -653,6 +653,7 @@ As described in the [Prerequisites](#prerequisites) section, in order to retriev
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md)
#### Authentication problem
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.

42
docs/EKS-cni-support.md Normal file
View File

@ -0,0 +1,42 @@
# CNI monitoring special configuration updates for EKS
AWS EKS uses [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin for pod networking in Kubernetes using Elastic Network Interfaces on AWS
One fatal issue that can occur is that you run out of IP addresses in your eks cluster. (Generally happens due to error configs where pods keep scheduling).
You can monitor the `awscni` using kube-promethus with :
[embedmd]:# (../examples/eks-cni-example.jsonnet)
```jsonnet
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
prometheusRules+:: {
groups+: [
{
name: 'example-group',
rules: [
{
record: 'aws_eks_available_ip',
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
},
],
},
],
},
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }
```
After you have the required yaml file please run
```
kubectl apply -f manifests/prometheus-serviceMonitorAwsEksCNI.yaml
```

View File

@ -0,0 +1,26 @@
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
prometheusRules+:: {
groups+: [
{
name: 'example-group',
rules: [
{
record: 'aws_eks_available_ip',
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
},
],
},
],
},
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }

View File

@ -0,0 +1,65 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
AwsEksCniMetricService:
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) +
service.mixin.spec.withClusterIp('None'),
serviceMonitorAwsEksCNI:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'awsekscni',
namespace: $._config.namespace,
labels: {
'k8s-app': 'eks-cni',
},
},
spec: {
jobLabel: 'k8s-app',
selector: {
matchLabels: {
'k8s-app': 'aws-node',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
endpoints: [
{
port: 'cni-metrics-port',
interval: '30s',
path: '/metrics',
},
],
},
},
},
prometheusRules+: {
groups+: [
{
name: 'kube-prometheus-eks.rules',
rules: [
{
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
labels: {
severity: 'critical',
},
annotations: {
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
},
'for': '10m',
alert: 'EksAvailableIPs'
},
],
},
],
},
}

View File

@ -8,7 +8,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "fa972cf29666e821c44195c51df15b6e28ed29c4",
"version": "cbc1340af53f50728181f97f6bce442ac33d8993",
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
},
{
@ -30,7 +30,7 @@
"subdir": "grafana-builder"
}
},
"version": "1f273dd3c7a619bcd05c3e1c2650204104a273d8",
"version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
},
{
@ -83,8 +83,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "5a7b85876d6108a91f0d8673c0d7eca38687671b",
"sum": "3N77msMjqClzQHbZOxn4GTlV+FZpU+y1gCekvCvxwy0="
"version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
},
{
"name": "prometheus",
@ -94,7 +94,7 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "74726367cf7a7e8d0332238defd2e7f4169030bd",
"version": "e94503ff5c412590ce7616accdd3c62a2189bcd3",
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
},
{

View File

@ -20280,7 +20280,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n sum without (instance) (instance:node_num_cpu:sum{job=\"node-exporter\"})\n)\n",
"expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -20366,7 +20366,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_load1_per_cpu:ratio{job=\"node-exporter\"})\n)\n",
"expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -20464,7 +20464,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_memory_utilisation:ratio{job=\"node-exporter\"})\n)\n",
"expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -20864,7 +20864,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"})\n)\n",
"expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{device}}",
@ -20950,7 +20950,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"})\n)\n",
"expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{device}}",
@ -21048,7 +21048,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "(\n sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n ) \n/ ignoring (instance) group_left\n sum without (instance, device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n )\n) \n",
"expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",