mirror of
https://github.com/prometheus-operator/kube-prometheus.git
synced 2025-11-01 16:41:02 +01:00
Merge pull request #272 from karancode/aws_eks_cni
AWS EKS CNI Monitoring Support
This commit is contained in:
commit
6a6a43e227
@ -653,6 +653,7 @@ As described in the [Prerequisites](#prerequisites) section, in order to retriev
|
||||
|
||||
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
|
||||
|
||||
If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md)
|
||||
#### Authentication problem
|
||||
|
||||
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
|
||||
|
||||
42
docs/EKS-cni-support.md
Normal file
42
docs/EKS-cni-support.md
Normal file
@ -0,0 +1,42 @@
|
||||
# CNI monitoring special configuration updates for EKS
|
||||
|
||||
AWS EKS uses [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin for pod networking in Kubernetes using Elastic Network Interfaces on AWS
|
||||
|
||||
One fatal issue that can occur is that you run out of IP addresses in your eks cluster. (Generally happens due to error configs where pods keep scheduling).
|
||||
|
||||
You can monitor the `awscni` using kube-promethus with :
|
||||
[embedmd]:# (../examples/eks-cni-example.jsonnet)
|
||||
```jsonnet
|
||||
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
|
||||
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
|
||||
_config+:: {
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'example-group',
|
||||
rules: [
|
||||
{
|
||||
record: 'aws_eks_available_ip',
|
||||
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
};
|
||||
|
||||
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
|
||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
|
||||
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
|
||||
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
|
||||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }
|
||||
```
|
||||
|
||||
After you have the required yaml file please run
|
||||
|
||||
```
|
||||
kubectl apply -f manifests/prometheus-serviceMonitorAwsEksCNI.yaml
|
||||
```
|
||||
26
examples/eks-cni-example.jsonnet
Normal file
26
examples/eks-cni-example.jsonnet
Normal file
@ -0,0 +1,26 @@
|
||||
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
|
||||
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
|
||||
_config+:: {
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'example-group',
|
||||
rules: [
|
||||
{
|
||||
record: 'aws_eks_available_ip',
|
||||
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
};
|
||||
|
||||
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
|
||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
|
||||
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
|
||||
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
|
||||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }
|
||||
65
jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet
Normal file
65
jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet
Normal file
@ -0,0 +1,65 @@
|
||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||
local service = k.core.v1.service;
|
||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
||||
|
||||
{
|
||||
prometheus+: {
|
||||
AwsEksCniMetricService:
|
||||
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
serviceMonitorAwsEksCNI:
|
||||
{
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'awsekscni',
|
||||
namespace: $._config.namespace,
|
||||
labels: {
|
||||
'k8s-app': 'eks-cni',
|
||||
},
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'k8s-app',
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'k8s-app': 'aws-node',
|
||||
},
|
||||
},
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
endpoints: [
|
||||
{
|
||||
port: 'cni-metrics-port',
|
||||
interval: '30s',
|
||||
path: '/metrics',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'kube-prometheus-eks.rules',
|
||||
rules: [
|
||||
{
|
||||
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
|
||||
},
|
||||
'for': '10m',
|
||||
alert: 'EksAvailableIPs'
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
@ -8,7 +8,7 @@
|
||||
"subdir": "Documentation/etcd-mixin"
|
||||
}
|
||||
},
|
||||
"version": "fa972cf29666e821c44195c51df15b6e28ed29c4",
|
||||
"version": "cbc1340af53f50728181f97f6bce442ac33d8993",
|
||||
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
|
||||
},
|
||||
{
|
||||
@ -30,7 +30,7 @@
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "1f273dd3c7a619bcd05c3e1c2650204104a273d8",
|
||||
"version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
|
||||
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
|
||||
},
|
||||
{
|
||||
@ -83,8 +83,8 @@
|
||||
"subdir": "docs/node-mixin"
|
||||
}
|
||||
},
|
||||
"version": "5a7b85876d6108a91f0d8673c0d7eca38687671b",
|
||||
"sum": "3N77msMjqClzQHbZOxn4GTlV+FZpU+y1gCekvCvxwy0="
|
||||
"version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
|
||||
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
|
||||
},
|
||||
{
|
||||
"name": "prometheus",
|
||||
@ -94,7 +94,7 @@
|
||||
"subdir": "documentation/prometheus-mixin"
|
||||
}
|
||||
},
|
||||
"version": "74726367cf7a7e8d0332238defd2e7f4169030bd",
|
||||
"version": "e94503ff5c412590ce7616accdd3c62a2189bcd3",
|
||||
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
|
||||
},
|
||||
{
|
||||
|
||||
@ -20280,7 +20280,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n sum without (instance) (instance:node_num_cpu:sum{job=\"node-exporter\"})\n)\n",
|
||||
"expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
@ -20366,7 +20366,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_load1_per_cpu:ratio{job=\"node-exporter\"})\n)\n",
|
||||
"expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
@ -20464,7 +20464,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_memory_utilisation:ratio{job=\"node-exporter\"})\n)\n",
|
||||
"expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
@ -20864,7 +20864,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"})\n)\n",
|
||||
"expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{device}}",
|
||||
@ -20950,7 +20950,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"})\n)\n",
|
||||
"expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{device}}",
|
||||
@ -21048,7 +21048,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n ) \n/ ignoring (instance) group_left\n sum without (instance, device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n )\n) \n",
|
||||
"expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user