From 19624d9defb9082e0c31f449371a2170b56ae6b8 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 01:45:55 +0900 Subject: [PATCH 01/29] add aws_eks_cni serviceMonitor --- .../prometheus/prometheus.libsonnet | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 42baf52a..324e7da6 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -440,5 +440,37 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; ], }, }, + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awscni', + namespace: p.namespace, + labels: { + 'k8s-app': 'aws-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-cni', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], + }, + }, }, } From 3640448229ce65c3dab0117c35310d278f105a4b Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 02:03:58 +0900 Subject: [PATCH 02/29] fix name --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 324e7da6..860623bc 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -445,17 +445,17 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: { - name: 'awscni', + name: 'awsekscni', namespace: p.namespace, labels: { - 'k8s-app': 'aws-cni', + 'k8s-app': 'eks-cni', }, }, spec: { jobLabel: 'k8s-app', selector: { matchLabels: { - 'k8s-app': 'aws-cni', + 'k8s-app': 'eks-cni', }, }, namespaceSelector: { From 3b8e685082c403c9844895e972e4a4dd1ee3febd Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 02:24:07 +0900 Subject: [PATCH 03/29] add aws-eks-cni service --- examples/kustomize.jsonnet | 2 +- .../kube-prometheus-aws-eks-cni.libsonnet | 13 +++ jsonnetfile.lock.json | 14 +-- kustomization.yaml | 1 + manifests/grafana-dashboardDefinitions.yaml | 98 ++++++++++++------- manifests/prometheus-rules.yaml | 16 +++ .../prometheus-serviceMonitorAwsEksCNI.yaml | 19 ++++ 7 files changed, 119 insertions(+), 44 deletions(-) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet create mode 100644 manifests/prometheus-serviceMonitorAwsEksCNI.yaml diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index db3ba344..438e69b1 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,5 +1,5 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + { + (import 'kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet') + { _config+:: { namespace: 'monitoring', }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet new file mode 100644 index 00000000..e6dce38b --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet @@ -0,0 +1,13 @@ +local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + kubePrometheusAwsEksCniMetricService: + service.new('aws-eks-cni', { 'k8s-app' : 'eks-cni' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'eks-cni' }) + + service.mixin.spec.withClusterIp('None'), + }, +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5d054e2b..66bfafd6 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -7,7 +7,7 @@ "directory": "jsonnet/kube-prometheus" } }, - "version": "" + "version": "aws_eks_cni" }, { "name": "ksonnet", @@ -27,7 +27,7 @@ "subdir": "" } }, - "version": "15ddfa20a6921ffbd43172eb54f6bdc1bcf8d3d3" + "version": "3ad401ea3ef7fb0879298fa411772984ffa7f31f" }, { "name": "grafonnet", @@ -37,7 +37,7 @@ "subdir": "grafonnet" } }, - "version": "69bc267211790a1c3f4ea6e6211f3e8ffe22f987" + "version": "47db72da03fc4a7a0658a87791e13c3315a3a252" }, { "name": "grafana-builder", @@ -47,7 +47,7 @@ "subdir": "grafana-builder" } }, - "version": "e59d64a96a73e65ba53ba7fe05c9598974cc4a52" + "version": "3fe9a46d5fe0b70cbcabec1d2054f8ac3b3faae7" }, { "name": "grafana", @@ -77,7 +77,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "3ef2ad8e115449a7004b628a873e2629855ed468" + "version": "5dc12f27251ad6f5f0744ad33ea7d731480f4b87" }, { "name": "prometheus", @@ -87,7 +87,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "b05b5f9a300b0209689c06d70f676291f23774c4" + "version": "b5a16a8f861c29799f9a903f1e0859f513e862ed" }, { "name": "node-mixin", @@ -107,7 +107,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "15ddfa20a6921ffbd43172eb54f6bdc1bcf8d3d3" + "version": "3ad401ea3ef7fb0879298fa411772984ffa7f31f" } ] } diff --git a/kustomization.yaml b/kustomization.yaml index a580ed8e..79f9624f 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -62,6 +62,7 @@ resources: - ./manifests/prometheus-serviceAccount.yaml - ./manifests/prometheus-serviceMonitor.yaml - ./manifests/prometheus-serviceMonitorApiserver.yaml +- ./manifests/prometheus-serviceMonitorAwsEksCNI.yaml - ./manifests/prometheus-serviceMonitorCoreDNS.yaml - ./manifests/prometheus-serviceMonitorKubeControllerManager.yaml - ./manifests/prometheus-serviceMonitorKubeScheduler.yaml diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1065be04..9ab00dad 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -5522,7 +5522,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5540,7 +5540,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5558,7 +5558,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5567,7 +5567,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5576,7 +5576,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5585,7 +5585,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8717,7 +8717,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kubelet\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -8801,7 +8801,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_pod_count{job=\"kubelet\", instance=~\"$instance\"})", + "expr": "sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -8885,7 +8885,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_container_count{job=\"kubelet\", instance=~\"$instance\"})", + "expr": "sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -8969,7 +8969,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(volume_manager_total_volumes{job=\"kubelet\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\", state=\"actual_state_of_world\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -9053,7 +9053,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(volume_manager_total_volumes{job=\"kubelet\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\",state=\"desired_state_of_world\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -9137,7 +9137,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(kubelet_node_config_error{job=\"kubelet\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -9217,7 +9217,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_runtime_operations_total{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_type}}", @@ -9308,7 +9308,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_type}}", @@ -9412,7 +9412,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_type}}", @@ -9516,14 +9516,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} pod", "refId": "A" }, { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} worker", @@ -9614,14 +9614,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} pod", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} worker", @@ -9727,7 +9727,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(storage_operation_duration_seconds_count{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", @@ -9820,7 +9820,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(storage_operation_errors_total{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", @@ -9926,7 +9926,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", @@ -10030,7 +10030,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{operation_type}}", @@ -10121,7 +10121,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{operation_type}}", @@ -10226,7 +10226,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10317,7 +10317,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10421,7 +10421,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10525,28 +10525,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kubelet\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kubelet\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kubelet\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kubelet\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -10650,7 +10650,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{verb}} {{url}}", @@ -10754,7 +10754,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kubelet\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10845,7 +10845,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kubelet\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10936,7 +10936,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kubelet\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -11020,6 +11020,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -11030,7 +11056,7 @@ items: "options": [ ], - "query": "label_values(kubelet_runtime_operations{job=\"kubelet\"}, instance)", + "query": "label_values(kubelet_runtime_operations{cluster=\"$cluster\", job=\"kubelet\"}, instance)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 007b3548..63a606c4 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -79,6 +79,22 @@ spec: rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m]) ) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + - expr: | + container_memory_working_set_bytes{job="kubelet", image!=""} + * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="kubelet", image!=""} + * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="kubelet", image!=""} + * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="kubelet", image!=""} + * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + record: node_namespace_pod_container:container_memory_swap - expr: | sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace) record: namespace:container_memory_usage_bytes:sum diff --git a/manifests/prometheus-serviceMonitorAwsEksCNI.yaml b/manifests/prometheus-serviceMonitorAwsEksCNI.yaml new file mode 100644 index 00000000..07d14f1f --- /dev/null +++ b/manifests/prometheus-serviceMonitorAwsEksCNI.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: eks-cni + name: awsekscni + namespace: monitoring +spec: + endpoints: + - interval: 30s + path: /metrics + port: cni-metrics-port + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: eks-cni From 55db3208da2911c6fa29991a15a7eb8e2677dbd5 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 02:33:55 +0900 Subject: [PATCH 04/29] fix names for service --- examples/kustomize.jsonnet | 1 + jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet | 4 ++-- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 438e69b1..2bd3c532 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,4 +1,5 @@ local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet') + { _config+:: { namespace: 'monitoring', diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet index e6dce38b..1fd3b9a3 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet @@ -5,9 +5,9 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; { prometheus+: { kubePrometheusAwsEksCniMetricService: - service.new('aws-eks-cni', { 'k8s-app' : 'eks-cni' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.new('aws-aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + service.mixin.metadata.withNamespace('kube-system') + - service.mixin.metadata.withLabels({ 'k8s-app': 'eks-cni' }) + + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + service.mixin.spec.withClusterIp('None'), }, } diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 860623bc..a2e311b1 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -455,7 +455,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; jobLabel: 'k8s-app', selector: { matchLabels: { - 'k8s-app': 'eks-cni', + 'k8s-app': 'aws-node', }, }, namespaceSelector: { From c156f21d50526cb38c8ab34ac7f4f5b44b0d198f Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 02:44:57 +0900 Subject: [PATCH 05/29] bugfix service name --- jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet index 1fd3b9a3..65914040 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet @@ -4,8 +4,8 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; { prometheus+: { - kubePrometheusAwsEksCniMetricService: - service.new('aws-aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + AwsEksCniMetricService: + service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + service.mixin.metadata.withNamespace('kube-system') + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + service.mixin.spec.withClusterIp('None'), From 5cc6daab4aeceee80adcc788cbc8449e0019fb98 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 02:57:40 +0900 Subject: [PATCH 06/29] add aws eks cni service yaml --- kustomization.yaml | 1 + manifests/prometheus-AwsEksCniMetricService.yaml | 15 +++++++++++++++ manifests/prometheus-serviceMonitorAwsEksCNI.yaml | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 manifests/prometheus-AwsEksCniMetricService.yaml diff --git a/kustomization.yaml b/kustomization.yaml index 79f9624f..d8ecbcb6 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -39,6 +39,7 @@ resources: - ./manifests/node-exporter-service.yaml - ./manifests/node-exporter-serviceAccount.yaml - ./manifests/node-exporter-serviceMonitor.yaml +- ./manifests/prometheus-AwsEksCniMetricService.yaml - ./manifests/prometheus-adapter-apiService.yaml - ./manifests/prometheus-adapter-clusterRole.yaml - ./manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml diff --git a/manifests/prometheus-AwsEksCniMetricService.yaml b/manifests/prometheus-AwsEksCniMetricService.yaml new file mode 100644 index 00000000..4e3ee1dd --- /dev/null +++ b/manifests/prometheus-AwsEksCniMetricService.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: aws-node + name: aws-node + namespace: kube-system +spec: + clusterIP: None + ports: + - name: cni-metrics-port + port: 61678 + targetPort: 61678 + selector: + k8s-app: aws-node diff --git a/manifests/prometheus-serviceMonitorAwsEksCNI.yaml b/manifests/prometheus-serviceMonitorAwsEksCNI.yaml index 07d14f1f..bcb9d1e9 100644 --- a/manifests/prometheus-serviceMonitorAwsEksCNI.yaml +++ b/manifests/prometheus-serviceMonitorAwsEksCNI.yaml @@ -16,4 +16,4 @@ spec: - kube-system selector: matchLabels: - k8s-app: eks-cni + k8s-app: aws-node From 648db9d544e2c74fa214b8ea732b80e29ec56bc2 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 03:24:31 +0900 Subject: [PATCH 07/29] add readme --- README.md | 1 + docs/EKS-cni-support.md | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 docs/EKS-cni-support.md diff --git a/README.md b/README.md index 6fcdb3c4..604b7e7e 100644 --- a/README.md +++ b/README.md @@ -645,6 +645,7 @@ As described in the [Prerequisites](#prerequisites) section, in order to retriev If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md). +If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md) #### Authentication problem The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations. diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md new file mode 100644 index 00000000..b75b749b --- /dev/null +++ b/docs/EKS-cni-support.md @@ -0,0 +1,22 @@ +# CNI monitoring special configuration updates for EKS + +AWS EKS uses [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin for pod networking in Kubernetes using Elastic Network Interfaces on AWS + +One fatal issue that can occur is that you run out of IP addresses in your eks cluster. (Generally happens due to error configs where pods keep scheduling). + +You can monitor the `awscni` using kube-promethus with : +``` +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet') + + { + _config+:: { + # ... config here + } + }; +``` + +After you have the required yaml file please run + +``` +kubectl apply -f manifests/prometheus-serviceMonitorAwsEksCNI.yaml +``` From 6ef4b3d330d224fdd1fb0a8dbc83859daddd8622 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 03:26:02 +0900 Subject: [PATCH 08/29] remove local version --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 66bfafd6..d286c775 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -7,7 +7,7 @@ "directory": "jsonnet/kube-prometheus" } }, - "version": "aws_eks_cni" + "version": "" }, { "name": "ksonnet", From cbbfa0cad5ca7e231f8a255e95255d75fb75612c Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 20:33:03 +0900 Subject: [PATCH 09/29] move eks serviceMonitor to patch files --- .../kube-prometheus-aws-eks-cni.libsonnet | 32 +++++++++++++++++++ .../prometheus/prometheus.libsonnet | 32 ------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet index 65914040..1d782e90 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet @@ -9,5 +9,37 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; service.mixin.metadata.withNamespace('kube-system') + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + service.mixin.spec.withClusterIp('None'), + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awsekscni', + namespace: p.namespace, + labels: { + 'k8s-app': 'eks-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], + }, + }, }, } diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index a2e311b1..42baf52a 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -440,37 +440,5 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; ], }, }, - serviceMonitorAwsEksCNI: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: p.namespace, - labels: { - 'k8s-app': 'eks-cni', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, - }, }, } From 1cbc9943443fe44a15fe729594b87ba8889988fe Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 20:44:28 +0900 Subject: [PATCH 10/29] fix namespace --- jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet index 1d782e90..aa749534 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet @@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; kind: 'ServiceMonitor', metadata: { name: 'awsekscni', - namespace: p.namespace, + namespace: 'monitoring', labels: { 'k8s-app': 'eks-cni', }, From d32e859a11b154bfdd29c8341a0d39cdab499215 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 20:52:09 +0900 Subject: [PATCH 11/29] remove example --- examples/kustomize.jsonnet | 4 +- jsonnetfile.lock.json | 6 +- manifests/grafana-dashboardDefinitions.yaml | 616 +++++++++++++++++--- 3 files changed, 525 insertions(+), 101 deletions(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 2bd3c532..8ee6f8ab 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,6 +1,6 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet') + { + (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { namespace: 'monitoring', }, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index d286c775..7e4bf047 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -7,7 +7,7 @@ "directory": "jsonnet/kube-prometheus" } }, - "version": "" + "version": "aws_eks_cni" }, { "name": "ksonnet", @@ -77,7 +77,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "5dc12f27251ad6f5f0744ad33ea7d731480f4b87" + "version": "92f313811e7a9e0dbe73f288d27408817cb7865d" }, { "name": "prometheus", @@ -87,7 +87,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "b5a16a8f861c29799f9a903f1e0859f513e862ed" + "version": "5f1be2cf455409d3577ac9ddb43144ec2d39c90b" }, { "name": "node-mixin", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 9ab00dad..75a8b63b 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -15227,6 +15227,12 @@ items: data: prometheus-remote-write.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -15236,14 +15242,15 @@ items: "gnetId": null, "graphTooltip": 0, "hideControls": false, + "id": null, "links": [ ], - "refresh": "10s", + "refresh": "", "rows": [ { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -15254,12 +15261,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "gridPos": { + + }, + "id": 2, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15269,11 +15281,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15283,12 +15296,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15314,11 +15326,11 @@ items: }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15327,7 +15339,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -15340,12 +15352,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "gridPos": { + + }, + "id": 3, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15355,11 +15372,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15369,12 +15387,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15404,7 +15421,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15413,7 +15430,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -15423,11 +15440,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Timestamps", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -15438,12 +15456,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "gridPos": { + + }, + "id": 4, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15453,11 +15476,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15467,12 +15491,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15502,7 +15525,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15511,7 +15534,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -15521,11 +15544,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Samples", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -15536,12 +15560,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "gridPos": { + + }, + "id": 5, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15551,26 +15580,51 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 6, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": "queue", "seriesOverrides": [ - + { + "alias": "/max_shards/", + "yaxis": 2 + } ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": false, "steppedLine": false, "targets": [ + { + "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max_shards:{{queue}}", + "refId": "A" + }, + { + "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "min_shards:{{queue}}", + "refId": "B" + }, + { + "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "desired_shards:{{queue}}", + "refId": "C" + }, { "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "legendFormat": "current_shards:{{queue}}", + "refId": "D" } ], "thresholds": [ @@ -15578,7 +15632,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Num. Shards", + "title": "Shards: $queue", "tooltip": { "shared": true, "sort": 0, @@ -15600,7 +15654,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15609,10 +15663,23 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Shards", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -15622,12 +15689,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "gridPos": { + + }, + "id": 6, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15637,11 +15709,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": "queue", "seriesOverrides": [ ], @@ -15655,8 +15728,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15664,7 +15736,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Capacity", + "title": "Shard Capacity: $queue", "tooltip": { "shared": true, "sort": 0, @@ -15686,7 +15758,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15695,7 +15767,98 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "queue", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pending Samples: $queue", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ] } @@ -15704,12 +15867,13 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Shards", - "titleSize": "h6" + "title": "Shard Details", + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -15720,12 +15884,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 6, + "gridPos": { + + }, + "id": 8, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15735,11 +15904,207 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "TSDB Current Segment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Remote Write Current Segment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Segments", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15753,8 +16118,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15784,7 +16148,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15793,7 +16157,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -15806,12 +16170,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "gridPos": { + + }, + "id": 11, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15821,11 +16190,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15839,8 +16209,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15870,7 +16239,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15879,7 +16248,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -15892,12 +16261,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "gridPos": { + + }, + "id": 12, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15907,11 +16281,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -15925,8 +16300,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -15956,7 +16330,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -15965,7 +16339,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -15978,12 +16352,17 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "gridPos": { + + }, + "id": 13, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -15993,11 +16372,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -16011,8 +16391,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -16042,7 +16421,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -16051,7 +16430,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -16060,8 +16439,9 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Misc Rates.", - "titleSize": "h6" + "title": "Misc. Rates", + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, @@ -16072,10 +16452,6 @@ items: "templating": { "list": [ { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, "hide": 0, "label": null, "name": "datasource", @@ -16090,23 +16466,30 @@ items: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "instance", - "multi": true, + "label": null, + "multi": false, "name": "instance", "options": [ ], "query": "label_values(prometheus_build_info, instance)", - "refresh": 1, + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -16118,23 +16501,65 @@ items: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "cluster", - "multi": true, + "label": null, + "multi": false, "name": "cluster", "options": [ ], "query": "label_values(kube_pod_container_info{image=~\".*prometheus.*\"}, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "queue", + "options": [ + + ], + "query": "label_values(prometheus_remote_storage_shards, queue)", + "refresh": 2, + "regex": "", + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -16146,7 +16571,7 @@ items: ] }, "time": { - "from": "now-1h", + "from": "now-6h", "to": "now" }, "timepicker": { @@ -16174,9 +16599,8 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "browser", "title": "Prometheus Remote Write", - "uid": "", "version": 0 } kind: ConfigMap From e2b7e7f17d5925b90e177a4917915e42d9645847 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 22 Oct 2019 20:58:05 +0900 Subject: [PATCH 12/29] remove yamls from general mamnifests --- jsonnetfile.lock.json | 2 +- kustomization.yaml | 2 -- .../prometheus-AwsEksCniMetricService.yaml | 15 --------------- .../prometheus-serviceMonitorAwsEksCNI.yaml | 19 ------------------- 4 files changed, 1 insertion(+), 37 deletions(-) delete mode 100644 manifests/prometheus-AwsEksCniMetricService.yaml delete mode 100644 manifests/prometheus-serviceMonitorAwsEksCNI.yaml diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 7e4bf047..0282336c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -7,7 +7,7 @@ "directory": "jsonnet/kube-prometheus" } }, - "version": "aws_eks_cni" + "version": "" }, { "name": "ksonnet", diff --git a/kustomization.yaml b/kustomization.yaml index d8ecbcb6..a580ed8e 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -39,7 +39,6 @@ resources: - ./manifests/node-exporter-service.yaml - ./manifests/node-exporter-serviceAccount.yaml - ./manifests/node-exporter-serviceMonitor.yaml -- ./manifests/prometheus-AwsEksCniMetricService.yaml - ./manifests/prometheus-adapter-apiService.yaml - ./manifests/prometheus-adapter-clusterRole.yaml - ./manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml @@ -63,7 +62,6 @@ resources: - ./manifests/prometheus-serviceAccount.yaml - ./manifests/prometheus-serviceMonitor.yaml - ./manifests/prometheus-serviceMonitorApiserver.yaml -- ./manifests/prometheus-serviceMonitorAwsEksCNI.yaml - ./manifests/prometheus-serviceMonitorCoreDNS.yaml - ./manifests/prometheus-serviceMonitorKubeControllerManager.yaml - ./manifests/prometheus-serviceMonitorKubeScheduler.yaml diff --git a/manifests/prometheus-AwsEksCniMetricService.yaml b/manifests/prometheus-AwsEksCniMetricService.yaml deleted file mode 100644 index 4e3ee1dd..00000000 --- a/manifests/prometheus-AwsEksCniMetricService.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - k8s-app: aws-node - name: aws-node - namespace: kube-system -spec: - clusterIP: None - ports: - - name: cni-metrics-port - port: 61678 - targetPort: 61678 - selector: - k8s-app: aws-node diff --git a/manifests/prometheus-serviceMonitorAwsEksCNI.yaml b/manifests/prometheus-serviceMonitorAwsEksCNI.yaml deleted file mode 100644 index bcb9d1e9..00000000 --- a/manifests/prometheus-serviceMonitorAwsEksCNI.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: eks-cni - name: awsekscni - namespace: monitoring -spec: - endpoints: - - interval: 30s - path: /metrics - port: cni-metrics-port - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: aws-node From 9249256b4ab82eb3e2736665a1d7e0ca93f0e91b Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:00:16 +0900 Subject: [PATCH 13/29] revert examples to original --- examples/kustomize.jsonnet | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 8ee6f8ab..84de0cbd 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,6 +1,5 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + { - + (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { namespace: 'monitoring', }, @@ -27,4 +26,4 @@ local kustomization = { manifests { '../kustomization': kustomization, -} +} \ No newline at end of file From 13c114a72f4a7a24d2579dcd8aed15e76fd0b838 Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:15:05 +0900 Subject: [PATCH 14/29] catch all eks --- .../kube-prometheus-eks.libsonnet | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet new file mode 100644 index 00000000..aa749534 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet @@ -0,0 +1,45 @@ +local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + AwsEksCniMetricService: + service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + + service.mixin.spec.withClusterIp('None'), + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awsekscni', + namespace: 'monitoring', + labels: { + 'k8s-app': 'eks-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], + }, + }, + }, +} From edb327531d88d876b6b3532f2f759aefb1625280 Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:15:25 +0900 Subject: [PATCH 15/29] update example --- docs/EKS-cni-support.md | 62 ++++++++++++++++--- examples/eks-cni-example.jsonnet | 55 ++++++++++++++++ .../kube-prometheus-aws-eks-cni.libsonnet | 45 -------------- 3 files changed, 109 insertions(+), 53 deletions(-) create mode 100644 examples/eks-cni-example.jsonnet delete mode 100644 jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md index b75b749b..e41f38f8 100644 --- a/docs/EKS-cni-support.md +++ b/docs/EKS-cni-support.md @@ -5,14 +5,60 @@ AWS EKS uses [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin One fatal issue that can occur is that you run out of IP addresses in your eks cluster. (Generally happens due to error configs where pods keep scheduling). You can monitor the `awscni` using kube-promethus with : -``` -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet') + - { - _config+:: { - # ... config here - } - }; +[embedmd]:# (../examples/eks-cni-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { + local service = kp.core.v1.service, + local servicePort = kp.core.v1.service.mixin.spec.portsType, + _config+:: { + namespace: 'monitoring', + }, + prometheus+: { + AwsEksCniMetricService: + service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + + service.mixin.spec.withClusterIp('None'), + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awsekscni', + namespace: kp.namespace, + labels: { + 'k8s-app': 'eks-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], + }, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` After you have the required yaml file please run diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet new file mode 100644 index 00000000..62c9170c --- /dev/null +++ b/examples/eks-cni-example.jsonnet @@ -0,0 +1,55 @@ +local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { + local service = kp.core.v1.service, + local servicePort = kp.core.v1.service.mixin.spec.portsType, + _config+:: { + namespace: 'monitoring', + }, + prometheus+: { + AwsEksCniMetricService: + service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + + service.mixin.spec.withClusterIp('None'), + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awsekscni', + namespace: kp.namespace, + labels: { + 'k8s-app': 'eks-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], + }, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet deleted file mode 100644 index aa749534..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-aws-eks-cni.libsonnet +++ /dev/null @@ -1,45 +0,0 @@ -local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -{ - prometheus+: { - AwsEksCniMetricService: - service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + - service.mixin.metadata.withNamespace('kube-system') + - service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + - service.mixin.spec.withClusterIp('None'), - serviceMonitorAwsEksCNI: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: 'monitoring', - labels: { - 'k8s-app': 'eks-cni', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, - }, - }, -} From 8228ebd2bacafdf39d7a1ac5f45e171dfcc6baa9 Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:22:59 +0900 Subject: [PATCH 16/29] fix example --- examples/eks-cni-example.jsonnet | 50 ++++--------------- .../kube-prometheus-eks.libsonnet | 2 +- 2 files changed, 11 insertions(+), 41 deletions(-) diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet index 62c9170c..6c678045 100644 --- a/examples/eks-cni-example.jsonnet +++ b/examples/eks-cni-example.jsonnet @@ -1,47 +1,19 @@ local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - local service = kp.core.v1.service, - local servicePort = kp.core.v1.service.mixin.spec.portsType, _config+:: { namespace: 'monitoring', }, - prometheus+: { - AwsEksCniMetricService: - service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + - service.mixin.metadata.withNamespace('kube-system') + - service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + - service.mixin.spec.withClusterIp('None'), - serviceMonitorAwsEksCNI: + prometheusRules+:: { + groups+: [ { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: kp.namespace, - labels: { - 'k8s-app': 'eks-cni', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, + ], }, + ], }, }; @@ -49,7 +21,5 @@ local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet index aa749534..701d3c41 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet @@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; kind: 'ServiceMonitor', metadata: { name: 'awsekscni', - namespace: 'monitoring', + namespace: $._config.namespace, labels: { 'k8s-app': 'eks-cni', }, From b03ff4f5931c82c4acaf49a43972d3cfdf94f01f Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:24:00 +0900 Subject: [PATCH 17/29] embedmd for doc --- docs/EKS-cni-support.md | 51 ++++++++++------------------------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md index e41f38f8..15e44f54 100644 --- a/docs/EKS-cni-support.md +++ b/docs/EKS-cni-support.md @@ -8,57 +8,30 @@ You can monitor the `awscni` using kube-promethus with : [embedmd]:# (../examples/eks-cni-example.jsonnet) ```jsonnet local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - local service = kp.core.v1.service, - local servicePort = kp.core.v1.service.mixin.spec.portsType, _config+:: { namespace: 'monitoring', }, - prometheus+: { - AwsEksCniMetricService: - service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + - service.mixin.metadata.withNamespace('kube-system') + - service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + - service.mixin.spec.withClusterIp('None'), - serviceMonitorAwsEksCNI: + prometheusRules+:: { + groups+: [ { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: kp.namespace, - labels: { - 'k8s-app': 'eks-cni', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, + ], }, + ], }, }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` After you have the required yaml file please run From 79c670bcd052cee7d0dcaf786cf5ff0b5b797421 Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 00:37:05 +0900 Subject: [PATCH 18/29] revert examples/kustomize.jsonnet --- examples/kustomize.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 84de0cbd..db3ba344 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -26,4 +26,4 @@ local kustomization = { manifests { '../kustomization': kustomization, -} \ No newline at end of file +} From 3c4dbc52d9b7727439d1d8595791c351b1c391fc Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 01:10:12 +0900 Subject: [PATCH 19/29] bugfix eexamples/eks-cni-example --- examples/eks-cni-example.jsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet index 6c678045..df6ca072 100644 --- a/examples/eks-cni-example.jsonnet +++ b/examples/eks-cni-example.jsonnet @@ -1,4 +1,5 @@ -local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { _config+:: { namespace: 'monitoring', }, From d4ba158f9baab0a89466f6405c2d937954582247 Mon Sep 17 00:00:00 2001 From: karancode Date: Wed, 23 Oct 2019 01:26:35 +0900 Subject: [PATCH 20/29] bugfix final --- docs/EKS-cni-support.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md index 15e44f54..eafa74ad 100644 --- a/docs/EKS-cni-support.md +++ b/docs/EKS-cni-support.md @@ -7,7 +7,8 @@ One fatal issue that can occur is that you run out of IP addresses in your eks c You can monitor the `awscni` using kube-promethus with : [embedmd]:# (../examples/eks-cni-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { _config+:: { namespace: 'monitoring', }, From a3ab6bd49b39158222eef7309e1a28fc17f1dc07 Mon Sep 17 00:00:00 2001 From: karancode Date: Thu, 24 Oct 2019 04:12:07 +0900 Subject: [PATCH 21/29] add available_ip rule --- .../kube-prometheus/kube-prometheus-eks.libsonnet | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet index 701d3c41..2251685a 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet @@ -42,4 +42,17 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; }, }, }, + prometheusRules+: { + groups+: [ + { + name: 'kube-prometheus-eks.rules', + rules: [ + { + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + record: 'eks_available_ip' + }, + ], + }, + ], + }, } From 4bd3cb586a1a84c21055a4d268d37cd5ef1d8329 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 16:39:45 +0900 Subject: [PATCH 22/29] add prometheus rule to patch --- jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet index 2251685a..74c50e2f 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet @@ -49,7 +49,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; rules: [ { expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', - record: 'eks_available_ip' + labels: { + severity: 'critical', + }, + annotations: { + message: 'Instance {{ $labels.instance }} has less than 10 IPs available.' + }, + 'for': '10m', + alert: 'EksAvailableIPs' }, ], }, From 737720c119a7b18aeb6de6926e89238f2f71f577 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 16:57:39 +0900 Subject: [PATCH 23/29] test --- examples/kustomize.jsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 38dd6c89..cd262e73 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,5 +1,6 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + { + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { _config+:: { namespace: 'monitoring', }, From 9072e3530aebfc1a2100ff96aeff49c4dcaf8486 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 17:15:45 +0900 Subject: [PATCH 24/29] fix: remove garbage character --- manifests/grafana-dashboardDefinitions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 9fb51bed..af378821 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -26124,7 +26124,7 @@ items: }, { "collapse": false, - "collapsed": false,∏ + "collapsed": false, "panels": [ { "aliasColors": { From 60bd13b34bfd78bdc755531772fce1e3b64947d7 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 18:02:46 +0900 Subject: [PATCH 25/29] remove example --- examples/kustomize.jsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index cd262e73..38dd6c89 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,6 +1,5 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { + (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { namespace: 'monitoring', }, From f05e73881a91ea9fa040e9faeb31297eaaf38153 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 18:03:23 +0900 Subject: [PATCH 26/29] update dependency --- jsonnetfile.json | 10 -- jsonnetfile.lock.json | 230 ++++++++++++++++++++---------------------- 2 files changed, 110 insertions(+), 130 deletions(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index 561224c2..ad5fe5bc 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -8,16 +8,6 @@ } }, "version": "" - }, - { - "name": "node-mixin", - "source": { - "git": { - "remote": "https://github.com/prometheus/node_exporter", - "subdir": "docs/node-mixin" - } - }, - "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b" } ] } diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index daa3bb8f..951f2da8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -1,123 +1,113 @@ { - "dependencies": [ - { - "name": "etcd-mixin", - "source": { - "git": { - "remote": "https://github.com/coreos/etcd", - "subdir": "Documentation/etcd-mixin" + "dependencies": [ + { + "name": "kube-prometheus", + "source": { + "local": { + "directory": "jsonnet/kube-prometheus" + } + }, + "version": "aws_eks_cni" + }, + { + "name": "ksonnet", + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib", + "subdir": "" + } + }, + "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f" + }, + { + "name": "kubernetes-mixin", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "" + } + }, + "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5" + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "47db72da03fc4a7a0658a87791e13c3315a3a252" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd" + }, + { + "name": "grafana", + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana", + "subdir": "grafana" + } + }, + "version": "539a90dbf63c812ad0194d8078dd776868a11c81" + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "8d44e0990230144177f97cf62ae4f43b1c4e3168" + }, + { + "name": "etcd-mixin", + "source": { + "git": { + "remote": "https://github.com/coreos/etcd", + "subdir": "Documentation/etcd-mixin" + } + }, + "version": "cbc1340af53f50728181f97f6bce442ac33d8993" + }, + { + "name": "prometheus", + "source": { + "git": { + "remote": "https://github.com/prometheus/prometheus", + "subdir": "documentation/prometheus-mixin" + } + }, + "version": "e94503ff5c412590ce7616accdd3c62a2189bcd3" + }, + { + "name": "node-mixin", + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter", + "subdir": "docs/node-mixin" + } + }, + "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5" + }, + { + "name": "promgrafonnet", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "lib/promgrafonnet" + } + }, + "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5" } - }, - "version": "fa972cf29666e821c44195c51df15b6e28ed29c4", - "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw=" - }, - { - "name": "grafana", - "source": { - "git": { - "remote": "https://github.com/brancz/kubernetes-grafana", - "subdir": "grafana" - } - }, - "version": "539a90dbf63c812ad0194d8078dd776868a11c81", - "sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE=" - }, - { - "name": "grafana-builder", - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs", - "subdir": "grafana-builder" - } - }, - "version": "1f273dd3c7a619bcd05c3e1c2650204104a273d8", - "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" - }, - { - "name": "grafonnet", - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib", - "subdir": "grafonnet" - } - }, - "version": "47db72da03fc4a7a0658a87791e13c3315a3a252", - "sum": "ssLOSIiYWYBvEYJHwRtwMY4kPQoP+MuIUkT0bp2Mb6A=" - }, - { - "name": "ksonnet", - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib", - "subdir": "" - } - }, - "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f", - "sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=" - }, - { - "name": "kube-prometheus", - "source": { - "local": { - "directory": "jsonnet/kube-prometheus" - } - }, - "version": "" - }, - { - "name": "kubernetes-mixin", - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", - "subdir": "" - } - }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", - "sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU=" - }, - { - "name": "node-mixin", - "source": { - "git": { - "remote": "https://github.com/prometheus/node_exporter", - "subdir": "docs/node-mixin" - } - }, - "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b", - "sum": "3N77msMjqClzQHbZOxn4GTlV+FZpU+y1gCekvCvxwy0=" - }, - { - "name": "prometheus", - "source": { - "git": { - "remote": "https://github.com/prometheus/prometheus", - "subdir": "documentation/prometheus-mixin" - } - }, - "version": "74726367cf7a7e8d0332238defd2e7f4169030bd", - "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM=" - }, - { - "name": "prometheus-operator", - "source": { - "git": { - "remote": "https://github.com/coreos/prometheus-operator", - "subdir": "jsonnet/prometheus-operator" - } - }, - "version": "8d44e0990230144177f97cf62ae4f43b1c4e3168", - "sum": "5U7/8MD3pF9O0YDTtUhg4vctkUBRVFxZxWUyhtNiBM8=" - }, - { - "name": "promgrafonnet", - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", - "subdir": "lib/promgrafonnet" - } - }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", - "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" - } - ] + ] } From 78edcc0276d25831524ec0f82afddde45c3a9265 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 20:25:55 +0900 Subject: [PATCH 27/29] make clean generate --- manifests/grafana-dashboardDefinitions.yaml | 86 +++++++++------------ 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af378821..1ac6ae66 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -17827,6 +17827,24 @@ items: "colorMode": null, "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets", + "colorMode": null, + "colors": [ + ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, @@ -17836,6 +17854,7 @@ items: "pattern": "Value #D", "thresholds": [ + ], "type": "number", "unit": "pps" }, @@ -20261,7 +20280,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n sum without (instance) (instance:node_num_cpu:sum{job=\"node-exporter\"})\n)\n", + "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20347,7 +20366,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_load1_per_cpu:ratio{job=\"node-exporter\"})\n)\n", + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20445,7 +20464,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_memory_utilisation:ratio{job=\"node-exporter\"})\n)\n", + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20845,7 +20864,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"})\n)\n", + "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -20931,7 +20950,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"})\n)\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -21029,7 +21048,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n ) \n/ ignoring (instance) group_left\n sum without (instance, device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n )\n) \n", + "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -26162,13 +26181,10 @@ items: "renderer": "flot", "repeat": null, "seriesOverrides": [ - { - "alias": "/max_shards/", - "yaxis": 2 - } + ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -26219,20 +26235,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Shards", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -26376,12 +26379,12 @@ items: ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", @@ -26393,7 +26396,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Pending Samples: $queue", + "title": "Dropped Samples", "tooltip": { "shared": true, "sort": 0, @@ -26427,20 +26430,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Shard Details", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -26480,12 +26470,12 @@ items: ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", @@ -26497,7 +26487,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "TSDB Current Segment", + "title": "Failed Samples", "tooltip": { "shared": true, "sort": 0, @@ -26515,7 +26505,7 @@ items: }, "yaxes": [ { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -26571,12 +26561,12 @@ items: ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", From 528f338477b01b26714564488f8f6a7710faea00 Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 20:30:50 +0900 Subject: [PATCH 28/29] revert jsonnetfile json --- jsonnetfile.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/jsonnetfile.json b/jsonnetfile.json index ad5fe5bc..1fe53891 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -8,6 +8,16 @@ } }, "version": "" + }, + { + "name": "node-mixin", + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter", + "subdir": "docs/node-mixin" + } + }, + "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b" } ] } From 8ee17e673516ad0bdbfcf35bf8d2a464ad23f0cc Mon Sep 17 00:00:00 2001 From: karancode Date: Tue, 5 Nov 2019 21:10:40 +0900 Subject: [PATCH 29/29] with jsonnet-ci:latest image --- jsonnetfile.json | 18 ++-- jsonnetfile.lock.json | 230 ++++++++++++++++++++++-------------------- 2 files changed, 129 insertions(+), 119 deletions(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index 1fe53891..561224c2 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -8,15 +8,15 @@ } }, "version": "" - }, - { - "name": "node-mixin", - "source": { - "git": { - "remote": "https://github.com/prometheus/node_exporter", - "subdir": "docs/node-mixin" - } - }, + }, + { + "name": "node-mixin", + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter", + "subdir": "docs/node-mixin" + } + }, "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b" } ] diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 951f2da8..3ce78152 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -1,113 +1,123 @@ { - "dependencies": [ - { - "name": "kube-prometheus", - "source": { - "local": { - "directory": "jsonnet/kube-prometheus" - } - }, - "version": "aws_eks_cni" - }, - { - "name": "ksonnet", - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib", - "subdir": "" - } - }, - "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f" - }, - { - "name": "kubernetes-mixin", - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", - "subdir": "" - } - }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5" - }, - { - "name": "grafonnet", - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib", - "subdir": "grafonnet" - } - }, - "version": "47db72da03fc4a7a0658a87791e13c3315a3a252" - }, - { - "name": "grafana-builder", - "source": { - "git": { - "remote": "https://github.com/kausalco/public", - "subdir": "grafana-builder" - } - }, - "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd" - }, - { - "name": "grafana", - "source": { - "git": { - "remote": "https://github.com/brancz/kubernetes-grafana", - "subdir": "grafana" - } - }, - "version": "539a90dbf63c812ad0194d8078dd776868a11c81" - }, - { - "name": "prometheus-operator", - "source": { - "git": { - "remote": "https://github.com/coreos/prometheus-operator", - "subdir": "jsonnet/prometheus-operator" - } - }, - "version": "8d44e0990230144177f97cf62ae4f43b1c4e3168" - }, - { - "name": "etcd-mixin", - "source": { - "git": { - "remote": "https://github.com/coreos/etcd", - "subdir": "Documentation/etcd-mixin" - } - }, - "version": "cbc1340af53f50728181f97f6bce442ac33d8993" - }, - { - "name": "prometheus", - "source": { - "git": { - "remote": "https://github.com/prometheus/prometheus", - "subdir": "documentation/prometheus-mixin" - } - }, - "version": "e94503ff5c412590ce7616accdd3c62a2189bcd3" - }, - { - "name": "node-mixin", - "source": { - "git": { - "remote": "https://github.com/prometheus/node_exporter", - "subdir": "docs/node-mixin" - } - }, - "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5" - }, - { - "name": "promgrafonnet", - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", - "subdir": "lib/promgrafonnet" - } - }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5" + "dependencies": [ + { + "name": "etcd-mixin", + "source": { + "git": { + "remote": "https://github.com/coreos/etcd", + "subdir": "Documentation/etcd-mixin" } - ] + }, + "version": "cbc1340af53f50728181f97f6bce442ac33d8993", + "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw=" + }, + { + "name": "grafana", + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana", + "subdir": "grafana" + } + }, + "version": "539a90dbf63c812ad0194d8078dd776868a11c81", + "sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE=" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "grafana-builder" + } + }, + "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd", + "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "47db72da03fc4a7a0658a87791e13c3315a3a252", + "sum": "ssLOSIiYWYBvEYJHwRtwMY4kPQoP+MuIUkT0bp2Mb6A=" + }, + { + "name": "ksonnet", + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib", + "subdir": "" + } + }, + "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f", + "sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=" + }, + { + "name": "kube-prometheus", + "source": { + "local": { + "directory": "jsonnet/kube-prometheus" + } + }, + "version": "" + }, + { + "name": "kubernetes-mixin", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "" + } + }, + "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", + "sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU=" + }, + { + "name": "node-mixin", + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter", + "subdir": "docs/node-mixin" + } + }, + "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5", + "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg=" + }, + { + "name": "prometheus", + "source": { + "git": { + "remote": "https://github.com/prometheus/prometheus", + "subdir": "documentation/prometheus-mixin" + } + }, + "version": "e94503ff5c412590ce7616accdd3c62a2189bcd3", + "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM=" + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "8d44e0990230144177f97cf62ae4f43b1c4e3168", + "sum": "5U7/8MD3pF9O0YDTtUhg4vctkUBRVFxZxWUyhtNiBM8=" + }, + { + "name": "promgrafonnet", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "lib/promgrafonnet" + } + }, + "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", + "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" + } + ] }