From 89ed6773e76e18dcfddefdff01ba9d09e66c39fb Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 6 Mar 2017 09:55:36 +0100 Subject: [PATCH] Add 'contrib/kube-prometheus/' from commit '81c0d2f4d30f63a4e274c2870c5afc89241827b0' git-subtree-dir: contrib/kube-prometheus git-subtree-mainline: 050ca21276696c8603375c699513ec487301ed62 git-subtree-split: 81c0d2f4d30f63a4e274c2870c5afc89241827b0 --- README.md | 138 + assets/grafana/all-nodes-dashboard.json | 860 +++++ assets/grafana/deployment-dashboard.json | 817 +++++ assets/grafana/kubernetes-pods-dashboard.json | 409 +++ assets/grafana/node-dashboard.json | 880 +++++ assets/grafana/prometheus-datasource.json | 7 + assets/prometheus/rules/etcd2.rules | 121 + assets/prometheus/rules/kubernetes.rules | 388 +++ docs/KOPSonAWS.md | 35 + hack/cluster-monitoring/deploy | 41 + hack/cluster-monitoring/minikube-deploy | 6 + hack/cluster-monitoring/minikube-teardown | 6 + hack/cluster-monitoring/self-hosted-deploy | 6 + hack/cluster-monitoring/self-hosted-teardown | 6 + hack/cluster-monitoring/teardown | 24 + hack/example-service-monitoring/deploy | 19 + hack/example-service-monitoring/teardown | 12 + hack/scripts/generate-configmaps.sh | 8 + hack/scripts/wrap-dashboard.sh | 50 + .../alertmanager/alertmanager-config.yaml | 18 + .../alertmanager/alertmanager-service.yaml | 14 + manifests/alertmanager/alertmanager.yaml | 9 + manifests/etcd/etcd-bootkube-gce.yaml | 28 + .../etcd/etcd-bootkube-vagrant-multi.yaml | 28 + .../examples/example-app/example-app.yaml | 34 + .../example-app/prometheus-frontend-svc.yaml | 14 + .../example-app/prometheus-frontend.yaml | 24 + .../example-app/servicemonitor-frontend.yaml | 13 + .../kube-state-metrics-deployment.yaml | 25 + .../exporters/kube-state-metrics-service.yaml | 18 + .../exporters/node-exporter-daemonset.yaml | 45 + .../exporters/node-exporter-service.yaml | 17 + manifests/grafana/grafana-dashboards.yaml | 2984 +++++++++++++++++ manifests/grafana/grafana-deployment.yaml | 56 + 
manifests/grafana/grafana-service.yaml | 15 + .../k8s/minikube/kube-controller-manager.yaml | 28 + manifests/k8s/minikube/kube-scheduler.yaml | 28 + .../self-hosted/kube-controller-manager.yaml | 16 + manifests/k8s/self-hosted/kube-dns.yaml | 20 + manifests/k8s/self-hosted/kube-scheduler.yaml | 16 + manifests/prometheus-operator.yaml | 26 + .../prometheus/prometheus-k8s-rules.yaml | 447 +++ .../prometheus/prometheus-k8s-service.yaml | 14 + .../prometheus-k8s-servicemonitors.yaml | 69 + manifests/prometheus/prometheus-k8s.yaml | 24 + 45 files changed, 7863 insertions(+) create mode 100644 README.md create mode 100644 assets/grafana/all-nodes-dashboard.json create mode 100644 assets/grafana/deployment-dashboard.json create mode 100644 assets/grafana/kubernetes-pods-dashboard.json create mode 100644 assets/grafana/node-dashboard.json create mode 100644 assets/grafana/prometheus-datasource.json create mode 100644 assets/prometheus/rules/etcd2.rules create mode 100644 assets/prometheus/rules/kubernetes.rules create mode 100644 docs/KOPSonAWS.md create mode 100755 hack/cluster-monitoring/deploy create mode 100755 hack/cluster-monitoring/minikube-deploy create mode 100755 hack/cluster-monitoring/minikube-teardown create mode 100755 hack/cluster-monitoring/self-hosted-deploy create mode 100755 hack/cluster-monitoring/self-hosted-teardown create mode 100755 hack/cluster-monitoring/teardown create mode 100755 hack/example-service-monitoring/deploy create mode 100755 hack/example-service-monitoring/teardown create mode 100755 hack/scripts/generate-configmaps.sh create mode 100755 hack/scripts/wrap-dashboard.sh create mode 100644 manifests/alertmanager/alertmanager-config.yaml create mode 100644 manifests/alertmanager/alertmanager-service.yaml create mode 100644 manifests/alertmanager/alertmanager.yaml create mode 100644 manifests/etcd/etcd-bootkube-gce.yaml create mode 100644 manifests/etcd/etcd-bootkube-vagrant-multi.yaml create mode 100644 
manifests/examples/example-app/example-app.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend-svc.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend.yaml create mode 100644 manifests/examples/example-app/servicemonitor-frontend.yaml create mode 100644 manifests/exporters/kube-state-metrics-deployment.yaml create mode 100644 manifests/exporters/kube-state-metrics-service.yaml create mode 100644 manifests/exporters/node-exporter-daemonset.yaml create mode 100644 manifests/exporters/node-exporter-service.yaml create mode 100644 manifests/grafana/grafana-dashboards.yaml create mode 100644 manifests/grafana/grafana-deployment.yaml create mode 100644 manifests/grafana/grafana-service.yaml create mode 100644 manifests/k8s/minikube/kube-controller-manager.yaml create mode 100644 manifests/k8s/minikube/kube-scheduler.yaml create mode 100644 manifests/k8s/self-hosted/kube-controller-manager.yaml create mode 100644 manifests/k8s/self-hosted/kube-dns.yaml create mode 100644 manifests/k8s/self-hosted/kube-scheduler.yaml create mode 100644 manifests/prometheus-operator.yaml create mode 100644 manifests/prometheus/prometheus-k8s-rules.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service.yaml create mode 100644 manifests/prometheus/prometheus-k8s-servicemonitors.yaml create mode 100644 manifests/prometheus/prometheus-k8s.yaml diff --git a/README.md b/README.md new file mode 100644 index 00000000..db4f554d --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# kube-prometheus + +This repository collects Kubernetes manifests, dashboards, and alerting rules +combined with documentation and scripts to provide single-command deployments +of end-to-end Kubernetes cluster monitoring. + +## Prerequisites + +First, you need a running Kubernetes cluster. If you don't have one, follow the +instructions of [bootkube](https://github.com/kubernetes-incubator/bootkube) or +[minikube](https://github.com/kubernetes/minikube). 
Some sample contents of this +repository are adapted to work with a [multi-node setup](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node) +using [bootkube](https://github.com/kubernetes-incubator/bootkube). + +## Monitoring Kubernetes + +The manifests used here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), +which manages Prometheus servers and their configuration in a cluster. With a single command we can install + +* The Operator itself +* The Prometheus [node_exporter](https://github.com/prometheus/node_exporter) +* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) +* The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/prometheus.md) based on which the Operator deploys a Prometheus setup +* A Prometheus configuration covering monitoring of all Kubernetes core components and exporters +* A default set of alerting rules on the cluster components' health +* A Grafana instance serving dashboards on cluster metrics +* A three-node highly available Alertmanager cluster + +Simply run: + +```bash +export KUBECONFIG=/path/to/kubeconfig # defaults to "~/.kube/config" +hack/cluster-monitoring/deploy +``` + +After all pods are ready, you can reach: + +* Prometheus UI on node port `30900` +* Alertmanager UI on node port `30903` +* Grafana on node port `30902` + +To tear it all down again, run: + +```bash +hack/cluster-monitoring/teardown +``` + +## Monitoring custom services + +The example manifests in [/manifests/examples/example-app](/manifests/examples/example-app) +deploy a fake service exposing Prometheus metrics. They additionally define a new Prometheus +server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/service-monitor.md), +which specifies how the example service should be monitored. +The Prometheus Operator will deploy and configure the desired Prometheus instance and continuously +manage its life cycle. 
+ +```bash +hack/example-service-monitoring/deploy +``` + +After all pods are ready, you can reach the Prometheus server on node port `30100` and observe +how it monitors the service as specified. Same as before, this Prometheus server automatically +discovers the Alertmanager cluster deployed in the [Monitoring Kubernetes](#monitoring-kubernetes) +section. + +Teardown: + +```bash +hack/example-service-monitoring/teardown +``` + +## Dashboarding + +The provided manifests deploy a Grafana instance serving dashboards provided via a ConfigMap. +To modify, delete, or add dashboards, the `grafana-dashboards` ConfigMap must be modified. + +Currently, Grafana does not support serving dashboards from static files. Instead, the `grafana-watcher` +sidecar container aims to emulate the behavior by keeping the Grafana database always in sync +with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. +This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. + +In the future, a separate Grafana operator will support gathering dashboards from multiple +ConfigMaps based on label selection. + +## Roadmap + +* Grafana Operator that dynamically discovers and deploys dashboards from ConfigMaps +* KPM/Helm packages to easily provide production-ready cluster-monitoring setup (essentially contents of `hack/cluster-monitoring`) +* Add meta-monitoring to default cluster monitoring setup +* Build out the provided dashboards and alerts for cluster monitoring to have full coverage of all system aspects + +## Monitoring other Cluster Components + +Discovery of API servers and kubelets works the same across all clusters. +Depending on a cluster's setup, several other core components, such as etcd or the +scheduler, may be deployed in different ways. +The easiest integration point is for the cluster operator to provide headless services +of all those components to provide a common interface for discovering them. 
With that +setup they will automatically be discovered by the provided Prometheus configuration. + +For the `kube-scheduler` and `kube-controller-manager` there are headless +services prepared; simply add them to your running cluster: + +```bash +kubectl -n kube-system create -f manifests/k8s/self-hosted/ # or manifests/k8s/minikube/ when running on minikube +``` + +> Hint: if you use this for a cluster not created with bootkube, make sure you +> populate an endpoints object with the address to your `kube-scheduler` and +> `kube-controller-manager`, or adapt the label selectors to match your setup. + +Aside from Kubernetes-specific components, etcd is an important part of a +working cluster, but is typically deployed outside of it. This monitoring +setup assumes that it is made visible from within the cluster through a headless +service as well. + +> Note that minikube hides some components like etcd, so to see the full extent of +> this setup we recommend setting up a [local cluster using bootkube](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node). + +An example for bootkube's multi-node vagrant setup is [here](/manifests/etcd/etcd-bootkube-vagrant-multi.yaml). + +> Hint: this is merely an example for a local setup. The addresses will have to +> be adapted for any setup that is not a bootkube-created cluster with a single etcd member. 
+ +With that setup the headless services provide endpoint lists consumed by +Prometheus to discover the endpoints as targets: + +```bash +$ kubectl get endpoints --all-namespaces +NAMESPACE NAME ENDPOINTS AGE +default kubernetes 172.17.4.101:443 2h +kube-system kube-controller-manager-prometheus-discovery 10.2.30.2:10252 1h +kube-system kube-scheduler-prometheus-discovery 10.2.30.4:10251 1h +monitoring etcd-k8s 172.17.4.51:2379 1h +``` + +## Other Documentation +[Install Docs for a cluster created with KOPS on AWS](docs/KOPSonAWS.md) diff --git a/assets/grafana/all-nodes-dashboard.json b/assets/grafana/all-nodes-dashboard.json new file mode 100644 index 00000000..7a7c2bde --- /dev/null +++ b/assets/grafana/all-nodes-dashboard.json @@ -0,0 +1,860 @@ +{ + "dashboard": +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + 
"links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + 
"interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": 
"singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + 
"msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json new file mode 100644 index 00000000..69638d15 --- /dev/null +++ b/assets/grafana/deployment-dashboard.json @@ -0,0 +1,817 @@ +{ + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + 
"id": null, + "title": "Deployment", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": true, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + "title": "CPU", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 8, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "cores", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Memory", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 9, + "targets": [ + { + "refId": "A", + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "step": 600 + } 
+ ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "GB", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "80%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Network", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 7, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "Bps", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + 
"text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + } + } + ], + "title": "Row", + "showTitle": false + }, + { + "title": "New row", + "height": "100px", + "editable": true, + "collapse": false, + "panels": [ + { + "title": "Desired Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 5, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600, + "metric": "kube_deployment_spec_replicas" + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + 
"lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + }, + "decimals": null + }, + { + "title": "Available Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 6, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + 
"thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 
118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ] + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false, + "hideZero": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { 
+ "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "transparent": false + } + ], + "title": "New row", + "showTitle": false + } + ], + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": 
"label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "annotations": { + "list": [] + }, + "schemaVersion": 12, + "version": 2, + "links": [], + "gnetId": null +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-pods-dashboard.json b/assets/grafana/kubernetes-pods-dashboard.json new file mode 100644 index 00000000..035da015 --- /dev/null +++ b/assets/grafana/kubernetes-pods-dashboard.json @@ -0,0 +1,409 @@ +{ + "dashboard": { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_requested_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": 
"connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": 
null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + 
"value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/node-dashboard.json b/assets/grafana/node-dashboard.json new file mode 100644 index 00000000..78a5bb37 --- /dev/null +++ b/assets/grafana/node-dashboard.json @@ -0,0 +1,880 @@ +{ + "dashboard": +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "CPU usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, +
"min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + 
"showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + 
"colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + 
"msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/prometheus-datasource.json b/assets/grafana/prometheus-datasource.json new file mode 100644 index 00000000..47b8f1b2 --- /dev/null +++ b/assets/grafana/prometheus-datasource.json @@ -0,0 +1,7 @@ +{ + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090" +} diff --git a/assets/prometheus/rules/etcd2.rules 
b/assets/prometheus/rules/etcd2.rules new file mode 100644 index 00000000..4a38894e --- /dev/null +++ b/assets/prometheus/rules/etcd2.rules @@ -0,0 +1,121 @@ +### General cluster availability ### + +# alert if another failed peer will result in an unavailable cluster +ALERT InsufficientPeers + IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Etcd cluster small", + description = "If one more etcd peer goes down the cluster will be unavailable", + } + +### HTTP requests alerts ### + +# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + +# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + +# alert if 50% of requests get a 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + 
summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + } + +# alert if the 99th percentile of HTTP requests take more than 150ms +ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", + } + +### File descriptor alerts ### + +instance:fd_utilization = process_open_fds / process_max_fds + +# alert if file descriptors are likely to exhaust within the next 4 hours +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", + } + +# alert if file descriptors are likely to exhaust within the next hour +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", + } + +### etcd proposal alerts ### + +# alert if there are several failed proposals within an hour +ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of failed proposals within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + +### etcd disk io latency alerts ### + +# alert if 99th percentile of fsync
durations is higher than 500ms +ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules new file mode 100644 index 00000000..157eb3fa --- /dev/null +++ b/assets/prometheus/rules/kubernetes.rules @@ -0,0 +1,388 @@ +# NOTE: These rules were kindly contributed by the SoundCloud engineering team. + +### Container resources ### + +cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name", 
"^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +### Cluster resources ### + +cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + +cluster:node_cpu_use:percent = + 100 * sum by 
(cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + +### API latency ### + +# Raw metrics are in microseconds. Convert to seconds. +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + +### Scheduling latency ### + +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + 
+cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + +ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or 
count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + +# Disable for non HA kubernetes setups. +ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +# To catch the conntrack sysctl de-tuning when it happens +ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + +ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md new file mode 100644 index 00000000..902dab74 --- /dev/null +++ b/docs/KOPSonAWS.md @@ -0,0 +1,35 @@ +# Adding kube-prometheus to [KOPS](https://github.com/kubernetes/kops) on AWS 1.5.x + + +## Prerequisites + +A running Kubernetes cluster created with [KOPS](https://github.com/kubernetes/kops). + +These instructions have currently been tested with **topology=public** on AWS with KOPS 1.5.1 and Kubernetes 1.5.x + +## Open AWS Security Groups: +1. Open port 9100 on the masters security group to the nodes security group +1. Open ports 10250-10252 on the masters security group to the nodes security group. 
+ +Example script below requires $AWS\_DEFAULT_PROFILE and [$NAME](https://github.com/kubernetes/kops/blob/master/docs/aws.md#prepare-local-environment) + +```bash +MASTER_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=masters.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) +NODES_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=nodes.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) +aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 9100 --source-group $NODES_SG +aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 10250-10252 --source-group $NODES_SG +``` + +## Adding kube-prometheus +Following the instructions in the [README](https://github.com/coreos/kube-prometheus/blob/master/README.md): + +Example: + +```bash +git clone -b master https://github.com/coreos/kube-prometheus.git kube-prometheus-temp; +cd kube-prometheus-temp +./hack/cluster-monitoring/deploy +kubectl -n kube-system create -f manifests/k8s/self-hosted/ +cd - +rm -rf kube-prometheus-temp +``` diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy new file mode 100755 index 00000000..9ad91eb0 --- /dev/null +++ b/hack/cluster-monitoring/deploy @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kubectl create namespace "$NAMESPACE" + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kctl apply -f manifests/prometheus-operator.yaml + +# Wait for TPRs to be ready. +printf "Waiting for Operator to register third party objects..." 
+until kctl get servicemonitor > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done +echo "done!" + +kctl apply -f manifests/exporters +kctl apply -f manifests/grafana + +kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml +kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml + +kctl apply -f manifests/alertmanager/alertmanager-config.yaml +kctl apply -f manifests/alertmanager/alertmanager-service.yaml + +# `kubectl apply` is currently not working for third party resources so we are +# using `kubectl create` here for the time being. +# (https://github.com/kubernetes/kubernetes/issues/29542) +kctl create -f manifests/prometheus/prometheus-k8s-servicemonitors.yaml +kctl create -f manifests/prometheus/prometheus-k8s.yaml +kctl create -f manifests/alertmanager/alertmanager.yaml + diff --git a/hack/cluster-monitoring/minikube-deploy b/hack/cluster-monitoring/minikube-deploy new file mode 100755 index 00000000..ab7e72e4 --- /dev/null +++ b/hack/cluster-monitoring/minikube-deploy @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/deploy + +awk 'FNR==1{print "---"}1' manifests/k8s/minikube/*.yaml | sed s/MINIKUBE_IP/`minikube ip`/g | kubectl --namespace=kube-system apply -f - + diff --git a/hack/cluster-monitoring/minikube-teardown b/hack/cluster-monitoring/minikube-teardown new file mode 100755 index 00000000..3a4c986e --- /dev/null +++ b/hack/cluster-monitoring/minikube-teardown @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/teardown + +kubectl --namespace=kube-system delete -f manifests/k8s/minikube + diff --git a/hack/cluster-monitoring/self-hosted-deploy b/hack/cluster-monitoring/self-hosted-deploy new file mode 100755 index 00000000..a25f7ed3 --- /dev/null +++ b/hack/cluster-monitoring/self-hosted-deploy @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/deploy + 
+kubectl --namespace=kube-system apply -f manifests/k8s/self-hosted + diff --git a/hack/cluster-monitoring/self-hosted-teardown b/hack/cluster-monitoring/self-hosted-teardown new file mode 100755 index 00000000..05fd625a --- /dev/null +++ b/hack/cluster-monitoring/self-hosted-teardown @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/teardown + +kubectl --namespace=kube-system delete -f manifests/k8s/self-hosted + diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown new file mode 100755 index 00000000..45ae61ed --- /dev/null +++ b/hack/cluster-monitoring/teardown @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kctl delete -f manifests/exporters +kctl delete -f manifests/grafana +kctl delete -f manifests/prometheus +kctl delete -f manifests/alertmanager + +# Hack: wait a bit to let the controller delete the deployed Prometheus server. +sleep 5 + +kctl delete -f manifests/prometheus-operator.yaml + diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy new file mode 100755 index 00000000..420b5940 --- /dev/null +++ b/hack/example-service-monitoring/deploy @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=default +fi + +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-svc.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/example-app.yaml + +# `kubectl apply` is currently not working for third party resources so we are +# using `kubectl create` here for the time being. 
+# (https://github.com/kubernetes/kubernetes/issues/29542) +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/prometheus-frontend.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/servicemonitor-frontend.yaml + diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown new file mode 100755 index 00000000..a631fe3e --- /dev/null +++ b/hack/example-service-monitoring/teardown @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=default +fi + +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" delete -f manifests/examples/example-app + diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh new file mode 100755 index 00000000..02ba18e9 --- /dev/null +++ b/hack/scripts/generate-configmaps.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Generate Alert Rules ConfigMap +kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml + +# Generate Dashboard ConfigMap +kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml + diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh new file mode 100755 index 00000000..1b514387 --- /dev/null +++ b/hack/scripts/wrap-dashboard.sh @@ -0,0 +1,50 @@ +#!/bin/bash -eu + +# Intended usage: +# * Edit dashboard in Grafana (you need to login first with admin/admin +# login/password). +# * Save dashboard in Grafana to check if the specification is correct. +# Looks like this is the only way to check if the dashboard specification +# has errors. +# * Download dashboard specification as JSON file in Grafana: +# Share -> Export -> Save to file. 
+# * Wrap dashboard specification to make it digestible by kube-prometheus: +# ./hack/scripts/wrap-dashboard.sh Nodes-1488465802729.json +# * Replace dashboard specification: +# mv Nodes-1488465802729.json assets/grafana/node-dashboard.json +# * Regenerate Grafana configmap: +# ./hack/scripts/generate-configmaps.sh +# * Apply new configmap: +# kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 path-to-dashboard.json" + exit 1 +fi + +json=$1 +temp=$(tempfile -m 0644) + +cat >> $temp <> $temp + +cat >> $temp <2Gi + # memory. Modify based on your target and time-series count for + # production use. This value is mainly meant for demonstration/testing + # purposes. + memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml new file mode 100644 index 00000000..4ceaacd6 --- /dev/null +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -0,0 +1,13 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: frontend + labels: + tier: frontend +spec: + selector: + matchLabels: + tier: frontend + endpoints: + - port: web + interval: 10s \ No newline at end of file diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/exporters/kube-state-metrics-deployment.yaml new file mode 100644 index 00000000..6ef971ce --- /dev/null +++ b/manifests/exporters/kube-state-metrics-deployment.yaml @@ -0,0 +1,25 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: kube-state-metrics +spec: + replicas: 1 + template: + metadata: + labels: + app: kube-state-metrics + spec: + containers: + - name: kube-state-metrics + image: gcr.io/google_containers/kube-state-metrics:v0.4.1 + ports: + - name: metrics + containerPort: 8080 + resources: + requests: + memory: 30Mi + cpu: 
100m + limits: + memory: 50Mi + cpu: 200m + diff --git a/manifests/exporters/kube-state-metrics-service.yaml b/manifests/exporters/kube-state-metrics-service.yaml new file mode 100644 index 00000000..607869e1 --- /dev/null +++ b/manifests/exporters/kube-state-metrics-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: kube-state-metrics + k8s-app: kube-state-metrics + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" + name: kube-state-metrics +spec: + ports: + - name: http-metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: kube-state-metrics + diff --git a/manifests/exporters/node-exporter-daemonset.yaml b/manifests/exporters/node-exporter-daemonset.yaml new file mode 100644 index 00000000..8c9565ba --- /dev/null +++ b/manifests/exporters/node-exporter-daemonset.yaml @@ -0,0 +1,45 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: node-exporter +spec: + template: + metadata: + labels: + app: node-exporter + name: node-exporter + spec: + hostNetwork: true + hostPID: true + containers: + - image: quay.io/prometheus/node-exporter:v0.13.0 + args: + - "-collector.procfs=/host/proc" + - "-collector.sysfs=/host/sys" + name: node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 + name: scrape + resources: + requests: + memory: 30Mi + cpu: 100m + limits: + memory: 50Mi + cpu: 200m + volumeMounts: + - name: proc + readOnly: true + mountPath: /host/proc + - name: sys + readOnly: true + mountPath: /host/sys + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + diff --git a/manifests/exporters/node-exporter-service.yaml b/manifests/exporters/node-exporter-service.yaml new file mode 100644 index 00000000..46b1a3fd --- /dev/null +++ b/manifests/exporters/node-exporter-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: node-exporter + k8s-app: node-exporter + name: node-exporter +spec: + type: 
ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 9100 + protocol: TCP + selector: + app: node-exporter + diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml new file mode 100644 index 00000000..f9757dc1 --- /dev/null +++ b/manifests/grafana/grafana-dashboards.yaml @@ -0,0 +1,2984 @@ +apiVersion: v1 +data: + all-nodes-dashboard.json: | + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + 
"refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", 
+ "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": 
{}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, 
+ "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + deployment-dashboard.json: |- + { + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "id": null, + "title": "Deployment", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": true, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + "title": "CPU", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 8, + "targets": [ + { + "refId": "A", + "expr": 
"sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "cores", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Memory", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 9, + "targets": [ + { + "refId": "A", + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "GB", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + 
"from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "80%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Network", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 7, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "Bps", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": 
"rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + } + } + ], + "title": "Row", + "showTitle": false + }, + { + "title": "New row", + "height": "100px", + "editable": true, + "collapse": false, + "panels": [ + { + "title": "Desired Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 5, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600, + "metric": "kube_deployment_spec_replicas" + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + }, + "decimals": null + }, + { + "title": "Available Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 6, + "targets": [ + { + "refId": "A", + "expr": 
"kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", 
+ "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + 
"valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ] + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false, + "hideZero": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + 
"thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "transparent": false + } + ], + "title": "New row", + "showTitle": false + } + ], + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "annotations": { + "list": [] + }, + "schemaVersion": 12, + "version": 2, + "links": [], + "gnetId": null + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": 
"datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + kubernetes-pods-dashboard.json: | + { + "dashboard": { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", 
container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_requested_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": 
true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": 
"${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + node-dashboard.json: | + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + 
"version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": 
[], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + 
"interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, 
+ "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + 
"fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": 
[ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": 
false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + prometheus-datasource.json: | + { + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090" + } +kind: ConfigMap +metadata: + creationTimestamp: null + name: grafana-dashboards diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml new file mode 100644 index 00000000..5a70df49 --- /dev/null +++ b/manifests/grafana/grafana-deployment.yaml @@ -0,0 +1,56 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:4.1.1 + env: + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + volumeMounts: + - name: grafana-storage + mountPath: /var/grafana-storage + ports: + - name: web + containerPort: 3000 + resources: + requests: + memory: 100Mi + cpu: 100m + limits: + memory: 300Mi + cpu: 300m + - name: grafana-watcher + image: quay.io/coreos/grafana-watcher:latest + args: + - '--watch-dir=/var/grafana-dashboards' + - '--grafana-url=http://admin:admin@localhost:3000' + volumeMounts: + - name: 
grafana-dashboards + mountPath: /var/grafana-dashboards + resources: + requests: + memory: "16Mi" + cpu: "50m" + limits: + memory: "32Mi" + cpu: "100m" + volumeMounts: + - name: grafana-dashboards + mountPath: /var/grafana-dashboards + volumes: + - name: grafana-storage + emptyDir: {} + - name: grafana-dashboards + configMap: + name: grafana-dashboards diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana/grafana-service.yaml new file mode 100644 index 00000000..adb26233 --- /dev/null +++ b/manifests/grafana/grafana-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app: grafana +spec: + type: NodePort + ports: + - name: web + port: 3000 + protocol: TCP + nodePort: 30902 + selector: + app: grafana diff --git a/manifests/k8s/minikube/kube-controller-manager.yaml b/manifests/k8s/minikube/kube-controller-manager.yaml new file mode 100644 index 00000000..135dd24c --- /dev/null +++ b/manifests/k8s/minikube/kube-controller-manager.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +subsets: +- addresses: + - ip: MINIKUBE_IP + ports: + - name: http-metrics + port: 10252 + protocol: TCP diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml new file mode 100644 index 00000000..b3b51f38 --- /dev/null +++ b/manifests/k8s/minikube/kube-scheduler.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 
10251 + protocol: TCP +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +subsets: +- addresses: + - ip: MINIKUBE_IP + ports: + - name: http-metrics + port: 10251 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-controller-manager.yaml b/manifests/k8s/self-hosted/kube-controller-manager.yaml new file mode 100644 index 00000000..2f22a6f2 --- /dev/null +++ b/manifests/k8s/self-hosted/kube-controller-manager.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + selector: + k8s-app: kube-controller-manager + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-dns.yaml b/manifests/k8s/self-hosted/kube-dns.yaml new file mode 100644 index 00000000..36d9a0ad --- /dev/null +++ b/manifests/k8s/self-hosted/kube-dns.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-dns-prometheus-discovery + labels: + k8s-app: kube-dns +spec: + selector: + k8s-app: kube-dns + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics-skydns + port: 10055 + targetPort: 10055 + protocol: TCP + - name: http-metrics-dnsmasq + port: 10054 + targetPort: 10054 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-scheduler.yaml b/manifests/k8s/self-hosted/kube-scheduler.yaml new file mode 100644 index 00000000..331998fe --- /dev/null +++ b/manifests/k8s/self-hosted/kube-scheduler.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + selector: + k8s-app: kube-scheduler + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 10251 + protocol: TCP diff --git a/manifests/prometheus-operator.yaml 
b/manifests/prometheus-operator.yaml new file mode 100644 index 00000000..6c8030eb --- /dev/null +++ b/manifests/prometheus-operator.yaml @@ -0,0 +1,26 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus-operator + labels: + operator: prometheus +spec: + replicas: 1 + template: + metadata: + labels: + operator: prometheus + spec: + containers: + - name: prometheus-operator + image: quay.io/coreos/prometheus-operator:v0.6.0 + args: + - "--kubelet-object=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 300Mi diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml new file mode 100644 index 00000000..08f6dddc --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -0,0 +1,447 @@ +apiVersion: v1 +data: + etcd2.rules: "### General cluster availability ###\n\n# alert if another failed + peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"} + == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity + = \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n + \ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n + \ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to + an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n + \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n + \ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) > + 0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n + \ summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance + {{ $labels.instance }}\",\n }\n\n# alert if more than 5% 
of requests to an HTTP + endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n + \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m])) + \n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) + > 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance + {{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT + HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", + code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) + > 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses + on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile + of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99, + rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS + {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP + requests\",\n description = \"on etcd instance {{ $labels.instance }} HTTP + requests to {{ $labels.method }} are slow\",\n }\n\n### File descriptor alerts + ###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert + if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n + \ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS + {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors + soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance + }} will exhaust its file descriptors 
soon\",\n }\n\n# alert if file descriptors + are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m], + 3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"file descriptors soon exhausted\",\n description = \"{{ + $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors + soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed + proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h]) + > 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary + = \"a high number of failed proposals within the etcd cluster are happening\",\n + \ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }} + proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts + ###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT + HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) + > 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n + \ summary = \"high fsync durations\",\n description = \"etcd instance {{ + $labels.instance }} fsync durations are high\",\n }\n" + kubernetes.rules: |+ + # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
+ + ### Container resources ### + + cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by 
(cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + ### Cluster resources ### + + cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + + cluster:node_cpu_use:percent = + 100 * sum by (cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + + ### API latency ### + + # Raw metrics are in microseconds. Convert to seconds. 
+ cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + + ### Scheduling latency ### + + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + 
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + + ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS 
{ + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + + # Disable for non HA kubernetes setups. + ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + + ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + # Catches the conntrack sysctl tuning being reverted on a node: fires when the UDP conntrack timeout stays above 10 for 10 minutes. + ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + # Some verbs are excluded because their requests are expected to be long-lasting: + # WATCH and WATCHLIST are long-polls, CONNECT is `kubectl exec`.
+ ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + + ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-k8s-rules diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests/prometheus/prometheus-k8s-service.yaml new file mode 100644 index 00000000..a558f30f --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-k8s +spec: + type: NodePort # exposes the "web" port on every node at the nodePort set below + ports: + - name: web + nodePort: 30900 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml new file mode 100644 index 00000000..110dfa42 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-apps: https +spec: +
jobLabel: provider + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-https + labels: + k8s-apps: https +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: https-metrics + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-http + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + - monitoring + endpoints: + - port: http-metrics + interval: 15s + - port: http-metrics-dnsmasq + interval: 15s + - port: http-metrics-skydns + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml new file mode 100644 index 00000000..9054ea58 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: Prometheus +metadata: + name: k8s + labels: + prometheus: k8s +spec: + replicas: 2 + version: v1.5.2 + serviceMonitorSelector: + matchExpressions: + - {key: k8s-apps, operator: Exists} # picks up every ServiceMonitor that carries a k8s-apps label + resources: + requests: + # 2Gi is default, but won't schedule if you don't have a node with >2Gi + # memory. Modify based on your target and time-series count for + # production use.
This value is mainly meant for demonstration/testing + # purposes. + memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web