From c0a3447c01825dbb75a928b0d23c03a3a70ebcd1 Mon Sep 17 00:00:00 2001 From: Gabi Davar Date: Sat, 8 Jul 2017 14:29:23 +0300 Subject: [PATCH 01/10] remove duplicate `volumeMount` entry. --- manifests/grafana/grafana-deployment.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a45f893e..6f3e72d4 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -56,9 +56,6 @@ spec: secretKeyRef: name: grafana-credentials key: password - volumeMounts: - - name: grafana-dashboards - mountPath: /var/grafana-dashboards resources: requests: memory: "16Mi" From eae253db74ce10117dbef9a8ee3c6cc513ba043a Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 12 Jul 2017 12:57:32 -0400 Subject: [PATCH 02/10] Scrapes cAdvisor port for metrics in Kubernetes 1.7 --- .../prometheus/prometheus-k8s-service-monitor-kubelet.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index cdc3ffb6..0eac9630 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -9,6 +9,8 @@ spec: endpoints: - port: http-metrics interval: 30s + - port: cadvisor + interval: 30s honorLabels: true selector: matchLabels: From c97a329792e65665a2ad1ee2fab11b14bd25d3d9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 29 Jun 2017 16:32:51 +0200 Subject: [PATCH 03/10] kube-prometheus: run prometheus-k8s with only those roles it needs --- hack/cluster-monitoring/deploy | 4 +- hack/cluster-monitoring/teardown | 4 +- .../prometheus-cluster-role-binding.yaml | 12 ----- .../prometheus/prometheus-cluster-role.yaml | 18 ------- .../prometheus-k8s-role-bindings.yaml | 54 +++++++++++++++++++ .../prometheus/prometheus-k8s-roles.yaml | 50 
+++++++++++++++++ 6 files changed, 110 insertions(+), 32 deletions(-) delete mode 100644 manifests/prometheus/prometheus-cluster-role-binding.yaml delete mode 100644 manifests/prometheus/prometheus-cluster-role.yaml create mode 100644 manifests/prometheus/prometheus-k8s-role-bindings.yaml create mode 100644 manifests/prometheus/prometheus-k8s-roles.yaml diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index c565d442..9176b956 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter kctl apply -f manifests/kube-state-metrics kctl apply -f manifests/grafana/grafana-credentials.yaml kctl apply -f manifests/grafana -kctl apply -f manifests/prometheus/ +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; +kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml kctl apply -f manifests/alertmanager/ diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index 9fcc4513..ac4d222d 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -15,7 +15,9 @@ kctl() { kctl delete -f manifests/node-exporter kctl delete -f manifests/kube-state-metrics kctl delete -f manifests/grafana -kctl delete -f manifests/prometheus +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; +kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml kctl delete -f manifests/alertmanager # Hack: wait a bit to let the controller delete the deployed Prometheus server. 
diff --git a/manifests/prometheus/prometheus-cluster-role-binding.yaml b/manifests/prometheus/prometheus-cluster-role-binding.yaml deleted file mode 100644 index 3600490f..00000000 --- a/manifests/prometheus/prometheus-cluster-role-binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml deleted file mode 100644 index a85422ec..00000000 --- a/manifests/prometheus/prometheus-cluster-role.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests/prometheus/prometheus-k8s-role-bindings.yaml new file mode 100644 index 00000000..5f190e7a --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-role-bindings.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: 
rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml new file mode 100644 index 00000000..7a3efa90 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -0,0 +1,50 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: monitoring +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: kube-system +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: default +rules: +- apiGroups: [""] + resources: + - services + - endpoints + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus-k8s +rules: +- nonResourceURLs: ["/metrics"] + verbs: ["get"] From 6571c71e8ca00cced4373ae664b8da99f0437fcb Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 20 Jul 2017 17:49:54 +0200 Subject: [PATCH 04/10] *: bump version to v0.11.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index d574b89f..64635192 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.10.1 + image: quay.io/coreos/prometheus-operator:v0.11.0 args: - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From 817bd1da972610caa0d822100cd4782d9b19db16 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 24 Jul 2017 15:25:31 +0200 Subject: [PATCH 05/10] kube-prometheus/docs: add docs on how to modify assets --- Makefile | 3 +++ docs/developing-alerts-and-dashboards.md | 27 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 Makefile create mode 100644 docs/developing-alerts-and-dashboards.md diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..04bd205a --- /dev/null +++ b/Makefile @@ -0,0 +1,3 @@ +generate: + @echo ">> Compiling assets and generating Kubernetes manifests" + @hack/scripts/generate-manifests.sh diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md new file mode 100644 index 00000000..80630940 --- /dev/null +++ b/docs/developing-alerts-and-dashboards.md @@ -0,0 +1,27 @@ +# Developing Alerts and Dashboards + +`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might like to extend them. This document is intended to explain the workflow of how additional alerting rules and dashboards could be added. + +For both, the Prometheus alerting rules as well as the Grafana dashboards, there are Kubernetes `ConfigMap`s, that are generated from content in the `assets/` directory. 
+ +The files in the `assets/` directory are the source of truth for the alerts and dashboards. The respective files have to be changed there, and then the `make generate` target has to be executed to re-generate the Kubernetes manifests. + +## Alerts + +The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. + +It is generated by taking all the `*.rules` files in the `assets/prometheus/rules/` directory and generating the `ConfigMap` from them. + +To extend the alerting rules, simply add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests. To modify the existing rules, simply edit the respective `.rules` file and re-generate the manifests. + +Then the generated manifest can be applied against a Kubernetes cluster. + +## Dashboards + +The `ConfigMap` that is generated and holds the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`. + +As Grafana's support for applying dashboards from files is limited, a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with the dashboard definitions. + +To edit or create a dashboard, log in to Grafana, then modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests. + +Then the generated manifest can be applied against a Kubernetes cluster. 
From f63473a2697e9b730cc32832527c4068bb893c05 Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Mon, 10 Jul 2017 13:07:27 +0800 Subject: [PATCH 06/10] add grafana chart for kube-prometheus --- manifests/grafana/grafana-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a45f893e..05ca87eb 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.1.1 + image: grafana/grafana:4.4.1 env: - name: GF_AUTH_BASIC_ENABLED value: "true" @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.5 + image: quay.io/coreos/grafana-watcher:v0.0.6 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From 7010e32130932372c87562881bb0d4908cc12ad0 Mon Sep 17 00:00:00 2001 From: Zachary Yonash Date: Thu, 27 Jul 2017 03:49:25 -0400 Subject: [PATCH 07/10] Added a few extra node rules (#478) --- assets/prometheus/rules/node.rules | 32 +++++++++++++++++++ .../prometheus/prometheus-k8s-rules.yaml | 32 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 36ea482c..54085392 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -8,3 +8,35 @@ ALERT NodeExporterDown summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } +ALERT K8SNodeOutOfDisk + IF kube_node_status_out_of_disk{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Node ran out of disk space.", + description = "{{ $labels.node }} has run out of disk space.", + } + +ALERT 
K8SNodeMemoryPressure + IF kube_node_status_memory_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under memory pressure.", + description = "{{ $labels.node }} is under memory pressure.", + } + +ALERT K8SNodeDiskPressure + IF kube_node_status_disk_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under disk pressure.", + description = "{{ $labels.node }} is under disk pressure.", + } diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 181a70c7..e1740562 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -582,6 +582,38 @@ data: summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } + ALERT K8SNodeOutOfDisk + IF kube_node_status_out_of_disk{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Node ran out of disk space.", + description = "{{ $labels.node }} has run out of disk space.", + } + + ALERT K8SNodeMemoryPressure + IF kube_node_status_memory_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under memory pressure.", + description = "{{ $labels.node }} is under memory pressure.", + } + + ALERT K8SNodeDiskPressure + IF kube_node_status_disk_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under disk pressure.", + description = "{{ $labels.node }} is under disk pressure.", + } prometheus.rules: |+ ALERT FailedReload IF prometheus_config_last_reload_successful == 0 From 154456ad799b1b12c97d6956cc4e2d1dfcb25d74 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk 
Date: Thu, 27 Jul 2017 14:17:57 +0200 Subject: [PATCH 08/10] generate Prometheus Operator deployments with jsonnet (#508) * *: use jsonnet to generate manifests * generate Prometheus Operator manifests with jsonnet * add jsonnet dockenfile for generating with jenkins --- .../prometheus-operator.yaml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 64635192..b2c37fd1 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -1,9 +1,9 @@ apiVersion: extensions/v1beta1 kind: Deployment metadata: - name: prometheus-operator labels: k8s-app: prometheus-operator + name: prometheus-operator spec: replicas: 1 template: @@ -11,20 +11,20 @@ spec: labels: k8s-app: prometheus-operator spec: - serviceAccountName: prometheus-operator containers: - - name: prometheus-operator + - args: + - --kubelet-service=kube-system/kubelet + - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 image: quay.io/coreos/prometheus-operator:v0.11.0 - args: - - "--kubelet-service=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + name: prometheus-operator ports: - - name: http - containerPort: 8080 + - containerPort: 8080 + name: http resources: + limits: + cpu: 200m + memory: 100Mi requests: cpu: 100m memory: 50Mi - limits: - cpu: 200m - memory: 300Mi + serviceAccountName: prometheus-operator From caeaaf52ea613f40d8c2e8913cca9fd9e24a96ff Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 28 Jul 2017 12:06:00 +0200 Subject: [PATCH 09/10] *: bump version --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index b2c37fd1..29bbf746 
100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.11.0 + image: quay.io/coreos/prometheus-operator:v0.11.1 name: prometheus-operator ports: - containerPort: 8080 From d34811cbe9c2656d83483bc3399d2812a7e22234 Mon Sep 17 00:00:00 2001 From: xvzup Date: Tue, 1 Aug 2017 18:10:23 +0200 Subject: [PATCH 10/10] Update prometheus-k8s-roles.yaml Resource pods added to role prometheus-k8s for namespace default. This is required to monitor kube-apiserver. --- manifests/prometheus/prometheus-k8s-roles.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml index 7a3efa90..14302ea0 100644 --- a/manifests/prometheus/prometheus-k8s-roles.yaml +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -39,6 +39,7 @@ rules: resources: - services - endpoints + - pods verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1beta1