Merge remote-tracking branch 'upstream/master'

commit 29d696ca08
Author: Eduardo Gonzalez
Date:   2017-08-02 11:19:35 +02:00
13 changed files with 221 additions and 49 deletions

Makefile (new file)

@@ -0,0 +1,3 @@
+generate:
+	@echo ">> Compiling assets and generating Kubernetes manifests"
+	@hack/scripts/generate-manifests.sh
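Usage is a single make invocation from the repository root; the second line below is simply the echo output defined in the target itself:

```sh
$ make generate
>> Compiling assets and generating Kubernetes manifests
```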

@@ -8,3 +8,35 @@ ALERT NodeExporterDown
     summary = "node-exporter cannot be scraped",
     description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
+ALERT K8SNodeOutOfDisk
+  IF kube_node_status_out_of_disk{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Node ran out of disk space.",
+    description = "{{ $labels.node }} has run out of disk space.",
+  }
+
+ALERT K8SNodeMemoryPressure
+  IF kube_node_status_memory_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under memory pressure.",
+    description = "{{ $labels.node }} is under memory pressure.",
+  }
+
+ALERT K8SNodeDiskPressure
+  IF kube_node_status_disk_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under disk pressure.",
+    description = "{{ $labels.node }} is under disk pressure.",
+  }
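Edited rule files can be linted before the manifests are regenerated. A sketch using the Prometheus 1.x `promtool` (the file name here is illustrative; the rules live under `assets/prometheus/rules/` as described in the docs below):

```sh
promtool check-rules assets/prometheus/rules/node.rules
```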

@@ -0,0 +1,27 @@
+# Developing Alerts and Dashboards
+
+`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might like to extend them. This document describes the workflow for adding further alerting rules and dashboards.
+
+Both the Prometheus alerting rules and the Grafana dashboards are shipped as Kubernetes `ConfigMap`s that are generated from content in the `assets/` directory.
+
+The source of truth for the alerts and dashboards are the files in the `assets/` directory. Change the respective files there, then run `make generate` to re-generate the Kubernetes manifests.
+
+## Alerts
+
+The generated `ConfigMap` holding the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
+
+It is generated by bundling all the `*.rules` files in the `assets/prometheus/rules/` directory into the `ConfigMap`.
+
+To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests. To modify existing rules, edit the respective `.rules` file and re-generate.
+
+The generated manifest can then be applied against a Kubernetes cluster.
+
+## Dashboards
+
+The generated `ConfigMap` holding the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
+
+As Grafana's support for loading dashboards from files is limited, a sidecar called "grafana-watcher" was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database stays in sync with them.
+
+To edit or create a dashboard, log in to Grafana and modify and save the dashboard there. Then download the dashboard definition through `Share` -> `Export` -> `Save to file`, move the file to `assets/grafana/`, and re-generate the manifests.
+
+The generated manifest can then be applied against a Kubernetes cluster.
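Concretely, the documented alert workflow amounts to something like the following (a sketch; the rule file name is hypothetical):

```sh
# add or edit a rules file; the assets/ directory is the source of truth
$EDITOR assets/prometheus/rules/my-app.rules

# re-generate the Kubernetes manifests from assets/
make generate

# apply the regenerated rules ConfigMap
kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s-rules.yaml
```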

@@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter
 kctl apply -f manifests/kube-state-metrics
 kctl apply -f manifests/grafana/grafana-credentials.yaml
 kctl apply -f manifests/grafana
-kctl apply -f manifests/prometheus/
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
+kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl apply -f manifests/alertmanager/
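The `kctl` helper is defined earlier in these scripts (its definition appears as hunk context in the teardown diff below); presumably it pins the namespace, along these lines (a sketch, not the verbatim definition):

```sh
# wrap kubectl so every call targets the deployment namespace
kctl() {
    kubectl --namespace "$NAMESPACE" "$@"
}
```

The roles and role-bindings are applied with plain `kubectl` rather than `kctl` because those manifests carry their own explicit `namespace` fields (monitoring, kube-system, default) and must not be forced into `$NAMESPACE`.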

@@ -15,7 +15,9 @@ kctl() {
 kctl delete -f manifests/node-exporter
 kctl delete -f manifests/kube-state-metrics
 kctl delete -f manifests/grafana
-kctl delete -f manifests/prometheus
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
+kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl delete -f manifests/alertmanager
 # Hack: wait a bit to let the controller delete the deployed Prometheus server.

@@ -11,7 +11,7 @@ spec:
     spec:
       containers:
      - name: grafana
-        image: grafana/grafana:4.1.1
+        image: grafana/grafana:4.4.1
         env:
         - name: GF_AUTH_BASIC_ENABLED
           value: "true"
@@ -41,7 +41,7 @@ spec:
             memory: 200Mi
             cpu: 200m
       - name: grafana-watcher
-        image: quay.io/coreos/grafana-watcher:v0.0.5
+        image: quay.io/coreos/grafana-watcher:v0.0.6
         args:
         - '--watch-dir=/var/grafana-dashboards'
         - '--grafana-url=http://localhost:3000'
@@ -56,9 +56,6 @@ spec:
             secretKeyRef:
               name: grafana-credentials
               key: password
-        volumeMounts:
-        - name: grafana-dashboards
-          mountPath: /var/grafana-dashboards
         resources:
           requests:
             memory: "16Mi"

@@ -1,9 +1,9 @@
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
-  name: prometheus-operator
   labels:
     k8s-app: prometheus-operator
+  name: prometheus-operator
 spec:
   replicas: 1
   template:
@@ -11,20 +11,20 @@ spec:
       labels:
         k8s-app: prometheus-operator
     spec:
-      serviceAccountName: prometheus-operator
       containers:
-      - name: prometheus-operator
-        image: quay.io/coreos/prometheus-operator:v0.10.1
-        args:
-        - "--kubelet-service=kube-system/kubelet"
-        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
+      - args:
+        - --kubelet-service=kube-system/kubelet
+        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
+        image: quay.io/coreos/prometheus-operator:v0.11.1
+        name: prometheus-operator
         ports:
-        - name: http
-          containerPort: 8080
+        - containerPort: 8080
+          name: http
         resources:
+          limits:
+            cpu: 200m
+            memory: 100Mi
           requests:
             cpu: 100m
             memory: 50Mi
-          limits:
-            cpu: 200m
-            memory: 300Mi
+      serviceAccountName: prometheus-operator
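The deployment runs under a dedicated service account (`serviceAccountName: prometheus-operator`). The matching `ServiceAccount` manifest is not part of this hunk; presumably it looks along these lines (a sketch, assuming the monitoring namespace used elsewhere in these manifests):

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-operator
  namespace: monitoring  # assumption: same namespace as the deployment
```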

@@ -1,12 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
-- kind: ServiceAccount
-  name: prometheus-k8s
-  namespace: monitoring

@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRole
-metadata:
-  name: prometheus
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups: [""]
-  resources:
-  - configmaps
-  verbs: ["get"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]

@@ -0,0 +1,54 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus-k8s
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring

@@ -0,0 +1,51 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: default
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: prometheus-k8s
+rules:
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]

@@ -582,6 +582,38 @@ data:
         summary = "node-exporter cannot be scraped",
         description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
       }
+    ALERT K8SNodeOutOfDisk
+      IF kube_node_status_out_of_disk{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "Node ran out of disk space.",
+        description = "{{ $labels.node }} has run out of disk space.",
+      }
+
+    ALERT K8SNodeMemoryPressure
+      IF kube_node_status_memory_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under memory pressure.",
+        description = "{{ $labels.node }} is under memory pressure.",
+      }
+
+    ALERT K8SNodeDiskPressure
+      IF kube_node_status_disk_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under disk pressure.",
+        description = "{{ $labels.node }} is under disk pressure.",
+      }
   prometheus.rules: |+
     ALERT FailedReload
       IF prometheus_config_last_reload_successful == 0

@@ -9,6 +9,8 @@ spec:
   endpoints:
   - port: http-metrics
     interval: 30s
+  - port: cadvisor
+    interval: 30s
     honorLabels: true
   selector:
     matchLabels:
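With the kubelet's cadvisor port scraped, container-level resource metrics become queryable in Prometheus. A hedged example using a standard cadvisor metric (label names as exposed by cadvisor in this era):

```
# CPU usage per pod, averaged over 5m windows
sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace, pod_name)
```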