Merge remote-tracking branch 'upstream/master'

commit 29d696ca08
Author: Eduardo Gonzalez
Date:   2017-08-02 11:19:35 +02:00
13 changed files with 221 additions and 49 deletions

Makefile (new file)

@@ -0,0 +1,3 @@
+generate:
+	@echo ">> Compiling assets and generating Kubernetes manifests"
+	@hack/scripts/generate-manifests.sh
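Usage is a single make invocation from the repository root; the second line below is simply the echo output defined in the target itself:

```sh
$ make generate
>> Compiling assets and generating Kubernetes manifests
```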

@@ -8,3 +8,35 @@ ALERT NodeExporterDown
     summary = "node-exporter cannot be scraped",
     description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
+ALERT K8SNodeOutOfDisk
+  IF kube_node_status_out_of_disk{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Node ran out of disk space.",
+    description = "{{ $labels.node }} has run out of disk space.",
+  }
+
+ALERT K8SNodeMemoryPressure
+  IF kube_node_status_memory_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under memory pressure.",
+    description = "{{ $labels.node }} is under memory pressure.",
+  }
+
+ALERT K8SNodeDiskPressure
+  IF kube_node_status_disk_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under disk pressure.",
+    description = "{{ $labels.node }} is under disk pressure.",
+  }
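Edited rule files can be linted before the manifests are regenerated. A sketch using the Prometheus 1.x `promtool` (the file name here is illustrative; the rules live under `assets/prometheus/rules/` as described in the docs below):

```sh
promtool check-rules assets/prometheus/rules/node.rules
```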

@@ -0,0 +1,27 @@
+# Developing Alerts and Dashboards
+
+`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might like to extend them. This document describes the workflow for adding further alerting rules and dashboards.
+
+Both the Prometheus alerting rules and the Grafana dashboards are shipped as Kubernetes `ConfigMap`s that are generated from content in the `assets/` directory.
+
+The source of truth for the alerts and dashboards are the files in the `assets/` directory. Change the respective files there, then run `make generate` to re-generate the Kubernetes manifests.
+
+## Alerts
+
+The generated `ConfigMap` holding the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
+
+It is generated by bundling all the `*.rules` files in the `assets/prometheus/rules/` directory into the `ConfigMap`.
+
+To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests. To modify existing rules, edit the respective `.rules` file and re-generate.
+
+The generated manifest can then be applied against a Kubernetes cluster.
+
+## Dashboards
+
+The generated `ConfigMap` holding the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
+
+As Grafana's support for loading dashboards from files is limited, a sidecar called "grafana-watcher" was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database stays in sync with them.
+
+To edit or create a dashboard, log in to Grafana and modify and save the dashboard there. Then download the dashboard definition through `Share` -> `Export` -> `Save to file`, move the file to `assets/grafana/`, and re-generate the manifests.
+
+The generated manifest can then be applied against a Kubernetes cluster.
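Concretely, the documented alert workflow amounts to something like the following (a sketch; the rule file name is hypothetical):

```sh
# add or edit a rules file; the assets/ directory is the source of truth
$EDITOR assets/prometheus/rules/my-app.rules

# re-generate the Kubernetes manifests from assets/
make generate

# apply the regenerated rules ConfigMap
kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s-rules.yaml
```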

@@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter
 kctl apply -f manifests/kube-state-metrics
 kctl apply -f manifests/grafana/grafana-credentials.yaml
 kctl apply -f manifests/grafana
-kctl apply -f manifests/prometheus/
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
+kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl apply -f manifests/alertmanager/
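The `kctl` helper is defined earlier in these scripts (its definition appears as hunk context in the teardown diff below); presumably it pins the namespace, along these lines (a sketch, not the verbatim definition):

```sh
# wrap kubectl so every call targets the deployment namespace
kctl() {
    kubectl --namespace "$NAMESPACE" "$@"
}
```

The roles and role-bindings are applied with plain `kubectl` rather than `kctl` because those manifests carry their own explicit `namespace` fields (monitoring, kube-system, default) and must not be forced into `$NAMESPACE`.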

@@ -15,7 +15,9 @@ kctl() {
 kctl delete -f manifests/node-exporter
 kctl delete -f manifests/kube-state-metrics
 kctl delete -f manifests/grafana
-kctl delete -f manifests/prometheus
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
+kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl delete -f manifests/alertmanager
 # Hack: wait a bit to let the controller delete the deployed Prometheus server.

@@ -11,7 +11,7 @@ spec:
     spec:
       containers:
      - name: grafana
-        image: grafana/grafana:4.1.1
+        image: grafana/grafana:4.4.1
         env:
         - name: GF_AUTH_BASIC_ENABLED
           value: "true"
@@ -41,7 +41,7 @@ spec:
             memory: 200Mi
             cpu: 200m
       - name: grafana-watcher
-        image: quay.io/coreos/grafana-watcher:v0.0.5
+        image: quay.io/coreos/grafana-watcher:v0.0.6
         args:
         - '--watch-dir=/var/grafana-dashboards'
         - '--grafana-url=http://localhost:3000'
@@ -56,9 +56,6 @@ spec:
             secretKeyRef:
               name: grafana-credentials
               key: password
-        volumeMounts:
-        - name: grafana-dashboards
-          mountPath: /var/grafana-dashboards
         resources:
           requests:
             memory: "16Mi"

@@ -1,9 +1,9 @@
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
-  name: prometheus-operator
   labels:
     k8s-app: prometheus-operator
+  name: prometheus-operator
 spec:
   replicas: 1
   template:
@@ -11,20 +11,20 @@ spec:
       labels:
         k8s-app: prometheus-operator
     spec:
-      serviceAccountName: prometheus-operator
       containers:
-      - name: prometheus-operator
-        image: quay.io/coreos/prometheus-operator:v0.10.1
-        args:
-        - "--kubelet-service=kube-system/kubelet"
-        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
+      - args:
+        - --kubelet-service=kube-system/kubelet
+        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
+        image: quay.io/coreos/prometheus-operator:v0.11.1
+        name: prometheus-operator
         ports:
-        - name: http
-          containerPort: 8080
+        - containerPort: 8080
+          name: http
         resources:
+          limits:
+            cpu: 200m
+            memory: 100Mi
           requests:
             cpu: 100m
             memory: 50Mi
-          limits:
-            cpu: 200m
-            memory: 300Mi
+      serviceAccountName: prometheus-operator
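The deployment runs under a dedicated service account (`serviceAccountName: prometheus-operator`). The matching `ServiceAccount` manifest is not part of this hunk; presumably it looks along these lines (a sketch, assuming the monitoring namespace used elsewhere in these manifests):

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-operator
  namespace: monitoring  # assumption: same namespace as the deployment
```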

@@ -1,12 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
-- kind: ServiceAccount
-  name: prometheus-k8s
-  namespace: monitoring

@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRole
-metadata:
-  name: prometheus
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups: [""]
-  resources:
-  - configmaps
-  verbs: ["get"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]

@@ -0,0 +1,54 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus-k8s
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring

@@ -0,0 +1,51 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: default
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: prometheus-k8s
+rules:
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]

@@ -582,6 +582,38 @@ data:
         summary = "node-exporter cannot be scraped",
         description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
       }
+    ALERT K8SNodeOutOfDisk
+      IF kube_node_status_out_of_disk{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "Node ran out of disk space.",
+        description = "{{ $labels.node }} has run out of disk space.",
+      }
+
+    ALERT K8SNodeMemoryPressure
+      IF kube_node_status_memory_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under memory pressure.",
+        description = "{{ $labels.node }} is under memory pressure.",
+      }
+
+    ALERT K8SNodeDiskPressure
+      IF kube_node_status_disk_pressure{condition="true"} == 1
+      LABELS {
+        service = "k8s",
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "Node is under disk pressure.",
+        description = "{{ $labels.node }} is under disk pressure.",
+      }
   prometheus.rules: |+
     ALERT FailedReload
       IF prometheus_config_last_reload_successful == 0

@@ -9,6 +9,8 @@ spec:
   endpoints:
   - port: http-metrics
     interval: 30s
+  - port: cadvisor
+    interval: 30s
     honorLabels: true
   selector:
     matchLabels:
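With the kubelet's cadvisor port scraped, container-level resource metrics become queryable in Prometheus. A hedged example using a standard cadvisor metric (label names as exposed by cadvisor in this era):

```
# CPU usage per pod, averaged over 5m windows
sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace, pod_name)
```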