diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..6e12307c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,37 @@ + + +## Description + +_Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request. +If it fixes a bug or resolves a feature request, be sure to link to that issue._ + + + +## Type of change + +_What type of changes does your code introduce to the kube-prometheus? Put an `x` in the box that apply._ + +- [ ] `CHANGE` (fix or feature that would cause existing functionality to not work as expected) +- [ ] `FEATURE` (non-breaking change which adds functionality) +- [ ] `BUGFIX` (non-breaking change which fixes an issue) +- [ ] `ENHANCEMENT` (non-breaking change which improves existing functionality) +- [ ] `NONE` (if none of the other choices apply. Example, tooling, build system, CI, docs, etc.) + +## Changelog entry + +_Please put a one-line changelog entry below. Later this will be copied to the changelog file._ + + + +```release-note + +``` diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 54a8e50c..e4b5c79e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,8 +3,8 @@ on: - push - pull_request env: - golang-version: '1.13' - kind-version: 'v0.9.0' + golang-version: '1.15' + kind-version: 'v0.11.1' jobs: generate: runs-on: ${{ matrix.os }} @@ -16,15 +16,35 @@ jobs: name: Generate steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - uses: actions/setup-go@v2 with: go-version: ${{ env.golang-version }} - - run: make --always-make generate && git diff --exit-code + - run: make --always-make generate validate && git diff --exit-code + lint: + runs-on: ubuntu-latest + name: Jsonnet linter + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: false + - run: make --always-make lint + fmt: + runs-on: ubuntu-latest + name: Jsonnet formatter + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: false + - run: make --always-make fmt && git diff --exit-code unit-tests: runs-on: ubuntu-latest name: Unit tests steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - run: make --always-make test e2e-tests: name: E2E tests @@ -32,21 +52,20 @@ jobs: strategy: matrix: kind-image: - - 'kindest/node:v1.19.0' + - 'kindest/node:v1.21.1' + - 'kindest/node:v1.22.0' steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - name: Start KinD uses: engineerd/setup-kind@v0.5.0 with: version: ${{ env.kind-version }} image: ${{ matrix.kind-image }} + wait: 300s - name: Wait for cluster to finish bootstraping - run: | - until [ "$(kubectl get pods --all-namespaces --no-headers | grep -cEv '([0-9]+)/\1')" -eq 0 ]; do - sleep 5s - done - kubectl cluster-info - kubectl get pods -A + run: kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout=300s - name: Create kube-prometheus stack run: | kubectl create -f manifests/setup diff --git a/.github/workflows/versions.yaml b/.github/workflows/versions.yaml new file mode 100644 index 00000000..0230d441 --- /dev/null +++ b/.github/workflows/versions.yaml @@ -0,0 +1,68 @@ +name: Upgrade to latest versions + +on: + workflow_dispatch: + schedule: + - cron: '37 7 * * 1' +jobs: + versions: + runs-on: ubuntu-latest + strategy: + matrix: + branch: + - 'release-0.6' + - 'release-0.7' + - 'release-0.8' + - 'release-0.9' + - 'main' + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ matrix.branch }} + - uses: actions/setup-go@v2 + with: + go-version: 1.16 + - name: Upgrade versions + run: | + export GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} + # Write to temporary file to make update atomic + scripts/generate-versions.sh > /tmp/versions.json + mv /tmp/versions.json jsonnet/kube-prometheus/versions.json + if: matrix.branch == 'main' + - name: Update jsonnet dependencies + run: | + make update + make generate + + # Reset jsonnetfile.lock.json if no dependencies were updated + changedFiles=$(git diff --name-only | grep -v 'jsonnetfile.lock.json' | wc -l) + if [[ "$changedFiles" -eq 0 ]]; then + git checkout -- jsonnetfile.lock.json; + fi + - name: Create Pull Request + uses: peter-evans/create-pull-request@v3 + with: + commit-message: "[bot] [${{ matrix.branch }}] Automated version update" + title: "[bot] [${{ matrix.branch }}] Automated version update" + body: | + ## Description + + This is an automated version and jsonnet dependencies update performed from CI. + + Configuration of the workflow is located in `.github/workflows/versions.yaml` + + ## Type of change + + - [x] `NONE` (if none of the other choices apply. Example, tooling, build system, CI, docs, etc.) + + ## Changelog entry + + ```release-note + + ``` + team-reviewers: kube-prometheus-reviewers + branch: automated-updates-${{ matrix.branch }} + delete-branch: true + # GITHUB_TOKEN cannot be used as it won't trigger CI in a created PR + # More in https://github.com/peter-evans/create-pull-request/issues/155 + token: ${{ secrets.PROM_OP_BOT_PAT }} diff --git a/.gitignore b/.gitignore index f334fb56..a82ceced 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ minikube-manifests/ vendor/ ./auth .swp +crdschemas/ + +.gitpod/_output/ \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 00000000..936bc53a --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,47 @@ +image: gitpod/workspace-full +checkoutLocation: gitpod-k3s +tasks: + - init: | + make --always-make + export PATH="$(pwd)/tmp/bin:${PATH}" + cat > ${PWD}/.git/hooks/pre-commit < /dev/null 2>&1 + echo "Checking if manifests are correct" + make generate > /dev/null 2>&1 + + git diff --exit-code + if [[ \$? == 1 ]]; then + echo " + + This commit is being rejected because the YAML manifests are incorrect or jsonnet needs to be formatted." + echo "Please commit your changes again!" + exit 1 + fi + EOF + chmod +x ${PWD}/.git/hooks/pre-commit + - name: run kube-prometheus + command: | + .gitpod/prepare-k3s.sh + .gitpod/deploy-kube-prometheus.sh + - name: kernel dev environment + init: | + sudo apt update -y + sudo apt install qemu qemu-system-x86 linux-image-$(uname -r) libguestfs-tools sshpass netcat -y + sudo curl -o /usr/bin/kubectl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + sudo chmod +x /usr/bin/kubectl + .gitpod/prepare-rootfs.sh + command: | + .gitpod/qemu.sh +ports: + - port: 3000 + onOpen: open-browser + - port: 9090 + onOpen: open-browser + - port: 9093 + onOpen: open-browser +vscode: + extensions: + - heptio.jsonnet@0.1.0:woEDU5N62LRdgdz0g/I6sQ== \ No newline at end of file diff --git a/.gitpod/deploy-kube-prometheus.sh b/.gitpod/deploy-kube-prometheus.sh new file mode 100755 index 00000000..fdd9c1d2 --- /dev/null +++ b/.gitpod/deploy-kube-prometheus.sh @@ -0,0 +1,16 @@ +kubectl apply -f manifests/setup + +# Safety wait for CRDs to be working +sleep 30 + +kubectl apply -f manifests/ + +kubectl rollout status -n monitoring daemonset node-exporter +kubectl rollout status -n monitoring statefulset alertmanager-main +kubectl rollout status -n monitoring statefulset prometheus-k8s +kubectl rollout status -n monitoring deployment grafana +kubectl rollout status -n monitoring deployment kube-state-metrics + +kubectl port-forward -n monitoring svc/grafana 3000 > /dev/null 2>&1 & +kubectl port-forward -n monitoring svc/alertmanager-main 9093 > /dev/null 2>&1 & +kubectl port-forward -n monitoring svc/prometheus-k8s 9090 > /dev/null 2>&1 & \ No newline at end of file diff --git a/.gitpod/prepare-k3s.sh b/.gitpod/prepare-k3s.sh new file mode 100755 index 00000000..ccfd658a --- /dev/null +++ b/.gitpod/prepare-k3s.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +rootfslock="${script_dirname}/_output/rootfs/rootfs-ready.lock" +k3sreadylock="${script_dirname}/_output/rootfs/k3s-ready.lock" + +if test -f "${k3sreadylock}"; then + exit 0 +fi + +cd $script_dirname + +function waitssh() { + while ! nc -z 127.0.0.1 2222; do + sleep 0.1 + done + ./ssh.sh "whoami" &>/dev/null + if [ $? -ne 0 ]; then + sleep 1 + waitssh + fi +} + +function waitrootfs() { + while ! test -f "${rootfslock}"; do + sleep 0.1 + done +} + +echo "πŸ”₯ Installing everything, this will be done only one time per workspace." + +echo "Waiting for the rootfs to become available, it can take a while, open the terminal #2 for progress" +waitrootfs +echo "βœ… rootfs available" + +echo "Waiting for the ssh server to become available, it can take a while, after this k3s is getting installed" +waitssh +echo "βœ… ssh server available" + +./ssh.sh "curl -sfL https://get.k3s.io | sh -" + +mkdir -p ~/.kube +./scp.sh root@127.0.0.1:/etc/rancher/k3s/k3s.yaml ~/.kube/config + +echo "βœ… k3s server is ready" +touch "${k3sreadylock}" + +# safety wait for cluster availability +sleep 30s \ No newline at end of file diff --git a/.gitpod/prepare-rootfs.sh b/.gitpod/prepare-rootfs.sh new file mode 100755 index 00000000..c67e9a77 --- /dev/null +++ b/.gitpod/prepare-rootfs.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -euo pipefail + +img_url="https://cloud-images.ubuntu.com/hirsute/current/hirsute-server-cloudimg-amd64.tar.gz" + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +outdir="${script_dirname}/_output/rootfs" + +rm -Rf $outdir +mkdir -p $outdir + +curl -L -o "${outdir}/rootfs.tar.gz" $img_url + +cd $outdir + +tar -xvf rootfs.tar.gz + +qemu-img resize hirsute-server-cloudimg-amd64.img +20G + +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command 'resize2fs /dev/sda' + +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --root-password password:root + +netconf=" +network: + version: 2 + renderer: networkd + ethernets: + enp0s3: + dhcp4: yes +" + +# networking setup +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "echo '${netconf}' > /etc/netplan/01-net.yaml" + +# copy kernel modules +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --copy-in /lib/modules/$(uname -r):/lib/modules + +# ssh +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command 'apt remove openssh-server -y && apt install openssh-server -y' +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config" +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config" + +# mark as ready +touch rootfs-ready.lock + +echo "k3s development environment is ready" diff --git a/.gitpod/qemu.sh b/.gitpod/qemu.sh new file mode 100755 index 00000000..f4256439 --- /dev/null +++ b/.gitpod/qemu.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -xeuo pipefail + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +outdir="${script_dirname}/_output" + +sudo qemu-system-x86_64 -kernel "/boot/vmlinuz" \ +-boot c -m 3073M -hda "${outdir}/rootfs/hirsute-server-cloudimg-amd64.img" \ +-net user \ +-smp 8 \ +-append "root=/dev/sda rw console=ttyS0,115200 acpi=off nokaslr" \ +-nic user,hostfwd=tcp::2222-:22,hostfwd=tcp::6443-:6443 \ +-serial mon:stdio -display none \ No newline at end of file diff --git a/.gitpod/scp.sh b/.gitpod/scp.sh new file mode 100755 index 00000000..2295c3c0 --- /dev/null +++ b/.gitpod/scp.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sshpass -p 'root' scp -o StrictHostKeychecking=no -P 2222 $@ \ No newline at end of file diff --git a/.gitpod/ssh.sh b/.gitpod/ssh.sh new file mode 100755 index 00000000..b4d2ca8c --- /dev/null +++ b/.gitpod/ssh.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sshpass -p 'root' ssh -o StrictHostKeychecking=no -p 2222 root@127.0.0.1 "$@" \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..db7a1140 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,44 @@ +## release-0.9 / 2021-08-19 + +* [CHANGE] Test against Kubernetes 1.21 and 1,22. #1161 #1337 +* [CHANGE] Drop cAdvisor metrics without (pod, namespace) label pairs. #1250 +* [CHANGE] Excluded deprecated `etcd_object_counts` metric. #1337 +* [FEATURE] Add PodDisruptionBudget to prometheus-adapter. #1136 +* [FEATURE] Add support for feature flags in Prometheus. #1129 +* [FEATURE] Add env parameter for grafana component. #1171 +* [FEATURE] Add gitpod deployment of kube-prometheus on k3s. #1211 +* [FEATURE] Add resource requests and limits to prometheus-adapter container. #1282 +* [FEATURE] Add PodMonitor for kube-proxy. #1230 +* [FEATURE] Turn AWS VPC CNI into a control plane add-on. #1307 +* [ENHANCEMENT] Export anti-affinity addon. #1114 +* [ENHANCEMENT] Allow changing configmap-reloader, grafana, and kube-rbac-proxy images in $.values.common.images. #1123 #1124 #1125 +* [ENHANCEMENT] Add automated version upgrader. #1166 +* [ENHANCEMENT] Improve all-namespace addon. #1131 +* [ENHANCEMENT] Add example of running without grafana deployment. #1201 +* [ENHANCEMENT] Import managed-cluster addon for the EKS platform. #1205 +* [ENHANCEMENT] Automatically update jsonnet dependencies. #1220 +* [ENHANCEMENT] Adapt kube-prometheus to changes to ovn veth interfaces names. #1224 +* [ENHANCEMENT] Add example release-0.3 to release-0.8 migration to docs. #1235 +* [ENHANCEMENT] Consolidate intervals used in prometheus-adapter CPU queries. #1231 +* [ENHANCEMENT] Create dashboardDefinitions if rawDashboards or folderDashboards are specified. #1255 +* [ENHANCEMENT] Relabel instance with node name for CNI DaemonSet on EKS. #1259 +* [ENHANCEMENT] Update doc on Prometheus rule updates since release 0.8. #1253 +* [ENHANCEMENT] Point runbooks to https://runbooks.prometheus-operator.dev. #1267 +* [ENHANCEMENT] Allow setting of kubeRbacProxyMainResources in kube-state-metrics. #1257 +* [ENHANCEMENT] Automate release branch updates. #1293 #1303 +* [ENHANCEMENT] Create Thanos Sidecar rules separately from Prometheus ones. #1308 +* [ENHANCEMENT] Allow using newer jsonnet-bundler dependency resolution when using windows addon. #1310 +* [ENHANCEMENT] Prometheus ruleSelector defaults to all rules. +* [BUGFIX] Fix kube-state-metrics metric denylist regex pattern. #1146 +* [BUGFIX] Fix missing resource config in blackbox exporter. #1148 +* [BUGFIX] Fix adding private repository. #1169 +* [BUGFIX] Fix kops selectors for scheduler, controllerManager and kube-dns. #1164 +* [BUGFIX] Fix scheduler and controller selectors for Kubespray. #1142 +* [BUGFIX] Fix label selector for coredns ServiceMonitor. #1200 +* [BUGFIX] Fix name for blackbox-exporter PodSecurityPolicy. #1213 +* [BUGFIX] Fix ingress path rules for networking.k8s.io/v1. #1212 +* [BUGFIX] Disable insecure cypher suites for prometheus-adapter. #1216 +* [BUGFIX] Fix CNI metrics relabelings on EKS. #1277 +* [BUGFIX] Fix node-exporter ignore list for OVN. #1283 +* [BUGFIX] Revert back to awscni_total_ip_addresses-based alert on EKS. #1292 +* [BUGFIX] Allow passing `thanos: {}` to prometheus configuration. #1325 diff --git a/Makefile b/Makefile index 9411829e..e2c265cd 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,15 @@ SHELL=/bin/bash -o pipefail -export GO111MODULE=on - BIN_DIR?=$(shell pwd)/tmp/bin EMBEDMD_BIN=$(BIN_DIR)/embedmd JB_BIN=$(BIN_DIR)/jb GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml JSONNET_BIN=$(BIN_DIR)/jsonnet +JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt -TOOLING=$(EMBEDMD_BIN) $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) +KUBECONFORM_BIN=$(BIN_DIR)/kubeconform +TOOLING=$(EMBEDMD_BIN) $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN) $(JSONNETLINT_BIN) $(JSONNETFMT_BIN) $(KUBECONFORM_BIN) JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s @@ -26,22 +26,47 @@ generate: manifests **.md **.md: $(EMBEDMD_BIN) $(shell find examples) build.sh example.jsonnet $(EMBEDMD_BIN) -w `find . -name "*.md" | grep -v vendor` -manifests: examples/kustomize.jsonnet $(GOJSONTOYAML_BIN) vendor build.sh +manifests: examples/kustomize.jsonnet $(GOJSONTOYAML_BIN) vendor ./build.sh $< vendor: $(JB_BIN) jsonnetfile.json jsonnetfile.lock.json rm -rf vendor $(JB_BIN) install +crdschemas: vendor + ./scripts/generate-schemas.sh + +.PHONY: update +update: $(JB_BIN) + $(JB_BIN) update + +.PHONY: validate +validate: validate-1.21 validate-1.22 + +validate-1.21: + KUBE_VERSION=1.21.1 $(MAKE) kubeconform + +validate-1.22: + KUBE_VERSION=1.22.0 $(MAKE) kubeconform + +.PHONY: kubeconform +kubeconform: crdschemas manifests $(KUBECONFORM_BIN) + $(KUBECONFORM_BIN) -kubernetes-version $(KUBE_VERSION) -schema-location 'default' -schema-location 'crdschemas/{{ .ResourceKind }}.json' -skip CustomResourceDefinition manifests/ + .PHONY: fmt fmt: $(JSONNETFMT_BIN) find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i +.PHONY: lint +lint: $(JSONNETLINT_BIN) vendor + find jsonnet/ -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor + .PHONY: test test: $(JB_BIN) $(JB_BIN) install - ./test.sh + ./scripts/test.sh .PHONY: test-e2e test-e2e: @@ -52,4 +77,4 @@ $(BIN_DIR): $(TOOLING): $(BIN_DIR) @echo Installing tools from scripts/tools.go - @cat scripts/tools.go | grep _ | awk -F'"' '{print $$2}' | GOBIN=$(BIN_DIR) xargs -tI % go install % + @cd scripts && cat tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go build -modfile=go.mod -o $(BIN_DIR) % diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 23a0ada2..00000000 --- a/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -CoreOS Project -Copyright 2018 CoreOS, Inc - -This product includes software developed at CoreOS, Inc. -(http://www.coreos.com/). diff --git a/OWNERS b/OWNERS deleted file mode 100644 index a4d52133..00000000 --- a/OWNERS +++ /dev/null @@ -1,14 +0,0 @@ -reviewers: - - brancz - - metalmatze - - mxinden - - s-urbaniak - - squat - - paulfantom -approvers: - - brancz - - metalmatze - - mxinden - - s-urbaniak - - squat - - paulfantom diff --git a/README.md b/README.md index e0ded79c..a130aaca 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # kube-prometheus +[![Build Status](https://github.com/prometheus-operator/kube-prometheus/workflows/ci/badge.svg)](https://github.com/prometheus-operator/kube-prometheus/actions) +[![Slack](https://img.shields.io/badge/join%20slack-%23prometheus--operator-brightgreen.svg)](http://slack.k8s.io/) +[![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/prometheus-operator/kube-prometheus) + > Note that everything is experimental and may change significantly at any time. This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) using the Prometheus Operator. @@ -18,9 +22,14 @@ Components included in this package: This stack is meant for cluster monitoring, so it is pre-configured to collect metrics from all Kubernetes components. In addition to that it delivers a default set of dashboards and alerting rules. Many of the useful dashboards and alerts come from the [kubernetes-mixin project](https://github.com/kubernetes-monitoring/kubernetes-mixin), similar to this project it provides composable jsonnet as a library for users to customize to their needs. +## Warning + +If you are migrating from `release-0.7` branch or earlier please read [what changed and how to migrate in our guide](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/migration-guide.md). + ## Table of contents - [kube-prometheus](#kube-prometheus) + - [Warning](#warning) - [Table of contents](#table-of-contents) - [Prerequisites](#prerequisites) - [minikube](#minikube) @@ -53,13 +62,17 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m - [Stripping container resource limits](#stripping-container-resource-limits) - [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) - [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) + - [Setting up a blackbox exporter](#setting-up-a-blackbox-exporter) - [Minikube Example](#minikube-example) + - [Continuous Delivery](#continuous-delivery) - [Troubleshooting](#troubleshooting) - [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics) - [Authentication problem](#authentication-problem) - [Authorization problem](#authorization-problem) - [kube-state-metrics resource usage](#kube-state-metrics-resource-usage) + - [Error retrieving kube-proxy metrics](#error-retrieving-kube-proxy-metrics) - [Contributing](#contributing) + - [License](#license) ## Prerequisites @@ -78,7 +91,7 @@ This adapter is an Extension API Server and Kubernetes needs to be have this fea To try out this stack, start [minikube](https://github.com/kubernetes/minikube) with the following command: ```shell -$ minikube delete && minikube start --kubernetes-version=v1.19.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +$ minikube delete && minikube start --kubernetes-version=v1.20.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 ``` The kube-prometheus stack includes a resource metrics API server, so the metrics-server addon is not necessary. Ensure the metrics-server addon is disabled on minikube: @@ -93,19 +106,17 @@ $ minikube addons disable metrics-server The following versions are supported and work as we test against these versions in their respective branches. But note that other versions might work! -| kube-prometheus stack | Kubernetes 1.14 | Kubernetes 1.15 | Kubernetes 1.16 | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | -|-----------------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| `release-0.3` | βœ” | βœ” | βœ” | βœ” | βœ— | βœ— | -| `release-0.4` | βœ— | βœ— | βœ” (v1.16.5+) | βœ” | βœ— | βœ— | -| `release-0.5` | βœ— | βœ— | βœ— | βœ— | βœ” | βœ— | -| `release-0.6` | βœ— | βœ— | βœ— | βœ— | βœ” | βœ” | -| `HEAD` | βœ— | βœ— | βœ— | βœ— | x | βœ” | - -Note: Due to [two](https://github.com/kubernetes/kubernetes/issues/83778) [bugs](https://github.com/kubernetes/kubernetes/issues/86359) in Kubernetes v1.16.1, and prior to Kubernetes v1.16.5 the kube-prometheus release-0.4 branch only supports v1.16.5 and higher. The `extension-apiserver-authentication-reader` role in the kube-system namespace can be manually edited to include list and watch permissions in order to workaround the second issue with Kubernetes v1.16.2 through v1.16.4. +| kube-prometheus stack | Kubernetes 1.18 | Kubernetes 1.19 | Kubernetes 1.20 | Kubernetes 1.21 | Kubernetes 1.22 | +|------------------------------------------------------------------------------------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| [`release-0.6`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.6) | βœ— | βœ” | βœ— | βœ— | βœ— | +| [`release-0.7`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.7) | βœ— | βœ” | βœ” | βœ— | βœ— | +| [`release-0.8`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.8) | βœ— | βœ— | βœ” | βœ” | βœ— | +| [`release-0.9`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.9) | βœ— | βœ— | βœ— | βœ” | βœ” | +| [`HEAD`](https://github.com/prometheus-operator/kube-prometheus/tree/main) | βœ— | βœ— | βœ— | βœ” | βœ” | ## Quickstart ->Note: For versions before Kubernetes v1.19.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. +>Note: For versions before Kubernetes v1.21.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). @@ -113,7 +124,7 @@ Though for a quickstart a compiled version of the Kubernetes [manifests](manifes * Create the monitoring stack using the config in the `manifests` directory: ```shell -# Create the namespace and CRDs, and then wait for them to be availble before creating the remaining resources +# Create the namespace and CRDs, and then wait for them to be available before creating the remaining resources kubectl create -f manifests/setup until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done kubectl create -f manifests/ @@ -174,12 +185,15 @@ Install this library in your own project with [jsonnet-bundler](https://github.c $ mkdir my-kube-prometheus; cd my-kube-prometheus $ jb init # Creates the initial/empty `jsonnetfile.json` # Install the kube-prometheus dependency -$ jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.4 # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` +$ jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.7 # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` + +$ wget https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.7/example.jsonnet -O example.jsonnet +$ wget https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.7/build.sh -O build.sh ``` > `jb` can be installed with `go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb` -> An e.g. of how to install a given version of this library: `jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.4` +> An e.g. of how to install a given version of this library: `jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.7` In order to update the kube-prometheus dependency, simply use the jsonnet-bundler update functionality: ```shell @@ -190,7 +204,7 @@ $ jb update e.g. of how to compile the manifests: `./build.sh example.jsonnet` -> before compiling, install `gojsontoyaml` tool with `go get github.com/brancz/gojsontoyaml` +> before compiling, install `gojsontoyaml` tool with `go get github.com/brancz/gojsontoyaml` and `jsonnet` with `go get github.com/google/go-jsonnet/cmd/jsonnet` Here's [example.jsonnet](example.jsonnet): @@ -199,33 +213,39 @@ Here's [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) ```jsonnet local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): @@ -266,7 +286,7 @@ The previous steps (compilation) has created a bunch of manifest files in the ma Now simply use `kubectl` to install Prometheus and Grafana as per your configuration: ```shell -# Update the namespace and CRDs, and then wait for them to be availble before creating the remaining resources +# Update the namespace and CRDs, and then wait for them to be available before creating the remaining resources $ kubectl apply -f manifests/setup $ kubectl apply -f manifests/ ``` @@ -308,74 +328,22 @@ Once updated, just follow the instructions under "Compiling" and "Apply the kube Jsonnet has the concept of hidden fields. These are fields, that are not going to be rendered in a result. This is used to configure the kube-prometheus components in jsonnet. In the example jsonnet code of the above [Customizing Kube-Prometheus section](#customizing-kube-prometheus), you can see an example of this, where the `namespace` is being configured to be `monitoring`. In order to not override the whole object, use the `+::` construct of jsonnet, to merge objects, this way you can override individual settings, but retain all other settings and defaults. -These are the available fields with their respective default values: +The available fields and their default values can be seen in [main.libsonnet](jsonnet/kube-prometheus/main.libsonnet). Note that many of the fields get their default values from variables, and for example the version numbers are imported from [versions.json](jsonnet/kube-prometheus/versions.json). + +Configuration is mainly done in the `values` map. You can see this being used in the `example.jsonnet` to set the namespace to `monitoring`. This is done in the `common` field, which all other components take their default value from. See for example how Alertmanager is configured in `main.libsonnet`: + ``` -{ - _config+:: { - namespace: "default", - - versions+:: { - alertmanager: "v0.17.0", - nodeExporter: "v0.18.1", - kubeStateMetrics: "v1.5.0", - kubeRbacProxy: "v0.4.1", - prometheusOperator: "v0.30.0", - prometheus: "v2.10.0", - }, - - imageRepos+:: { - prometheus: "quay.io/prometheus/prometheus", - alertmanager: "quay.io/prometheus/alertmanager", - kubeStateMetrics: "quay.io/coreos/kube-state-metrics", - kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - nodeExporter: "quay.io/prometheus/node-exporter", - prometheusOperator: "quay.io/prometheus-operator/prometheus-operator", - }, - - prometheus+:: { - names: 'k8s', - replicas: 2, - rules: {}, - }, - - alertmanager+:: { + alertmanager: { name: 'main', - config: ||| - global: - resolve_timeout: 5m - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'null' - routes: - - match: - alertname: Watchdog - receiver: 'null' - receivers: - - name: 'null' - |||, - replicas: 3, + // Use the namespace specified under values.common by default. + namespace: $.values.common.namespace, + version: $.values.common.versions.alertmanager, + image: $.values.common.images.alertmanager, + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, - - kubeStateMetrics+:: { - collectors: '', // empty string gets a default set - scrapeInterval: '30s', - scrapeTimeout: '30s', - - baseCPU: '100m', - baseMemory: '150Mi', - }, - - nodeExporter+:: { - port: 9100, - }, - }, -} ``` -The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same top level `_config` field. For example to allow anonymous access to grafana, add the following `_config` section: +The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same top level `values` field. For example to allow anonymous access to grafana, add the following `values` section: ``` grafana+:: { config: { // http://docs.grafana.org/installation/configuration/ @@ -392,57 +360,28 @@ Jsonnet is a turing complete language, any logic can be reflected in it. It also ### Cluster Creation Tools -A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet), [bootkube](examples/jsonnet-snippets/bootkube.jsonnet), [kops](examples/jsonnet-snippets/kops.jsonnet) and [kubespray](examples/jsonnet-snippets/kubespray.jsonnet) clusters there are mixins available to easily configure these: +A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For the following clusters there are mixins available to easily configure them: -kubeadm: +* aws +* bootkube +* eks +* gke +* kops-coredns +* kubeadm +* kubespray -[embedmd]:# (examples/jsonnet-snippets/kubeadm.jsonnet) +These mixins are selectable via the `platform` field of kubePrometheus: + +[embedmd]:# (examples/jsonnet-snippets/platform.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') -``` - -bootkube: - -[embedmd]:# (examples/jsonnet-snippets/bootkube.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') -``` - -kops: - -[embedmd]:# (examples/jsonnet-snippets/kops.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') -``` - -kops with CoreDNS: - -If your kops cluster is using CoreDNS, there is an additional mixin to import. - -[embedmd]:# (examples/jsonnet-snippets/kops-coredns.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') -``` - -kubespray: - -[embedmd]:# (examples/jsonnet-snippets/kubespray.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') -``` - -kube-aws: - -[embedmd]:# (examples/jsonnet-snippets/kube-aws.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +{ + values+:: { + common+: { + platform: 'example-platform', + }, + }, +} ``` ### Internal Registry @@ -468,10 +407,12 @@ Then to generate manifests with `internal-registry.com/organization`, use the `w [embedmd]:# (examples/internal-registry.jsonnet) ```jsonnet -local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local mixin = import 'kube-prometheus/addons/config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, } + mixin.withImageRepository('internal-registry.com/organization'); @@ -490,8 +431,8 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of [embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +(import 'kube-prometheus/addons/node-ports.libsonnet') ``` ### Prometheus Object Name @@ -500,7 +441,7 @@ To give another customization example, the name of the `Prometheus` object provi [embedmd]:# (examples/prometheus-name-override.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { prometheus+: { prometheus+: { metadata+: { @@ -517,7 +458,7 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github [embedmd]:# (examples/ksonnet-example.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { nodeExporter+: { daemonset+: { metadata+: { @@ -530,12 +471,12 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github ### Alertmanager configuration -The Alertmanager configuration is located in the `_config.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field. +The Alertmanager configuration is located in the `values.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field. [embedmd]:# (examples/alertmanager-config.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: ||| global: @@ -562,8 +503,8 @@ In the above example the configuration has been inlined, but can just as well be [embedmd]:# (examples/alertmanager-config-external.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: importstr 'alertmanager-config.yaml', }, @@ -573,15 +514,17 @@ In the above example the configuration has been inlined, but can just as well be ### Adding additional namespaces to monitor -In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$._config.namespace`. This is specified in `$._config.prometheus.namespaces`, to add new namespaces to monitor, simply append the additional namespaces: +In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$.values.namespace`. This is specified in `$.values.prometheus.namespaces`, to add new namespaces to monitor, simply append the additional namespaces: [embedmd]:# (examples/additional-namespaces.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, - prometheus+:: { + prometheus+: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, @@ -606,14 +549,16 @@ You can define ServiceMonitor resources in your `jsonnet` spec. See the snippet [embedmd]:# (examples/additional-namespaces-servicemonitor.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, prometheus+:: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, - prometheus+:: { + exampleApplication: { serviceMonitorMyNamespace: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -645,7 +590,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` > NOTE: make sure your service resources have the right labels (eg. `'app': 'myapp'`) applied. Prometheus uses kubernetes labels to discover resources inside the namespaces. @@ -656,12 +602,13 @@ In case you want to monitor all namespaces in a cluster, you can add the followi [embedmd]:# (examples/all-namespaces.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-all-namespaces.libsonnet') + { - _config+:: { - namespace: 'monitoring', - - prometheus+:: { +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { namespaces: [], }, }, @@ -689,11 +636,26 @@ In order to configure a static etcd cluster to scrape there is a simple [kube-pr ### Pod Anti-Affinity To prevent `Prometheus` and `Alertmanager` instances from being deployed onto the same node when -possible, one can include the [kube-prometheus-anti-affinity.libsonnet](jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet) mixin: +possible, one can include the [kube-prometheus-anti-affinity.libsonnet](jsonnet/kube-prometheus/addons/anti-affinity.libsonnet) mixin: +[embedmd]:# (examples/anti-affinity.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/anti-affinity.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` ### Stripping container resource limits @@ -703,10 +665,12 @@ To do that, one can import the following mixin [embedmd]:# (examples/strip-limits.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-strip-limits.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/strip-limits.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; @@ -727,6 +691,36 @@ See [developing Prometheus rules and Grafana dashboards](docs/developing-prometh See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertmanager-grafana-ingress.md) guide. +### Setting up a blackbox exporter + +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + // ... all necessary mixins ... + { + values+:: { + // ... configuration for other features ... + blackboxExporter+:: { + modules+:: { + tls_connect: { + prober: 'tcp', + tcp: { + tls: true + } + } + } + } + } + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +// ... other rendering blocks ... +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +``` + +Then describe the actual blackbox checks you want to run using `Probe` resources. Specify `blackbox-exporter..svc.cluster.local:9115` as the `spec.prober.url` field of the `Probe` resource. + +See the [blackbox exporter guide](docs/blackbox-exporter.md) for the list of configurable options and a complete example. + ## Minikube Example To use an easy to reproduce example, see [minikube.jsonnet](examples/minikube.jsonnet), which uses the minikube setup as demonstrated in [Prerequisites](#prerequisites). Because we would like easy access to our Prometheus, Alertmanager and Grafana UIs, `minikube.jsonnet` exposes the services as NodePort type services. @@ -737,6 +731,8 @@ Working examples of use with continuous delivery tools are found in examples/con ## Troubleshooting +See the general [guidelines](docs/community-support.md) for getting support from the community. + ### Error retrieving kubelet metrics Should the Prometheus `/targets` page show kubelet targets, but not able to successfully scrape the metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets. @@ -762,7 +758,7 @@ resources. One driver for more resource needs, is a high number of namespaces. There may be others. kube-state-metrics resource allocation is managed by -[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) +[addon-resizer](https://github.com/kubernetes/autoscaler/tree/main/addon-resizer/nanny) You can control it's parameters by setting variables in the config. They default to: @@ -775,6 +771,13 @@ config. They default to: } ``` +### Error retrieving kube-proxy metrics +By default, kubeadm will configure kube-proxy to listen on 127.0.0.1 for metrics. Because of this prometheus would not be able to scrape these metrics. This would have to be changed to 0.0.0.0 in one of the following two places: + +1. Before cluster initialization, the config file passed to kubeadm init should have KubeProxyConfiguration manifest with the field metricsBindAddress set to 0.0.0.0:10249 +2. If the k8s cluster is already up and running, we'll have to modify the configmap kube-proxy in the namespace kube-system and set the metricsBindAddress field. After this kube-proxy daemonset would have to be restarted with +`kubectl -n kube-system rollout restart daemonset kube-proxy` + ## Contributing All `.yaml` files in the `/manifests` folder are generated via @@ -787,3 +790,7 @@ the following process: 3. Update the pinned kube-prometheus dependency in `jsonnetfile.lock.json`: `jb update` 3. Generate dependent `*.yaml` files: `make generate` 4. Commit the generated changes. + +## License + +Apache License 2.0, see [LICENSE](https://github.com/prometheus-operator/kube-prometheus/blob/main/LICENSE). diff --git a/code-of-conduct.md b/code-of-conduct.md index a234f360..d1adc780 100644 --- a/code-of-conduct.md +++ b/code-of-conduct.md @@ -1,4 +1,4 @@ -## CoreOS Community Code of Conduct +## Community Code of Conduct ### Contributor Code of Conduct @@ -33,29 +33,9 @@ This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting a project maintainer, Brandon Philips -, and/or Rithu John . +reported by contacting a project maintainer listed in +https://github.com/prometheus-operator/prometheus-operator/blob/master/MAINTAINERS.md. This Code of Conduct is adapted from the Contributor Covenant (http://contributor-covenant.org), version 1.2.0, available at http://contributor-covenant.org/version/1/2/0/ - -### CoreOS Events Code of Conduct - -CoreOS events are working conferences intended for professional networking and -collaboration in the CoreOS community. Attendees are expected to behave -according to professional standards and in accordance with their employer’s -policies on appropriate workplace behavior. - -While at CoreOS events or related social networking opportunities, attendees -should not engage in discriminatory or offensive speech or actions including -but not limited to gender, sexuality, race, age, disability, or religion. -Speakers should be especially aware of these concerns. - -CoreOS does not condone any statements by speakers contrary to these standards. -CoreOS reserves the right to deny entrance and/or eject from an event (without -refund) any individual found to be engaging in discriminatory or offensive -speech or actions. - -Please bring any concerns to the immediate attention of designated on-site -staff, Brandon Philips , and/or Rithu John . diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md index fb559d78..1cd8b146 100644 --- a/docs/EKS-cni-support.md +++ b/docs/EKS-cni-support.md @@ -7,23 +7,31 @@ One fatal issue that can occur is that you run out of IP addresses in your eks c You can monitor the `awscni` using kube-promethus with : [embedmd]:# (../examples/eks-cni-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + kubePrometheus+: { + platform: 'eks', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + kubernetesControlPlane+: { + prometheusRuleEksCNI+: { + spec+: { + groups+: [ { - record: 'aws_eks_available_ip', - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + }, + ], }, ], }, - ], + }, }, }; diff --git a/docs/blackbox-exporter.md b/docs/blackbox-exporter.md new file mode 100644 index 00000000..e6a52725 --- /dev/null +++ b/docs/blackbox-exporter.md @@ -0,0 +1,97 @@ +--- +title: "Blackbox Exporter" +description: "Generated API docs for the Prometheus Operator" +lead: "This Document documents the types introduced by the Prometheus Operator to be consumed by users." +date: 2021-03-08T08:49:31+00:00 +lastmod: 2021-03-08T08:49:31+00:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 630 +toc: true +--- + +# Setting up a blackbox exporter + +The `prometheus-operator` defines a `Probe` resource type that can be used to describe blackbox checks. To execute these, a separate component called [`blackbox_exporter`](https://github.com/prometheus/blackbox_exporter) has to be deployed, which can be scraped to retrieve the results of these checks. You can use `kube-prometheus` to set up such a blackbox exporter within your Kubernetes cluster. + +## Adding blackbox exporter manifests to an existing `kube-prometheus` configuration + +1. Override blackbox-related configuration parameters as needed. +2. Add the following to the list of renderers to render the blackbox exporter manifests: +``` +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +``` + +## Configuration parameters influencing the blackbox exporter + +* `_config.namespace`: the namespace where the various generated resources (`ConfigMap`, `Deployment`, `Service`, `ServiceAccount` and `ServiceMonitor`) will reside. This does not affect where you can place `Probe` objects; that is determined by the configuration of the `Prometheus` resource. This option is shared with other `kube-prometheus` components; defaults to `default`. +* `_config.imageRepos.blackboxExporter`: the name of the blackbox exporter image to deploy. Defaults to `quay.io/prometheus/blackbox-exporter`. +* `_config.versions.blackboxExporter`: the tag of the blackbox exporter image to deploy. Defaults to the version `kube-prometheus` was tested with. +* `_config.imageRepos.configmapReloader`: the name of the ConfigMap reloader image to deploy. Defaults to `jimmidyson/configmap-reload`. +* `_config.versions.configmapReloader`: the tag of the ConfigMap reloader image to deploy. Defaults to the version `kube-prometheus` was tested with. +* `_config.resources.blackbox-exporter.requests`: the requested resources; this is used for each container. Defaults to `10m` CPU and `20Mi` RAM. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for details. +* `_config.resources.blackbox-exporter.limits`: the resource limits; this is used for each container. Defaults to `20m` CPU and `40Mi` RAM. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for details. +* `_config.blackboxExporter.port`: the exposed HTTPS port of the exporter. This is what Prometheus can scrape for metrics related to the blackbox exporter itself. Defaults to `9115`. +* `_config.blackboxExporter.internalPort`: the internal plaintext port of the exporter. Prometheus scrapes configured via `Probe` objects cannot access the HTTPS port right now, so you have to specify this port in the `url` field. Defaults to `19115`. +* `_config.blackboxExporter.replicas`: the number of exporter replicas to be deployed. Defaults to `1`. +* `_config.blackboxExporter.matchLabels`: map of the labels to be used to select resources belonging to the instance deployed. Defaults to `{ 'app.kubernetes.io/name': 'blackbox-exporter' }` +* `_config.blackboxExporter.assignLabels`: map of the labels applied to components of the instance deployed. Defaults to all the labels included in the `matchLabels` option, and additionally `app.kubernetes.io/version` is set to the version of the blackbox exporter. +* `_config.blackboxExporter.modules`: the modules available in the blackbox exporter installation, i.e. the types of checks it can perform. The default value includes most of the modules defined in the default blackbox exporter configuration: `http_2xx`, `http_post_2xx`, `tcp_connect`, `pop3s_banner`, `ssh_banner`, and `irc_banner`. `icmp` is omitted so the exporter can be run with minimum privileges, but you can add it back if needed - see the example below. See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md for the configuration format, except you have to use JSON instead of YAML here. +* `_config.blackboxExporter.privileged`: whether the `blackbox-exporter` container should be running as non-root (`false`) or root with heavily-restricted capability set (`true`). Defaults to `true` if you have any ICMP modules defined (which need the extra permissions) and `false` otherwise. + +## Complete example + +```jsonnet +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + blackboxExporter+:: { + modules+:: { + icmp: { + prober: 'icmp', + }, + }, + }, + }, + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor is separated so that it can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +After installing the generated manifests, you can create `Probe` resources, for example: + +```yaml +kind: Probe +apiVersion: monitoring.coreos.com/v1 +metadata: + name: example-com-website + namespace: monitoring +spec: + interval: 60s + module: http_2xx + prober: + url: blackbox-exporter.monitoring.svc.cluster.local:19115 + targets: + staticConfig: + static: + - http://example.com + - https://example.com +``` diff --git a/docs/community-support.md b/docs/community-support.md new file mode 100644 index 00000000..218eaa74 --- /dev/null +++ b/docs/community-support.md @@ -0,0 +1,84 @@ +# Community support + +For bugs, you can use the GitHub [issue tracker](https://github.com/prometheus-operator/kube-prometheus/issues/new/choose). + +For questions, you can use the GitHub [discussions forum](https://github.com/prometheus-operator/kube-prometheus/discussions). + +Many of the `kube-prometheus` project's contributors and users can also be found on the #prometheus-operator channel of the [Kubernetes Slack][Kubernetes Slack]. + +`kube-prometheus` is the aggregation of many projects that all have different +channels to reach out for help and support. This community strives at +supporting all users and you should never be afraid of asking us first. However +if your request relates specifically to one of the projects listed below, it is +often more efficient to reach out to the project directly. If you are unsure, +please feel free to open an issue in this repository and we will redirect you +if applicable. + +## prometheus-operator + +For documentation, check the project's [documentation directory](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation). + +For questions, use the #prometheus-operator channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/prometheus-operator/prometheus-operator/issues/new/choose). + +## Prometheus, Alertmanager, node_exporter + +For documentation, check the Prometheus [online docs](https://prometheus.io/docs/). There is a +[section](https://prometheus.io/docs/introduction/media/) with links to blog +posts, recorded talks and presentations. This [repository](https://github.com/roaldnefs/awesome-prometheus) +(not affiliated to the Prometheus project) has also a list of curated resources +related to the Prometheus ecosystem. + +For questions, see the Prometheus [community page](https://prometheus.io/community/) for the various channels. + +There is also a #prometheus channel on the [CNCF Slack][CNCF Slack]. + +## kube-state-metrics + +For documentation, see the project's [docs directory](https://github.com/kubernetes/kube-state-metrics/tree/master/docs). + +For questions, use the #kube-state-metrics channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kube-state-metrics/issues/new/choose). + +## Kubernetes + +For documentation, check the [Kubernetes docs](https://kubernetes.io/docs/home/). + +For questions, use the [community forums](https://discuss.kubernetes.io/) and the [Kubernetes Slack][Kubernetes Slack]. Check also the [community page](https://kubernetes.io/community/#discuss). + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kubernetes/issues/new/choose). + +## Prometheus adapter + +For documentation, check the project's [README](https://github.com/DirectXMan12/k8s-prometheus-adapter/blob/master/README.md). + +For questions, use the #sig-instrumentation channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/DirectXMan12/k8s-prometheus-adapter/issues/new). + +## Grafana + +For documentation, check the [Grafana docs](https://grafana.com/docs/grafana/latest/). + +For questions, use the [community forums](https://community.grafana.com/). + +For bugs, use the GitHub [issue tracker](https://github.com/grafana/grafana/issues/new/choose). + +## kubernetes-mixin + +For documentation, check the project's [README](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/README.md). + +For questions, use #monitoring-mixins channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/new). + +## Jsonnet + +For documentation, check the [Jsonnet](https://jsonnet.org/) website. + +For questions, use the [mailing list](https://groups.google.com/forum/#!forum/jsonnet). + +[Kubernetes Slack]: https://slack.k8s.io/ +[CNCF Slack]: https://slack.cncf.io/ diff --git a/docs/deploy-kind.md b/docs/deploy-kind.md new file mode 100644 index 00000000..ea66c59a --- /dev/null +++ b/docs/deploy-kind.md @@ -0,0 +1,19 @@ +--- +title: "Deploy to kind" +description: "Deploy kube-prometheus to Kubernets kind." +lead: "Deploy kube-prometheus to Kubernets kind." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- + +--- + +Time to explain how! + +Your chance of [**contributing**](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/deploy-kind.md)! diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index a0a41c5e..6aa853cd 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -1,4 +1,16 @@ -# Developing Prometheus Rules and Grafana Dashboards +--- +title: "Prometheus Rules and Grafana Dashboards" +description: "Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus" +lead: "Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus" +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 650 +toc: true +--- `kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them, the purpose of this document is to explain how to do this. @@ -11,33 +23,39 @@ As a basis, all examples in this guide are based on the base example of the kube [embedmd]:# (../example.jsonnet) ```jsonnet local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` ## Prometheus rules @@ -52,28 +70,40 @@ The format is exactly the Prometheus format, so there should be no changes neces [embedmd]:# (../examples/prometheus-additional-alert-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - alert: 'Watchdog', - expr: 'vector(1)', - labels: { - severity: 'none', - }, - annotations: { - description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', - }, + name: 'example-group', + rules: [ + { + alert: 'ExampleAlert', + expr: 'vector(1)', + labels: { + severity: 'warning', + }, + annotations: { + description: 'This is an example alert.', + }, + }, + ], }, ], }, - ], + }, }, }; @@ -84,7 +114,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Recording rules @@ -95,22 +126,34 @@ In order to add a recording rule, simply do the same with the `prometheusRules` [embedmd]:# (../examples/prometheus-additional-recording-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - record: 'some_recording_rule_name', - expr: 'vector(1)', + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], }, ], }, - ], + }, }, }; @@ -121,7 +164,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Pre-rendered rules @@ -142,12 +186,24 @@ Then import it in jsonnet: [embedmd]:# (../examples/prometheus-additional-rendered-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: (import 'existingrule.json').groups, + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: (import 'existingrule.json').groups, + }, + }, }, }; @@ -158,76 +214,118 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Changing default rules -Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](../jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/alerts](../jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). +Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/main.libsonnet`. The recording rules can be found in [kube-prometheus/components/mixin/rules](../jsonnet/kube-prometheus/components/mixin/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/components/mixin/alerts](../jsonnet/kube-prometheus/components/mixin/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above. #### Filter -Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). You first need to find out in which component the rule is defined (here it is kuberentesControlPlane). ```jsonnet local filter = { - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'kubernetes-apps' then - group { - rules: std.filter(function(rule) - rule.alert != "KubeStatefulSetReplicasMismatch", - group.rules - ) - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRule+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter( + function(rule) + rule.alert != 'KubeStatefulSetReplicasMismatch', + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; ``` + #### Adjustment -Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +Here the expression for another alert in the same component is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). ```jsonnet local update = { - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'kubernetes-apps' then - group { - rules: std.map( - function(rule) - if rule.alert == "KubeStatefulSetReplicasMismatch" then - rule { - expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}" - } - else - rule, - group.rules - ) - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRule+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == 'KubePodCrashLooping' then + rule { + expr: 'rate(kube_pod_container_status_restarts_total{namespace=kube-system,job="kube-state-metrics"}[10m]) * 60 * 5 > 0', + } + else + rule, + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; ``` + Using the example from above about adding in pre-rendered rules, the new local variables can be added in as follows: ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + { - prometheusAlerts+:: (import 'existingrule.json'), +local add = { + exampleApplication:: { + prometheusRule+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'example-application-rules', + namespace: $.values.common.namespace, + }, + spec: (import 'existingrule.json'), + }, + }, }; - -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +local kp = (import 'kube-prometheus/main.libsonnet') + filter + update + add; +local kp = (import 'kube-prometheus/main.libsonnet') + + filter + + update + + add + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, + }; +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } + +{ ['exampleApplication-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ## Dashboards @@ -248,35 +346,37 @@ local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - dashboards+:: { - 'my-dashboard.json': - dashboard.new('My Dashboard') - .addTemplate( - { - current: { - text: 'Prometheus', - value: 'Prometheus', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', }, - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addRow( - row.new() - .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') - .addTarget(prometheus.target('vector(1)'))) - ), + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, }, }, }; @@ -296,16 +396,15 @@ As jsonnet is a superset of json, the jsonnet `import` function can be used to i [embedmd]:# (../examples/grafana-additional-rendered-dashboard-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafanaDashboards+:: { // monitoring-mixin compatibility - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), - }, - grafana+:: { - dashboards+:: { // use this method to import your dashboards to Grafana - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { // use this method to import your dashboards to Grafana + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, }, }, }; @@ -322,13 +421,15 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { In case you have lots of json dashboard exported out from grafana UI the above approach is going to take lots of time to improve performance we can use `rawDashboards` field and provide it's value as json string by using `importstr` [embedmd]:# (../examples/grafana-additional-rendered-dashboard-example-2.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - rawDashboards+:: { - 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + rawDashboards+:: { + 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), + }, }, }, }; @@ -341,3 +442,117 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` + +### Mixins + +Kube-prometheus comes with a couple of default mixins as the Kubernetes-mixin and the Node-exporter mixin, however there [are many more mixins](https://monitoring.mixins.dev/). To use other mixins Kube-prometheus has a jsonnet library for creating a Kubernetes PrometheusRule CRD and Grafana dashboards from a mixin. Below is an example of creating a mixin object that has Prometheus rules and Grafana dashboards: + +```jsonnet +// Import the library function for adding mixins +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); + +// Create your mixin +local myMixin = addMixin({ + name: 'myMixin', + mixin: import 'my-mixin/mixin.libsonnet', +}); +``` + +The myMixin object will have two objects - `prometheusRules` and `grafanaDashboards`. The `grafanaDashboards` object will be needed to be added to the `dashboards` field as in the example below: + +```jsonnet +values+:: { + grafana+:: { + dashboards+:: myMixin.grafanaDashboards +``` + +The `prometheusRules` object is a PrometheusRule Kubernetes CRD and it should be defined as its own jsonnet object. If you define multiple mixins in a single jsonnet object there is a possibility that they will overwrite each others' configuration and there will be unintended effects. Therefore, use the `prometheusRules` object as its own jsonnet object: + +```jsonnet +... +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ 'external-mixins/my-mixin-prometheus-rules': myMixin.prometheusRules } // one object for each mixin +``` + +As mentioned above each mixin is configurable and you would configure the mixin as in the example below: + +```jsonnet +local myMixin = addMixin({ + name: 'myMixin', + mixin: (import 'my-mixin/mixin.libsonnet') + { + _config+:: { + myMixinSelector: 'my-selector', + interval: '30d', // example + }, + }, +}); +``` + +The library has also two optional parameters - the namespace for the `PrometheusRule` CRD and the dashboard folder for the Grafana dashboards. The below example shows how to use both: + +```jsonnet +local myMixin = addMixin({ + name: 'myMixin', + namespace: 'prometheus', // default is monitoring + dashboardFolder: 'Observability', + mixin: (import 'my-mixin/mixin.libsonnet') + { + _config+:: { + myMixinSelector: 'my-selector', + interval: '30d', // example + }, + }, +}); +``` + +The created `prometheusRules` object will have the metadata field `namespace` added and the usage will remain the same. However, the `grafanaDasboards` will be added to the `folderDashboards` field instead of the `dashboards` field as shown in the example below: + +```jsonnet +values+:: { + grafana+:: { + folderDashboards+:: { + Kubernetes: { + ... + }, + Misc: { + 'grafana-home.json': import 'dashboards/misc/grafana-home.json', + }, + } + myMixin.grafanaDashboards +``` + +Full example of including etcd mixin using method described above: + +[embedmd]:# (../examples/mixin-inclusion.jsonnet) +```jsonnet +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); +local etcdMixin = addMixin({ + name: 'etcd', + mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + _config+: {}, // mixin configuration object + }, +}); + +local kp = (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + grafana+: { + // Adding new dashboard to grafana. This will modify grafana configMap with dashboards + dashboards+: etcdMixin.grafanaDashboards, + }, + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +// Rendering prometheusRules object. This is an object compatible with prometheus-operator CRD definition for prometheusRule +{ 'external-mixins/etcd-mixin-prometheus-rules': etcdMixin.prometheusRules } +``` diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index f05ab4ce..be1ba130 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -1,12 +1,24 @@ -# Exposing Prometheus, Alertmanager and Grafana UIs via Ingress +--- +title: "Expose via Ingress" +description: "How to setup a Kubernetes Ingress to expose the Prometheus, Alertmanager and Grafana." +lead: "How to setup a Kubernetes Ingress to expose the Prometheus, Alertmanager and Grafana." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- -In order to access the web interfaces via the Internet [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains, how Kubernetes Ingress can be setup, in order to expose the Prometheus, Alertmanager and Grafana UIs, that are included in the [kube-prometheus](https://github.com/coreos/kube-prometheus) project. +In order to access the web interfaces via the Internet [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains, how Kubernetes Ingress can be setup, in order to expose the Prometheus, Alertmanager and Grafana UIs, that are included in the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project. -Note: before continuing, it is recommended to first get familiar with the [kube-prometheus](https://github.com/coreos/kube-prometheus) stack by itself. +Note: before continuing, it is recommended to first get familiar with the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) stack by itself. ## Prerequisites -Apart from a running Kubernetes cluster with a running [kube-prometheus](https://github.com/coreos/kube-prometheus) stack, a Kubernetes Ingress controller must be installed and functional. This guide was tested with the [nginx-ingress-controller](https://github.com/kubernetes/ingress-nginx). If you wish to reproduce the exact result in as depicted in this guide we recommend using the nginx-ingress-controller. +Apart from a running Kubernetes cluster with a running [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) stack, a Kubernetes Ingress controller must be installed and functional. This guide was tested with the [nginx-ingress-controller](https://github.com/kubernetes/ingress-nginx). If you wish to reproduce the exact result in as depicted in this guide we recommend using the nginx-ingress-controller. ## Setting up Ingress diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index db15c431..37610593 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -1,15 +1,22 @@ -
- +--- +title: "Deploy to kubeadm" +description: "Deploy kube-prometheus to Kubernets kubeadm." +lead: "Deploy kube-prometheus to Kubernets kubeadm." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- -# Kube Prometheus on Kubeadm - -The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the offical way to deploy and manage self-hosted clusters. Kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with Kubeadm. +The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the offical way to deploy and manage self-hosted clusters. kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with kubeadm. This guide assumes you have a basic understanding of how to use the functionality the Prometheus Operator implements. If you haven't yet, we recommend reading through the [getting started guide](https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/getting-started.md) as well as the [alerting guide](https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/alerting.md). -## Kubeadm Pre-requisites +## kubeadm Pre-requisites This guide assumes you have some familiarity with `kubeadm` or at least have deployed a cluster using `kubeadm`. By default, `kubeadm` does not expose two of the services that we will be monitoring. Therefore, in order to get the most out of the `kube-prometheus` package, we need to make some quick tweaks to the Kubernetes cluster. Since we will be monitoring the `kube-controller-manager` and `kube-scheduler`, we must expose them to the cluster. diff --git a/docs/migration-example/my.release-0.3.jsonnet b/docs/migration-example/my.release-0.3.jsonnet new file mode 100644 index 00000000..a6a87818 --- /dev/null +++ b/docs/migration-example/my.release-0.3.jsonnet @@ -0,0 +1,296 @@ +// Has the following customisations +// Custom alert manager config +// Ingresses for the alert manager, prometheus and grafana +// Grafana admin user password +// Custom prometheus rules +// Custom grafana dashboards +// Custom prometheus config - Data retention, memory, etc. +// Node exporter role and role binding so we can use a PSP for the node exporter + + +// External variables +// See https://jsonnet.org/learning/tutorial.html +local cluster_identifier = std.extVar('cluster_identifier'); +local etcd_ip = std.extVar('etcd_ip'); +local etcd_tls_ca = std.extVar('etcd_tls_ca'); +local etcd_tls_cert = std.extVar('etcd_tls_cert'); +local etcd_tls_key = std.extVar('etcd_tls_key'); +local grafana_admin_password = std.extVar('grafana_admin_password'); +local prometheus_data_retention_period = std.extVar('prometheus_data_retention_period'); +local prometheus_request_memory = std.extVar('prometheus_request_memory'); + + +// Derived variables +local alert_manager_host = 'alertmanager.' + cluster_identifier + '.myorg.local'; +local grafana_host = 'grafana.' + cluster_identifier + '.myorg.local'; +local prometheus_host = 'prometheus.' + cluster_identifier + '.myorg.local'; + + +// Imports +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local ingress = k.extensions.v1beta1.ingress; +local ingressRule = ingress.mixin.spec.rulesType; +local ingressRuleHttpPath = ingressRule.mixin.http.pathsType; +local ingressTls = ingress.mixin.spec.tlsType; +local role = k.rbac.v1.role; +local roleBinding = k.rbac.v1.roleBinding; +local roleRulesType = k.rbac.v1.role.rulesType; + + +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + + + { + _config+:: { + // Override namespace + namespace: 'monitoring', + + + // Override alert manager config + // See https://github.com/coreos/kube-prometheus/tree/master/examples/alertmanager-config-external.jsonnet + alertmanager+: { + config: importstr 'alertmanager.yaml', + }, + + // Override etcd config + // See https://github.com/coreos/kube-prometheus/blob/master/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet + // See https://github.com/coreos/kube-prometheus/blob/master/examples/etcd-skip-verify.jsonnet + etcd+:: { + clientCA: etcd_tls_ca, + clientCert: etcd_tls_cert, + clientKey: etcd_tls_key, + ips: [etcd_ip], + }, + + // Override grafana config + // anonymous access + // See http://docs.grafana.org/installation/configuration/ + // See http://docs.grafana.org/auth/overview/#anonymous-authentication + // admin_password + // See http://docs.grafana.org/installation/configuration/#admin-password + grafana+:: { + config: { + sections: { + 'auth.anonymous': { + enabled: true, + }, + security: { + admin_password: grafana_admin_password, + }, + }, + }, + + + }, + }, + + // Additional grafana dashboards + grafanaDashboards+:: { + 'my-specific.json': (import 'my-grafana-dashboard-definitions.json'), + }, + + // Alert manager needs an externalUrl + alertmanager+:: { + alertmanager+: { + spec+: { + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/exposing-prometheus-and-alertmanager.md + externalUrl: 'https://' + alert_manager_host, + }, + }, + }, + + + // Add additional ingresses + // See https://github.com/coreos/kube-prometheus/tree/master/examples/ingress.jsonnet + ingress+:: { + alertmanager: + ingress.new() + + + + ingress.mixin.metadata.withName('alertmanager') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(alert_manager_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('alertmanager-operated') + + + ingressRuleHttpPath.mixin.backend.withServicePort(9093) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(alert_manager_host) + ), + + + grafana: + ingress.new() + + + + ingress.mixin.metadata.withName('grafana') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(grafana_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('grafana') + + + ingressRuleHttpPath.mixin.backend.withServicePort(3000) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(grafana_host) + ), + + + prometheus: + ingress.new() + + + + ingress.mixin.metadata.withName('prometheus') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + + ingressRule.withHost(prometheus_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('prometheus-operated') + + + ingressRuleHttpPath.mixin.backend.withServicePort(9090) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(prometheus_host) + ), + }, + + + // Node exporter PSP role and role binding + // Add a new top level field for this, the "node-exporter" PSP already exists, so not defining here just referencing + // See https://github.com/coreos/prometheus-operator/issues/787 + nodeExporterPSP: { + role: + role.new() + + + + role.mixin.metadata.withName('node-exporter-psp') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRules([ + roleRulesType.new() + + roleRulesType.withApiGroups(['policy']) + + roleRulesType.withResources(['podsecuritypolicies']) + + roleRulesType.withVerbs(['use']) + + roleRulesType.withResourceNames(['node-exporter']), + ]), + + roleBinding: + roleBinding.new() + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + + + roleBinding.mixin.metadata.withName('node-exporter-psp') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + + + roleBinding.mixin.roleRef.withName('node-exporter-psp') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter' }]), + + + }, + + + // Prometheus needs some extra custom config + prometheus+:: { + prometheus+: { + spec+: { + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + externalLabels: { + cluster: cluster_identifier, + }, + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/exposing-prometheus-and-alertmanager.md + externalUrl: 'https://' + prometheus_host, + // Override reuest memory + resources: { + requests: { + memory: prometheus_request_memory, + }, + }, + // Override data retention period + retention: prometheus_data_retention_period, + }, + }, + }, + + + // Additional prometheus rules + // See https://github.com/coreos/kube-prometheus/docs/developing-prometheus-rules-and-grafana-dashboards.md + // cat my-prometheus-rules.yaml | gojsontoyaml -yamltojson | jq . > my-prometheus-rules.json + prometheusRules+:: { + + + groups+: import 'my-prometheus-rules.json', + + + }, + }; + + +// Render +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + + + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + + +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['node-exporter-psp-' + name]: kp.nodeExporterPSP[name] for name in std.objectFields(kp.nodeExporterPSP) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/docs/migration-example/my.release-0.8.jsonnet b/docs/migration-example/my.release-0.8.jsonnet new file mode 100644 index 00000000..368938b2 --- /dev/null +++ b/docs/migration-example/my.release-0.8.jsonnet @@ -0,0 +1,316 @@ +// Has the following customisations +// Custom alert manager config +// Ingresses for the alert manager, prometheus and grafana +// Grafana admin user password +// Custom prometheus rules +// Custom grafana dashboards +// Custom prometheus config - Data retention, memory, etc. +// Node exporter role and role binding so we can use a PSP for the node exporter + +// for help with expected content, see https://github.com/thaum-xyz/ankhmorpork + +// External variables +// See https://jsonnet.org/learning/tutorial.html +local cluster_identifier = std.extVar('cluster_identifier'); +local etcd_ip = std.extVar('etcd_ip'); +local etcd_tls_ca = std.extVar('etcd_tls_ca'); +local etcd_tls_cert = std.extVar('etcd_tls_cert'); +local etcd_tls_key = std.extVar('etcd_tls_key'); +local grafana_admin_password = std.extVar('grafana_admin_password'); +local prometheus_data_retention_period = std.extVar('prometheus_data_retention_period'); +local prometheus_request_memory = std.extVar('prometheus_request_memory'); + + +// Derived variables +local alert_manager_host = 'alertmanager.' + cluster_identifier + '.myorg.local'; +local grafana_host = 'grafana.' + cluster_identifier + '.myorg.local'; +local prometheus_host = 'prometheus.' + cluster_identifier + '.myorg.local'; + + +// ksonnet no longer required + + +local kp = + (import 'kube-prometheus/main.libsonnet') + + // kubeadm now achieved by setting platform value - see 9 lines below + (import 'kube-prometheus/addons/static-etcd.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + // Add kubeadm platform-specific items, + // including kube-contoller-manager and kube-scheduler discovery + kubePrometheus+: { + platform: 'kubeadm', + }, + + // Override alert manager config + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/alertmanager-config-external.jsonnet + alertmanager+: { + config: importstr 'alertmanager.yaml', + }, + + // Override etcd config + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/static-etcd.libsonnet + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/etcd-skip-verify.jsonnet + etcd+:: { + clientCA: etcd_tls_ca, + clientCert: etcd_tls_cert, + clientKey: etcd_tls_key, + ips: [etcd_ip], + }, + + // Override grafana config + // anonymous access + // See http://docs.grafana.org/installation/configuration/ + // See http://docs.grafana.org/auth/overview/#anonymous-authentication + // admin_password + // See http://docs.grafana.org/installation/configuration/#admin-password + grafana+:: { + config: { + sections: { + 'auth.anonymous': { + enabled: true, + }, + security: { + admin_password: grafana_admin_password, + }, + }, + }, + // Additional grafana dashboards + dashboards+:: { + 'my-specific.json': (import 'my-grafana-dashboard-definitions.json'), + }, + }, + }, + + + // Alert manager needs an externalUrl + alertmanager+:: { + alertmanager+: { + spec+: { + + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/exposing-prometheus-alertmanager-grafana-ingress.md + externalUrl: 'https://' + alert_manager_host, + }, + }, + }, + + + // Add additional ingresses + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/ingress.jsonnet + ingress+:: { + alertmanager: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'alertmanager', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: alert_manager_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'alertmanager-operated', + port: { + number: 9093, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [alert_manager_host], + }], + }, + }, + grafana: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: grafana_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'grafana', + port: { + number: 3000, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [grafana_host], + }], + }, + }, + prometheus: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'prometheus', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: prometheus_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'prometheus-operated', + port: { + number: 9090, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [prometheus_host], + }], + }, + }, + }, + + + // Node exporter PSP role and role binding + nodeExporter+: { + 'psp-role'+: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'node-exporter-psp', + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['node-exporter'], + }], + }, + 'psp-rolebinding'+: { + + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'node-exporter-psp', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + name: 'node-exporter-psp', + kind: 'Role', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'node-exporter', + }], + }, + }, + + // Prometheus needs some extra custom config + prometheus+:: { + prometheus+: { + spec+: { + + externalLabels: { + cluster: cluster_identifier, + }, + + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/exposing-prometheus-alertmanager-grafana-ingress.md + externalUrl: 'https://' + prometheus_host, + // Override reuest memory + resources: { + requests: { + memory: prometheus_request_memory, + }, + }, + // Override data retention period + retention: prometheus_data_retention_period, + }, + }, + }, + + + // Additional prometheus rules + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/developing-prometheus-rules-and-grafana-dashboards.md#pre-rendered-rules + // cat my-prometheus-rules.yaml | gojsontoyaml -yamltojson | jq . > my-prometheus-rules.json + prometheusMe: { + rules: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + prometheus: 'k8s', + role: 'alert-rules', + }, + }, + spec: { + groups: import 'my-prometheus-rules.json', + }, + }, + }, + }; + + +// Render +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ++ { ['prometheus-my-' + name]: kp.prometheusMe[name] for name in std.objectFields(kp.prometheusMe) } diff --git a/docs/migration-example/readme.md b/docs/migration-example/readme.md new file mode 100644 index 00000000..5e9def04 --- /dev/null +++ b/docs/migration-example/readme.md @@ -0,0 +1,250 @@ +## Example of conversion of a legacy my.jsonnet file + +An example conversion of a legacy custom jsonnet file to release-0.8 +format can be seen by viewing and comparing this +[release-0.3 jsonnet file](./my.release-0.3.jsonnet) (when the github +repo was under `https://github.com/coreos/kube-prometheus...`) +and the corresponding [release-0.8 jsonnet file](./my.release-0.8.jsonnet). + +These two files have had necessary blank lines added so that they +can be compared side-by-side and line-by-line on screen. + +The conversion covers both the change of stopping using ksonnet after +release-0.3 and also the major migration after release-0.7 as described in +[migration-guide.md](../migration-guide.md) + +The sample files are intended as an example of format conversion and +not necessarily best practice for the files in release-0.3 or release-0.8. + +Below are three sample extracts of the conversion as an indication of the +changes required. + + + + + + + + + +
release-0.3 release-0.8
+ +```jsonnet +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + + + { + _config+:: { + // Override namespace + namespace: 'monitoring', + + + + + + + +``` + + + +```jsonnet +local kp = + (import 'kube-prometheus/main.libsonnet') + + // kubeadm now achieved by setting platform value - see 9 lines below + (import 'kube-prometheus/addons/static-etcd.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + // Add kubeadm platform-specific items, + // including kube-contoller-manager and kube-scheduler discovery + kubePrometheus+: { + platform: 'kubeadm', + }, +``` + +
+ + + + + + + + + +
release-0.3 release-0.8
+ +```jsonnet + // Add additional ingresses + // See https://github.com/coreos/kube-prometheus/... + // tree/master/examples/ingress.jsonnet + ingress+:: { + alertmanager: + ingress.new() + + + + ingress.mixin.metadata.withName('alertmanager') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(alert_manager_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + + + ingressRuleHttpPath.mixin.backend + .withServiceName('alertmanager-operated') + + ingressRuleHttpPath.mixin.backend.withServicePort(9093) + ), + ) + + // Note we do not need a TLS secretName here as we are going to use the + // nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, + // see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(alert_manager_host) + ), + + +``` + + + +```jsonnet + // Add additional ingresses + // See https://github.com/prometheus-operator/kube-prometheus/... + // blob/main/examples/ingress.jsonnet + ingress+:: { + alertmanager: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'alertmanager', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: alert_manager_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'alertmanager-operated', + port: { + number: 9093, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [alert_manager_host], + }], + }, + }, +``` + +
+ + + + + + + + + +
release-0.3 release-0.8
+ +```jsonnet + // Additional prometheus rules + // See https://github.com/coreos/kube-prometheus/docs/... + // developing-prometheus-rules-and-grafana-dashboards.md + // + // cat my-prometheus-rules.yaml | \ + // gojsontoyaml -yamltojson | \ + // jq . > my-prometheus-rules.json + prometheusRules+:: { + + + + + + + + + + + + + + + groups+: import 'my-prometheus-rules.json', + + + }, + }; + + + + +``` + + + +```jsonnet + // Additional prometheus rules + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/... + // docs/developing-prometheus-rules-and-grafana-dashboards.md... + // #pre-rendered-rules + // cat my-prometheus-rules.yaml | \ + // gojsontoyaml -yamltojson | \ + // jq . > my-prometheus-rules.json + prometheusMe: { + rules: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + prometheus: 'k8s', + role: 'alert-rules', + }, + }, + spec: { + groups: import 'my-prometheus-rules.json', + }, + }, + }, + }; + +... + ++ { ['prometheus-my-' + name]: kp.prometheusMe[name] for name in std.objectFields(kp.prometheusMe) } +``` + +
diff --git a/docs/migration-guide.md b/docs/migration-guide.md new file mode 100644 index 00000000..a33a8b61 --- /dev/null +++ b/docs/migration-guide.md @@ -0,0 +1,87 @@ +# Migration guide from release-0.7 and earlier + +## Why? + +Thanks to our community we identified a lot of short-commings of previous design, varying from issues with global state to UX problems. Hoping to fix at least part of those issues we decided to do a complete refactor of the codebase. + +## Overview + +### Breaking Changes + +- global `_config` object is removed and the new `values` object is a partial replacement +- `imageRepos` field was removed and the project no longer tries to compose image strings. Use `$.values.common.images` to override default images. +- prometheus alerting and recording rules are split into multiple `PrometheusRule` objects +- kubernetes control plane ServiceMonitors and Services are now part of the new `kubernetesControlPlane` top-level object instead of `prometheus` object +- `jsonnet/kube-prometheus/kube-prometheus.libsonnet` file was renamed to `jsonnet/kube-prometheus/main.libsonnet` and slimmed down to bare minimum +- `jsonnet/kube-prometheus/kube-prometheus*-.libsonnet` files were move either to `jsonnet/kube-prometheus/addons/` or `jsonnet/kube-prometheus/platforms/` depending on the feature they provided +- all component libraries are now function- and not object-based +- monitoring-mixins are included inside each component and not globally. `prometheusRules`, `prometheusAlerts`, and `grafanaDashboards` are accessible only per component via `mixin` object (ex. `$.alertmanager.mixin.prometheusAlerts`) +- default repository branch changed from `master` to `main` +- labels on resources have changes, `kubectl apply` will not work correctly due to those field being immutable. Deleting the resource first before applying is a workaround if you are using the kubectl CLI. (This only applies to `Deployments` and `DaemonSets`.) + +### New Features + +- concept of `addons`, `components`, and `platforms` was introduced +- all main `components` are now represented internally by a function with default values and required parameters (see #Component-configuration for more information) +- `$.values` holds main configuration parameters and should be used to set basic stack configuration. +- common parameters across all `components` are stored now in `$.values.common` +- removed dependency on deprecated ksonnet library + +## Details + +### Components, Addons, Platforms + +Those concepts were already present in the repository but it wasn't clear which file is holding what. After refactoring we categorized jsonnet code into 3 buckets and put them into separate directories: +- `components` - main building blocks for kube-prometheus, written as functions responsible for creating multiple objects representing kubernetes manifests. For example all objects for node_exporter deployment are bundled in `components/node_exporter.libsonnet` library +- `addons` - everything that can enhance kube-prometheus deployment. Those are small snippets of code adding a small feature, for example adding anti-affinity to pods via [`addons/anti-affinity.libsonnet`][antiaffinity]. Addons are meant to be used in object-oriented way like `local kp = (import 'kube-prometheus/main.libsonnet') + (import 'kube-prometheus/addons/all-namespaces.libsonnet')` +- `platforms` - currently those are `addons` specialized to allow deploying kube-prometheus project on a specific platform. + +### Component configuration + +Refactoring main components to use functions allowed us to define APIs for said components. Each function has a default set of parameters that can be overridden or that are required to be set by a user. Those default parameters are represented in each component by `defaults` map at the top of each library file, for example in [`node_exporter.libsonnet`][node_exporter_defaults_example]. + +This API is meant to ease the use of kube-prometheus as parameters can be passed from a JSON file and don't need to be in jsonnet format. However, if you need to modify particular parts of the stack, jsonnet allows you to do this and we are also not restricting such access in any way. An example of such modifications can be seen in any of our `addons`, like the [`addons/anti-affinity.libsonnet`][antiaffinity] one. + +### Mixin integration + +Previously kube-prometheus project joined all mixins on a global level. However with a wider adoption of monitoring mixins this turned out to be a problem, especially apparent when two mixins started to use the same configuration field for different purposes. To fix this we moved all mixins into their own respective components: +- alertmanager mixin -> `alertmanager.libsonnet` +- kubernetes mixin -> `k8s-control-plane.libsonnet` +- kube-state-metrics mixin -> `kube-state-metrics.libsonnet` +- node_exporter mixin -> `node_exporter.libsonnet` +- prometheus and thanos sidecar mixins -> `prometheus.libsonnet` +- prometheus-operator mixin -> `prometheus-operator.libsonnet` +- kube-prometheus alerts and rules -> `components/mixin/custom.libsonnet` + +> etcd mixin is a special case as we add it inside an `addon` in `addons/static-etcd.libsonnet` + +This results in creating multiple `PrometheusRule` objects instead of having one giant object as before. It also means each mixin is configured separately and accessing mixin objects is done via `$..mixin`. + +## Examples + +All examples from `examples/` directory were adapted to the new codebase. [Please take a look at them for guideance](https://github.com/prometheus-operator/kube-prometheus/tree/main/examples) + +## Legacy migration + +An example of conversion of a legacy release-0.3 my.jsonnet file to release-0.8 can be found in [migration-example](./migration-example) + +## Advanced usage examples + +For more advanced usage examples you can take a look at those two, open to public, implementations: +- [thaum-xyz/ankhmorpork][thaum] - extending kube-prometheus to adapt to a required environment +- [openshift/cluster-monitoring-operator][openshift] - using kube-prometheus components as standalone libraries to build a custom solution + +## Final note + +Refactoring was a huge undertaking and possibly this document didn't describe in enough detail how to help you with migration to the new stack. If that is the case, please reach out to us by using [GitHub discussions][discussions] feature or directly on [#prometheus-operator kubernetes slack channel][slack]. + + +[antiaffinity]: https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet + +[node_exporter_defaults_example]: https://github.com/prometheus-operator/kube-prometheus/blob/1d2a0e275af97948667777739a18b24464480dc8/jsonnet/kube-prometheus/components/node-exporter.libsonnet#L3-L34 + +[openshift]: https://github.com/openshift/cluster-monitoring-operator/pull/1044 +[thaum]: https://github.com/thaum-xyz/ankhmorpork/blob/master/apps/monitoring/jsonnet + +[discussions]: https://github.com/prometheus-operator/kube-prometheus/discussions +[slack]: http://slack.k8s.io/ diff --git a/docs/monitoring-external-etcd.md b/docs/monitoring-external-etcd.md index f7071851..6ddecb18 100644 --- a/docs/monitoring-external-etcd.md +++ b/docs/monitoring-external-etcd.md @@ -1,5 +1,18 @@ -# Monitoring external etcd -This guide will help you monitor an external etcd cluster. When the etcd cluster is not hosted inside Kubernetes. +--- +title: "Monitoring external etcd" +description: "This guide will help you monitor an external etcd cluster." +lead: "This guide will help you monitor an external etcd cluster." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 640 +toc: true +--- + +When the etcd cluster is not hosted inside Kubernetes. This is often the case with Kubernetes setups. This approach has been tested with kube-aws but the same principals apply to other tools. Note that [etcd.jsonnet](../examples/etcd.jsonnet) & [kube-prometheus-static-etcd.libsonnet](../jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet) (which are described by a section of the [Readme](../README.md#static-etcd-configuration)) do the following: diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md index 51f21201..dc111b69 100644 --- a/docs/monitoring-other-namespaces.md +++ b/docs/monitoring-other-namespaces.md @@ -1,4 +1,17 @@ -# Monitoring other Kubernetes Namespaces +--- +title: "Monitoring other Namespaces" +description: "This guide will help you monitor applications in other Namespaces." +lead: "This guide will help you monitor applications in other Namespaces." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 640 +toc: true +--- + This guide will help you monitor applications in other Namespaces. By default the RBAC rules are only enabled for the `Default` and `kube-system` Namespace during Install. # Setup @@ -7,7 +20,7 @@ This is done in the variable `prometheus.roleSpecificNamespaces`. You usually se Example to create the needed `Role` and `RoleBinding` for the Namespace `foo` : ``` -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { +local kp = (import 'kube-prometheus/main.libsonnet') + { _config+:: { namespace: 'monitoring', diff --git a/docs/weave-net-support.md b/docs/weave-net-support.md index 9924434a..e8c2dfce 100644 --- a/docs/weave-net-support.md +++ b/docs/weave-net-support.md @@ -17,36 +17,42 @@ Using kube-prometheus and kubectl you will be able install the following for mon [embedmd]:# (../examples/weave-net-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'weave-net' then - group { - rules: std.map( - function(rule) - if rule.alert == 'WeaveNetFastDPFlowsLow' then - rule { - expr: 'sum(weave_flows) < 20000', - } - else if rule.alert == 'WeaveNetIPAMUnreachable' then - rule { - expr: 'weave_ipam_unreachable_percentage > 25', - } - else - rule - , - group.rules - ), - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRuleWeaveNet+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'weave-net' then + group { + rules: std.map( + function(rule) + if rule.alert == 'WeaveNetFastDPFlowsLow' then + rule { + expr: 'sum(weave_flows) < 20000', + } + else if rule.alert == 'WeaveNetIPAMUnreachable' then + rule { + expr: 'weave_ipam_unreachable_percentage > 25', + } + else + rule + , + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; diff --git a/docs/windows.md b/docs/windows.md new file mode 100644 index 00000000..dcdc2be9 --- /dev/null +++ b/docs/windows.md @@ -0,0 +1,22 @@ +# Windows + +The [Windows addon](../examples/windows.jsonnet) adds the dashboards and rules from [kubernetes-monitoring/kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin#dashboards-for-windows-nodes). + +Currently, Windows does not support running with [windows_exporter](https://github.com/prometheus-community/windows_exporter) in a pod so this add on uses [additional scrape configuration](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/additional-scrape-config.md) to set up a static config to scrape the node ports where windows_exporter is configured. + + +The addon requires you to specify the node ips and ports where it can find the windows_exporter. See the [full example](../examples/windows.jsonnet) for setup. + +``` +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/windows.libsonnet') + + { + values+:: { + windowsScrapeConfig+:: { + static_configs: { + targets: ["10.240.0.65:5000", "10.240.0.63:5000"], + }, + }, + }, + }; +``` diff --git a/example.jsonnet b/example.jsonnet index 54de1e35..b181d647 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -1,28 +1,34 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/examples/additional-namespaces-servicemonitor.jsonnet b/examples/additional-namespaces-servicemonitor.jsonnet index 0f3add96..ec978da7 100644 --- a/examples/additional-namespaces-servicemonitor.jsonnet +++ b/examples/additional-namespaces-servicemonitor.jsonnet @@ -1,11 +1,13 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, prometheus+:: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, - prometheus+:: { + exampleApplication: { serviceMonitorMyNamespace: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -37,4 +39,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/additional-namespaces.jsonnet b/examples/additional-namespaces.jsonnet index 957fd912..45c606a6 100644 --- a/examples/additional-namespaces.jsonnet +++ b/examples/additional-namespaces.jsonnet @@ -1,8 +1,10 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, - prometheus+:: { + prometheus+: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, diff --git a/examples/alertmanager-config-external.jsonnet b/examples/alertmanager-config-external.jsonnet index c2b34cca..49f6b558 100644 --- a/examples/alertmanager-config-external.jsonnet +++ b/examples/alertmanager-config-external.jsonnet @@ -1,5 +1,5 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: importstr 'alertmanager-config.yaml', }, diff --git a/examples/alertmanager-config.jsonnet b/examples/alertmanager-config.jsonnet index f08dbe19..9702711a 100644 --- a/examples/alertmanager-config.jsonnet +++ b/examples/alertmanager-config.jsonnet @@ -1,5 +1,5 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: ||| global: diff --git a/examples/all-namespaces.jsonnet b/examples/all-namespaces.jsonnet index 7c5d149f..52534766 100644 --- a/examples/all-namespaces.jsonnet +++ b/examples/all-namespaces.jsonnet @@ -1,9 +1,10 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-all-namespaces.libsonnet') + { - _config+:: { - namespace: 'monitoring', - - prometheus+:: { +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { namespaces: [], }, }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet b/examples/anti-affinity.jsonnet similarity index 66% rename from jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet rename to examples/anti-affinity.jsonnet index ad13373f..23720837 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet +++ b/examples/anti-affinity.jsonnet @@ -1,5 +1,13 @@ -local kp = (import './kube-prometheus/kube-prometheus.libsonnet'); +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/anti-affinity.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + diff --git a/examples/changing-default-rules.libsonnet b/examples/changing-default-rules.libsonnet new file mode 100644 index 00000000..1a0c192b --- /dev/null +++ b/examples/changing-default-rules.libsonnet @@ -0,0 +1,92 @@ +local filter = { + kubernetesControlPlane+: { + prometheusRule+:: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter( + function(rule) + rule.alert != 'KubeStatefulSetReplicasMismatch', + group.rules + ), + } + else + group, + super.groups + ), + }, + }, + }, +}; +local update = { + kubernetesControlPlane+: { + prometheusRule+:: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == 'KubePodCrashLooping' then + rule { + expr: 'rate(kube_pod_container_status_restarts_total{namespace=kube-system,job="kube-state-metrics"}[10m]) * 60 * 5 > 0', + } + else + rule, + group.rules + ), + } + else + group, + super.groups + ), + }, + }, + }, +}; + +local add = { + exampleApplication:: { + prometheusRule+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'example-application-rules', + namespace: $.values.common.namespace, + }, + spec: (import 'existingrule.json'), + }, + }, +}; +local kp = (import 'kube-prometheus/main.libsonnet') + + filter + + update + + add + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } + +{ ['exampleApplication-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet b/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet index 8600d818..94089e0c 100644 --- a/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet +++ b/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet @@ -1,7 +1,8 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet index dcebf6dd..1b37af50 100644 --- a/examples/eks-cni-example.jsonnet +++ b/examples/eks-cni-example.jsonnet @@ -1,20 +1,28 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + kubePrometheus+: { + platform: 'eks', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + kubernetesControlPlane+: { + prometheusRuleEksCNI+: { + spec+: { + groups+: [ { - record: 'aws_eks_available_ip', - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + }, + ], }, ], }, - ], + }, }, }; diff --git a/examples/etcd-skip-verify.jsonnet b/examples/etcd-skip-verify.jsonnet index 603ba710..9982fa16 100644 --- a/examples/etcd-skip-verify.jsonnet +++ b/examples/etcd-skip-verify.jsonnet @@ -1,8 +1,9 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', - +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/static-etcd.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, etcd+:: { ips: ['127.0.0.1'], clientCA: importstr 'etcd-client-ca.crt', diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index 03d390cd..7126ee31 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -1,10 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/static-etcd.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, // Reference info: https://github.com/coreos/kube-prometheus/blob/master/README.md#static-etcd-configuration - etcd+:: { + etcd+: { // Configure this to be the IP(s) to scrape - i.e. your etcd node(s) (use commas to separate multiple values). ips: ['127.0.0.1'], diff --git a/examples/existingrule.json b/examples/existingrule.json index 41d6620b..8486eb7d 100644 --- a/examples/existingrule.json +++ b/examples/existingrule.json @@ -1 +1 @@ -{"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file +{"groups":[{"name":"example-group","rules":[{"alert":"ExampleAlert","annotations":{"description":"This is an example alert."},"expr":"vector(1)","labels":{"severity":"warning"}}]}]} \ No newline at end of file diff --git a/examples/existingrule.yaml b/examples/existingrule.yaml index 6a67032f..ab5de270 100644 --- a/examples/existingrule.yaml +++ b/examples/existingrule.yaml @@ -1,9 +1,9 @@ groups: - name: example-group rules: - - alert: Watchdog + - alert: ExampleAlert expr: vector(1) labels: - severity: "none" + severity: "warning" annotations: - description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional. + description: This is an example alert. diff --git a/examples/grafana-additional-jsonnet-dashboard-example.jsonnet b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet index b9b26fca..8c1973ca 100644 --- a/examples/grafana-additional-jsonnet-dashboard-example.jsonnet +++ b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet @@ -5,35 +5,37 @@ local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - dashboards+:: { - 'my-dashboard.json': - dashboard.new('My Dashboard') - .addTemplate( - { - current: { - text: 'Prometheus', - value: 'Prometheus', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', }, - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addRow( - row.new() - .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') - .addTarget(prometheus.target('vector(1)'))) - ), + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, }, }, }; diff --git a/examples/grafana-additional-rendered-dashboard-example-2.jsonnet b/examples/grafana-additional-rendered-dashboard-example-2.jsonnet index 7d0926e9..7b8825d7 100644 --- a/examples/grafana-additional-rendered-dashboard-example-2.jsonnet +++ b/examples/grafana-additional-rendered-dashboard-example-2.jsonnet @@ -1,10 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - rawDashboards+:: { - 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + rawDashboards+:: { + 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), + }, }, }, }; diff --git a/examples/grafana-additional-rendered-dashboard-example.jsonnet b/examples/grafana-additional-rendered-dashboard-example.jsonnet index 883c6097..1a8c8b9a 100644 --- a/examples/grafana-additional-rendered-dashboard-example.jsonnet +++ b/examples/grafana-additional-rendered-dashboard-example.jsonnet @@ -1,13 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafanaDashboards+:: { // monitoring-mixin compatibility - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), - }, - grafana+:: { - dashboards+:: { // use this method to import your dashboards to Grafana - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { // use this method to import your dashboards to Grafana + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, }, }, }; diff --git a/examples/grafana-only-dashboards.jsonnet b/examples/grafana-only-dashboards.jsonnet new file mode 100644 index 00000000..2c5520c0 --- /dev/null +++ b/examples/grafana-only-dashboards.jsonnet @@ -0,0 +1,25 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, + + // Disable all grafana-related objects apart from dashboards and datasource + grafana: { + dashboardSources:: {}, + deployment:: {}, + serviceAccount:: {}, + serviceMonitor:: {}, + service:: {}, + }, + }; + +// Manifestation +{ + [component + '-' + resource + '.json']: kp[component][resource] + for component in std.objectFields(kp) + for resource in std.objectFields(kp[component]) +} diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 023af577..b1197c5d 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -14,10 +14,12 @@ local ingress(name, namespace, rules) = { }; local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, grafana+:: { config+: { sections+: { @@ -47,15 +49,19 @@ local kp = ingress+:: { 'alertmanager-main': ingress( 'alertmanager-main', - $._config.namespace, + $.values.common.namespace, [{ host: 'alertmanager.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'alertmanager-main', - port: 'web', + port: { + name: 'web', + }, }, }, }], @@ -64,15 +70,19 @@ local kp = ), grafana: ingress( 'grafana', - $._config.namespace, + $.values.common.namespace, [{ host: 'grafana.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'grafana', - port: 'http', + port: { + name: 'http', + }, }, }, }], @@ -81,15 +91,19 @@ local kp = ), 'prometheus-k8s': ingress( 'prometheus-k8s', - $._config.namespace, + $.values.common.namespace, [{ host: 'prometheus.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'prometheus-k8s', - port: 'web', + port: { + name: 'web', + }, }, }, }], @@ -105,7 +119,7 @@ local kp = kind: 'Secret', metadata: { name: 'basic-auth', - namespace: $._config.namespace, + namespace: $.values.common.namespace, }, data: { auth: std.base64(importstr 'auth') }, type: 'Opaque', diff --git a/examples/internal-registry.jsonnet b/examples/internal-registry.jsonnet index f1d1e8ac..fc470de6 100644 --- a/examples/internal-registry.jsonnet +++ b/examples/internal-registry.jsonnet @@ -1,7 +1,9 @@ -local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local mixin = import 'kube-prometheus/addons/config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, } + mixin.withImageRepository('internal-registry.com/organization'); diff --git a/examples/jsonnet-snippets/bootkube.jsonnet b/examples/jsonnet-snippets/bootkube.jsonnet deleted file mode 100644 index f7386a01..00000000 --- a/examples/jsonnet-snippets/bootkube.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') diff --git a/examples/jsonnet-snippets/kops-coredns.jsonnet b/examples/jsonnet-snippets/kops-coredns.jsonnet deleted file mode 100644 index 6ba445df..00000000 --- a/examples/jsonnet-snippets/kops-coredns.jsonnet +++ /dev/null @@ -1,3 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') diff --git a/examples/jsonnet-snippets/kops.jsonnet b/examples/jsonnet-snippets/kops.jsonnet deleted file mode 100644 index 4ff9ceae..00000000 --- a/examples/jsonnet-snippets/kops.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') diff --git a/examples/jsonnet-snippets/kube-aws.jsonnet b/examples/jsonnet-snippets/kube-aws.jsonnet deleted file mode 100644 index b0842eb2..00000000 --- a/examples/jsonnet-snippets/kube-aws.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') diff --git a/examples/jsonnet-snippets/kubeadm.jsonnet b/examples/jsonnet-snippets/kubeadm.jsonnet deleted file mode 100644 index a7837163..00000000 --- a/examples/jsonnet-snippets/kubeadm.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') diff --git a/examples/jsonnet-snippets/kubespray.jsonnet b/examples/jsonnet-snippets/kubespray.jsonnet deleted file mode 100644 index 1665cf72..00000000 --- a/examples/jsonnet-snippets/kubespray.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') diff --git a/examples/jsonnet-snippets/node-ports.jsonnet b/examples/jsonnet-snippets/node-ports.jsonnet index c02f1ae7..abc70c94 100644 --- a/examples/jsonnet-snippets/node-ports.jsonnet +++ b/examples/jsonnet-snippets/node-ports.jsonnet @@ -1,2 +1,2 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +(import 'kube-prometheus/addons/node-ports.libsonnet') diff --git a/examples/jsonnet-snippets/platform.jsonnet b/examples/jsonnet-snippets/platform.jsonnet new file mode 100644 index 00000000..e3a58804 --- /dev/null +++ b/examples/jsonnet-snippets/platform.jsonnet @@ -0,0 +1,8 @@ +(import 'kube-prometheus/main.libsonnet') + +{ + values+:: { + common+: { + platform: 'example-platform', + }, + }, +} diff --git a/examples/ksonnet-example.jsonnet b/examples/ksonnet-example.jsonnet index eadd2cb7..36640ab4 100644 --- a/examples/ksonnet-example.jsonnet +++ b/examples/ksonnet-example.jsonnet @@ -1,4 +1,4 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { nodeExporter+: { daemonset+: { metadata+: { diff --git a/examples/kubeProxy.jsonnet b/examples/kubeProxy.jsonnet new file mode 100644 index 00000000..03a7b3c7 --- /dev/null +++ b/examples/kubeProxy.jsonnet @@ -0,0 +1,20 @@ +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + kubernetesControlPlane+: { + kubeProxy: true, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 38dd6c89..455b38bd 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,26 +1,32 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', + (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; local manifests = // Uncomment line below to enable vertical auto scaling of kube-state-metrics //{ ['ksm-autoscaler-' + name]: kp.ksmAutoscaler[name] for name in std.objectFields(kp.ksmAutoscaler) } + - { ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + - // serviceMonitor is separated so that it can be created after the CRDs are ready + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + - { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) }; local kustomizationResourceFile(name) = './manifests/' + name + '.yaml'; local kustomization = { diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index 3073612a..c5a1bc68 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -1,15 +1,16 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. - (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + + (import 'kube-prometheus/addons/node-ports.libsonnet') + { - _config+:: { - namespace: 'monitoring', - alertmanager+:: { + values+:: { + common+: { + namespace: 'monitoring', + }, + alertmanager+: { config: importstr 'alertmanager-config.yaml', }, - grafana+:: { + grafana+: { config: { // http://docs.grafana.org/installation/configuration/ sections: { // Do not require grafana users to login/authenticate @@ -17,12 +18,15 @@ local kp = }, }, }, + kubePrometheus+: { + platform: 'kubeadm', + }, }, // For simplicity, each of the following values for 'externalUrl': // * assume that `minikube ip` prints "192.168.99.100" // * hard-code the NodePort for each app - prometheus+:: { + prometheus+: { prometheus+: { // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec spec+: { @@ -38,7 +42,7 @@ local kp = }, }, }, - alertmanager+:: { + alertmanager+: { alertmanager+: { // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec spec+: { diff --git a/examples/mixin-inclusion.jsonnet b/examples/mixin-inclusion.jsonnet new file mode 100644 index 00000000..fc75c628 --- /dev/null +++ b/examples/mixin-inclusion.jsonnet @@ -0,0 +1,30 @@ +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); +local etcdMixin = addMixin({ + name: 'etcd', + mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + _config+: {}, // mixin configuration object + }, +}); + +local kp = (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + grafana+: { + // Adding new dashboard to grafana. This will modify grafana configMap with dashboards + dashboards+: etcdMixin.grafanaDashboards, + }, + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +// Rendering prometheusRules object. This is an object compatible with prometheus-operator CRD definition for prometheusRule +{ 'external-mixins/etcd-mixin-prometheus-rules': etcdMixin.prometheusRules } diff --git a/examples/pod-security-policies.jsonnet b/examples/pod-security-policies.jsonnet new file mode 100644 index 00000000..3274c937 --- /dev/null +++ b/examples/pod-security-policies.jsonnet @@ -0,0 +1,23 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet'); + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +// Add the restricted psp to setup +{ 'setup/0podsecuritypolicy-restricted': kp.restrictedPodSecurityPolicy } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/examples/prometheus-additional-alert-rule-example.jsonnet b/examples/prometheus-additional-alert-rule-example.jsonnet index 6e63382e..43dc538c 100644 --- a/examples/prometheus-additional-alert-rule-example.jsonnet +++ b/examples/prometheus-additional-alert-rule-example.jsonnet @@ -1,25 +1,37 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - alert: 'Watchdog', - expr: 'vector(1)', - labels: { - severity: 'none', - }, - annotations: { - description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', - }, + name: 'example-group', + rules: [ + { + alert: 'ExampleAlert', + expr: 'vector(1)', + labels: { + severity: 'warning', + }, + annotations: { + description: 'This is an example alert.', + }, + }, + ], }, ], }, - ], + }, }, }; @@ -30,4 +42,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-additional-recording-rule-example.jsonnet b/examples/prometheus-additional-recording-rule-example.jsonnet index 132bd0db..5e67b03f 100644 --- a/examples/prometheus-additional-recording-rule-example.jsonnet +++ b/examples/prometheus-additional-recording-rule-example.jsonnet @@ -1,19 +1,31 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - record: 'some_recording_rule_name', - expr: 'vector(1)', + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], }, ], }, - ], + }, }, }; @@ -24,4 +36,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-additional-rendered-rule-example.jsonnet b/examples/prometheus-additional-rendered-rule-example.jsonnet index b6d39a40..66c7937c 100644 --- a/examples/prometheus-additional-rendered-rule-example.jsonnet +++ b/examples/prometheus-additional-rendered-rule-example.jsonnet @@ -1,9 +1,21 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: (import 'existingrule.json').groups, + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: (import 'existingrule.json').groups, + }, + }, }, }; @@ -14,4 +26,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-name-override.jsonnet b/examples/prometheus-name-override.jsonnet index 86218012..b6c39060 100644 --- a/examples/prometheus-name-override.jsonnet +++ b/examples/prometheus-name-override.jsonnet @@ -1,4 +1,4 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { prometheus+: { prometheus+: { metadata+: { diff --git a/examples/prometheus-pvc.jsonnet b/examples/prometheus-pvc.jsonnet index e753499e..5beffd84 100644 --- a/examples/prometheus-pvc.jsonnet +++ b/examples/prometheus-pvc.jsonnet @@ -1,14 +1,15 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, prometheus+:: { diff --git a/examples/strip-limits.jsonnet b/examples/strip-limits.jsonnet index 69912b64..fc43e286 100644 --- a/examples/strip-limits.jsonnet +++ b/examples/strip-limits.jsonnet @@ -1,7 +1,9 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-strip-limits.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/strip-limits.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; diff --git a/examples/thanos-sidecar.jsonnet b/examples/thanos-sidecar.jsonnet new file mode 100644 index 00000000..7ed55036 --- /dev/null +++ b/examples/thanos-sidecar.jsonnet @@ -0,0 +1,33 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { + thanos: { + version: '0.19.0', + image: 'quay.io/thanos/thanos:v0.19.0', + objectStorageConfig: { + key: 'thanos.yaml', // How the file inside the secret is called + name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config + }, + }, + }, + }, + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor is separated so that it can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/tolerations.libsonnet b/examples/tolerations.libsonnet index 35776f39..f244cebf 100644 --- a/examples/tolerations.libsonnet +++ b/examples/tolerations.libsonnet @@ -1,23 +1,19 @@ { - _config+:: { - tolerations+:: [ - { - key: 'key1', - operator: 'Equal', - value: 'value1', - effect: 'NoSchedule', - }, - { - key: 'key2', - operator: 'Exists', - }, - ], - }, - prometheus+: { prometheus+: { spec+: { - tolerations: [t for t in $._config.tolerations], + tolerations: [ + { + key: 'key1', + operator: 'Equal', + value: 'value1', + effect: 'NoSchedule', + }, + { + key: 'key2', + operator: 'Exists', + }, + ], }, }, }, diff --git a/examples/weave-net-example.jsonnet b/examples/weave-net-example.jsonnet index c6cc733c..eeeb622d 100644 --- a/examples/weave-net-example.jsonnet +++ b/examples/weave-net-example.jsonnet @@ -1,33 +1,39 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'weave-net' then - group { - rules: std.map( - function(rule) - if rule.alert == 'WeaveNetFastDPFlowsLow' then - rule { - expr: 'sum(weave_flows) < 20000', - } - else if rule.alert == 'WeaveNetIPAMUnreachable' then - rule { - expr: 'weave_ipam_unreachable_percentage > 25', - } - else - rule - , - group.rules - ), - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRuleWeaveNet+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'weave-net' then + group { + rules: std.map( + function(rule) + if rule.alert == 'WeaveNetFastDPFlowsLow' then + rule { + expr: 'sum(weave_flows) < 20000', + } + else if rule.alert == 'WeaveNetIPAMUnreachable' then + rule { + expr: 'weave_ipam_unreachable_percentage > 25', + } + else + rule + , + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; diff --git a/examples/windows.jsonnet b/examples/windows.jsonnet new file mode 100644 index 00000000..d90fb8da --- /dev/null +++ b/examples/windows.jsonnet @@ -0,0 +1,33 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/windows.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + windowsScrapeConfig+:: { + static_configs: [{ + targets: ['10.240.0.65:5000', '10.240.0.63:5000'], + }], + }, + }, + }; + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/go.mod b/go.mod index 1c6c40f1..b0bfa19c 100644 --- a/go.mod +++ b/go.mod @@ -1,13 +1,9 @@ module github.com/prometheus-operator/kube-prometheus -go 1.13 +go 1.15 require ( github.com/Jeffail/gabs v1.4.0 - github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c - github.com/campoy/embedmd v1.0.0 - github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f - github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.8.0 k8s.io/apimachinery v0.19.3 diff --git a/go.sum b/go.sum index 657f1cd9..1fca7e54 100644 --- a/go.sum +++ b/go.sum @@ -54,10 +54,6 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= -github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c h1:hb6WqfcKQZlNx/vahy51SaIvKnoXD5609Nm0PC4msEM= -github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c/go.mod h1:+00lOjYXPgMfxHVPvg9GDtc3BX5Xh5aFpB4gMB8gfMo= -github.com/campoy/embedmd v1.0.0 h1:V4kI2qTJJLf4J29RzI/MAt2c3Bl4dQSYPuflzwFH2hY= -github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8= github.com/casbin/casbin/v2 v2.1.2/go.mod h1:YcPU1XXisHhLzuxH9coDNf2FbKpjGlbCg3n9yuLkIJQ= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -93,8 +89,6 @@ github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7 github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= -github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= @@ -155,8 +149,6 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f h1:mw4KoMG5/DXLPhpKXQRYTEIZFkFo0a1HU2R1HbeYpek= -github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= @@ -216,8 +208,6 @@ github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/u github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 h1:4BKZ6LDqPc2wJDmaKnmYD/vDjUptJtnUpai802MibFc= -github.com/jsonnet-bundler/jsonnet-bundler v0.4.0/go.mod h1:/by7P/OoohkI3q4CgSFqcoFsVY+IaNbzOVDknEsKDeU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= @@ -241,15 +231,8 @@ github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0Q github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mattn/go-colorable v0.0.9 h1:UVL0vNpWh04HeJXV0KLcaT7r06gOH2l4OW6ddYRUIY4= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= -github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA= -github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.6 h1:SrwhHcpV4nWrMGdNcC2kXpMfcBVYGDuTArqyhocJgvA= -github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM= -github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= @@ -344,8 +327,6 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= -github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= -github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= @@ -462,8 +443,6 @@ golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -474,8 +453,6 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy deleted file mode 100755 index 0c7cd7c1..00000000 --- a/hack/example-service-monitoring/deploy +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -kubectl apply -f examples/example-app diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown deleted file mode 100755 index 1a49f462..00000000 --- a/hack/example-service-monitoring/teardown +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -kubectl delete -f examples/example-app diff --git a/hack/jsonnet-docker-image b/hack/jsonnet-docker-image deleted file mode 100644 index e69de29b..00000000 diff --git a/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet b/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet new file mode 100644 index 00000000..34f83173 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet @@ -0,0 +1,22 @@ +{ + prometheus+:: { + clusterRole+: { + rules+: [ + { + apiGroups: [''], + resources: ['services', 'endpoints', 'pods'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['networking.k8s.io'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + ], + }, + // There is no need for specific namespaces RBAC as this addon grants + // all required permissions for every namespace + roleBindingSpecificNamespaces:: null, + roleSpecificNamespaces:: null, + }, +} diff --git a/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet b/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet new file mode 100644 index 00000000..e266b913 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet @@ -0,0 +1,99 @@ +{ + values+:: { + alertmanager+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + prometheus+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + blackboxExporter+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + prometheusAdapter+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + }, + + antiaffinity(labelSelector, namespace, type, topologyKey):: { + local podAffinityTerm = { + namespaces: [namespace], + topologyKey: topologyKey, + labelSelector: { + matchLabels: labelSelector, + }, + }, + + affinity: { + podAntiAffinity: if type == 'soft' then { + preferredDuringSchedulingIgnoredDuringExecution: [{ + weight: 100, + podAffinityTerm: podAffinityTerm, + }], + } else if type == 'hard' then { + requiredDuringSchedulingIgnoredDuringExecution: [ + podAffinityTerm, + ], + } else error 'podAntiAffinity must be either "soft" or "hard"', + }, + }, + + alertmanager+: { + alertmanager+: { + spec+: + $.antiaffinity( + $.alertmanager._config.selectorLabels, + $.values.common.namespace, + $.values.alertmanager.podAntiAffinity, + $.values.alertmanager.podAntiAffinityTopologyKey, + ), + }, + }, + + prometheus+: { + prometheus+: { + spec+: + $.antiaffinity( + $.prometheus._config.selectorLabels, + $.values.common.namespace, + $.values.prometheus.podAntiAffinity, + $.values.prometheus.podAntiAffinityTopologyKey, + ), + }, + }, + + blackboxExporter+: { + deployment+: { + spec+: { + template+: { + spec+: + $.antiaffinity( + $.blackboxExporter._config.selectorLabels, + $.values.common.namespace, + $.values.blackboxExporter.podAntiAffinity, + $.values.blackboxExporter.podAntiAffinityTopologyKey, + ), + }, + }, + }, + }, + + prometheusAdapter+: { + deployment+: { + spec+: { + template+: { + spec+: + $.antiaffinity( + $.prometheusAdapter._config.selectorLabels, + $.values.common.namespace, + $.values.prometheusAdapter.podAntiAffinity, + $.values.prometheusAdapter.podAntiAffinityTopologyKey, + ), + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet b/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet new file mode 100644 index 00000000..61e7aaa4 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet @@ -0,0 +1,110 @@ +{ + values+:: { + awsVpcCni: { + // `minimumWarmIPs` should be inferior or equal to `WARM_IP_TARGET`. + // + // References: + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/docs/eni-and-ip-target.md + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/pkg/ipamd/ipamd.go#L61-L71 + minimumWarmIPs: 10, + minimumWarmIPsTime: '10m', + }, + }, + kubernetesControlPlane+: { + serviceAwsVpcCni: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'aws-node', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'aws-node' }, + }, + spec: { + ports: [ + { + name: 'cni-metrics-port', + port: 61678, + targetPort: 61678, + }, + ], + selector: { 'app.kubernetes.io/name': 'aws-node' }, + clusterIP: 'None', + }, + }, + + serviceMonitorAwsVpcCni: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'aws-node', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'aws-node', + }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + }, + ], + }, + }, + + prometheusRuleAwsVpcCni: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: $.prometheus._config.commonLabels + $.prometheus._config.mixin.ruleLabels, + name: 'aws-vpc-cni-rules', + namespace: $.prometheus._config.namespace, + }, + spec: { + groups: [ + { + name: 'aws-vpc-cni.rules', + rules: [ + { + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $.values.awsVpcCni.minimumWarmIPs, + labels: { + severity: 'critical', + }, + annotations: { + summary: 'AWS VPC CNI has a low warm IP pool', + description: ||| + Instance {{ $labels.instance }} has only {{ $value }} warm IPs which is lower than set threshold of %s. + It could mean the current subnet is out of available IP addresses or the CNI is unable to request them from the EC2 API. + ||| % $.values.awsVpcCni.minimumWarmIPs, + }, + 'for': $.values.awsVpcCni.minimumWarmIPsTime, + alert: 'AwsVpcCniWarmIPsLow', + }, + ], + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/config-mixins.libsonnet b/jsonnet/kube-prometheus/addons/config-mixins.libsonnet new file mode 100644 index 00000000..ca2f4468 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/config-mixins.libsonnet @@ -0,0 +1,36 @@ +local imageName(image) = + local parts = std.split(image, '/'); + local len = std.length(parts); + if len == 3 then + // registry.com/org/image + parts[2] + else if len == 2 then + // org/image + parts[1] + else if len == 1 then + // image, ie. busybox + parts[0] + else + error 'unknown image format: ' + image; + + +// withImageRepository is a mixin that replaces all images prefixes by repository. eg. +// quay.io/coreos/addon-resizer -> $repository/addon-resizer +// grafana/grafana -> grafana $repository/grafana +local withImageRepository(repository) = { + local oldRepos = super.values.common.images, + local substituteRepository(image, repository) = + if repository == null then image else repository + '/' + imageName(image), + values+:: { + common+:: { + images:: { + [field]: substituteRepository(oldRepos[field], repository) + for field in std.objectFields(oldRepos) + }, + }, + }, +}; + +{ + withImageRepository:: withImageRepository, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet b/jsonnet/kube-prometheus/addons/custom-metrics.libsonnet similarity index 94% rename from jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet rename to jsonnet/kube-prometheus/addons/custom-metrics.libsonnet index b0240ec3..06e9c5a0 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet +++ b/jsonnet/kube-prometheus/addons/custom-metrics.libsonnet @@ -2,9 +2,9 @@ // For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links { - _config+:: { - prometheusAdapter+:: { - namespace: $._config.namespace, + values+:: { + prometheusAdapter+: { + namespace: $.values.common.namespace, // Rules for custom-metrics config+:: { rules+: [ @@ -78,7 +78,7 @@ }, }, - prometheusAdapter+:: { + prometheusAdapter+: { customMetricsApiService: { apiVersion: 'apiregistration.k8s.io/v1', kind: 'APIService', @@ -88,7 +88,7 @@ spec: { service: { name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }, group: 'custom.metrics.k8s.io', version: 'v1beta1', @@ -106,7 +106,7 @@ spec: { service: { name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }, group: 'custom.metrics.k8s.io', version: 'v1beta2', @@ -133,7 +133,6 @@ metadata: { name: 'custom-metrics-server-resources', }, - roleRef: { apiGroup: 'rbac.authorization.k8s.io', kind: 'ClusterRole', @@ -142,7 +141,7 @@ subjects: [{ kind: 'ServiceAccount', name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }], }, customMetricsClusterRoleBindingHPA: { @@ -151,7 +150,6 @@ metadata: { name: 'hpa-controller-custom-metrics', }, - roleRef: { apiGroup: 'rbac.authorization.k8s.io', kind: 'ClusterRole', diff --git a/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet b/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet new file mode 100644 index 00000000..374b8601 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet @@ -0,0 +1,139 @@ +[ + // Drop all kubelet metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)', + action: 'drop', + }, + // Drop all scheduler metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)', + action: 'drop', + }, + // Drop all apiserver metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)', + action: 'drop', + }, + // Drop all docker metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)', + action: 'drop', + }, + // Drop all reflector metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)', + action: 'drop', + }, + // Drop all etcd metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)', + action: 'drop', + }, + // Drop all transformation metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'transformation_(transformation_latencies_microseconds|failures_total)', + action: 'drop', + }, + // Drop all other metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: '(' + std.join('|', + [ + 'admission_quota_controller_adds', + 'admission_quota_controller_depth', + 'admission_quota_controller_longest_running_processor_microseconds', + 'admission_quota_controller_queue_latency', + 'admission_quota_controller_unfinished_work_seconds', + 'admission_quota_controller_work_duration', + 'APIServiceOpenAPIAggregationControllerQueue1_adds', + 'APIServiceOpenAPIAggregationControllerQueue1_depth', + 'APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds', + 'APIServiceOpenAPIAggregationControllerQueue1_queue_latency', + 'APIServiceOpenAPIAggregationControllerQueue1_retries', + 'APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds', + 'APIServiceOpenAPIAggregationControllerQueue1_work_duration', + 'APIServiceRegistrationController_adds', + 'APIServiceRegistrationController_depth', + 'APIServiceRegistrationController_longest_running_processor_microseconds', + 'APIServiceRegistrationController_queue_latency', + 'APIServiceRegistrationController_retries', + 'APIServiceRegistrationController_unfinished_work_seconds', + 'APIServiceRegistrationController_work_duration', + 'autoregister_adds', + 'autoregister_depth', + 'autoregister_longest_running_processor_microseconds', + 'autoregister_queue_latency', + 'autoregister_retries', + 'autoregister_unfinished_work_seconds', + 'autoregister_work_duration', + 'AvailableConditionController_adds', + 'AvailableConditionController_depth', + 'AvailableConditionController_longest_running_processor_microseconds', + 'AvailableConditionController_queue_latency', + 'AvailableConditionController_retries', + 'AvailableConditionController_unfinished_work_seconds', + 'AvailableConditionController_work_duration', + 'crd_autoregistration_controller_adds', + 'crd_autoregistration_controller_depth', + 'crd_autoregistration_controller_longest_running_processor_microseconds', + 'crd_autoregistration_controller_queue_latency', + 'crd_autoregistration_controller_retries', + 'crd_autoregistration_controller_unfinished_work_seconds', + 'crd_autoregistration_controller_work_duration', + 'crdEstablishing_adds', + 'crdEstablishing_depth', + 'crdEstablishing_longest_running_processor_microseconds', + 'crdEstablishing_queue_latency', + 'crdEstablishing_retries', + 'crdEstablishing_unfinished_work_seconds', + 'crdEstablishing_work_duration', + 'crd_finalizer_adds', + 'crd_finalizer_depth', + 'crd_finalizer_longest_running_processor_microseconds', + 'crd_finalizer_queue_latency', + 'crd_finalizer_retries', + 'crd_finalizer_unfinished_work_seconds', + 'crd_finalizer_work_duration', + 'crd_naming_condition_controller_adds', + 'crd_naming_condition_controller_depth', + 'crd_naming_condition_controller_longest_running_processor_microseconds', + 'crd_naming_condition_controller_queue_latency', + 'crd_naming_condition_controller_retries', + 'crd_naming_condition_controller_unfinished_work_seconds', + 'crd_naming_condition_controller_work_duration', + 'crd_openapi_controller_adds', + 'crd_openapi_controller_depth', + 'crd_openapi_controller_longest_running_processor_microseconds', + 'crd_openapi_controller_queue_latency', + 'crd_openapi_controller_retries', + 'crd_openapi_controller_unfinished_work_seconds', + 'crd_openapi_controller_work_duration', + 'DiscoveryController_adds', + 'DiscoveryController_depth', + 'DiscoveryController_longest_running_processor_microseconds', + 'DiscoveryController_queue_latency', + 'DiscoveryController_retries', + 'DiscoveryController_unfinished_work_seconds', + 'DiscoveryController_work_duration', + 'kubeproxy_sync_proxy_rules_latency_microseconds', + 'non_structural_schema_condition_controller_adds', + 'non_structural_schema_condition_controller_depth', + 'non_structural_schema_condition_controller_longest_running_processor_microseconds', + 'non_structural_schema_condition_controller_queue_latency', + 'non_structural_schema_condition_controller_retries', + 'non_structural_schema_condition_controller_unfinished_work_seconds', + 'non_structural_schema_condition_controller_work_duration', + 'rest_client_request_latency_seconds', + 'storage_operation_errors_total', + 'storage_operation_status_count', + ]) + ')', + action: 'drop', + }, +] diff --git a/jsonnet/kube-prometheus/addons/external-metrics.libsonnet b/jsonnet/kube-prometheus/addons/external-metrics.libsonnet new file mode 100644 index 00000000..928d29e7 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/external-metrics.libsonnet @@ -0,0 +1,95 @@ +// External metrics API allows the HPA v2 to scale based on metrics coming from outside of Kubernetes cluster +// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links + +{ + values+:: { + prometheusAdapter+: { + namespace: $.values.common.namespace, + // Rules for external-metrics + config+:: { + externalRules+: [ + // { + // seriesQuery: '{__name__=~"^.*_queue$",namespace!=""}', + // seriesFilters: [], + // resources: { + // overrides: { + // namespace: { resource: 'namespace' } + // }, + // }, + // name: { matches: '^.*_queue$', as: '$0' }, + // metricsQuery: 'max(<<.Series>>{<<.LabelMatchers>>})', + // }, + ], + }, + }, + }, + + prometheusAdapter+: { + externalMetricsApiService: { + apiVersion: 'apiregistration.k8s.io/v1', + kind: 'APIService', + metadata: { + name: 'v1beta1.external.metrics.k8s.io', + }, + spec: { + service: { + name: $.prometheusAdapter.service.metadata.name, + namespace: $.values.prometheusAdapter.namespace, + }, + group: 'external.metrics.k8s.io', + version: 'v1beta1', + insecureSkipTLSVerify: true, + groupPriorityMinimum: 100, + versionPriority: 100, + }, + }, + externalMetricsClusterRoleServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'external-metrics-server-resources', + }, + rules: [{ + apiGroups: ['external.metrics.k8s.io'], + resources: ['*'], + verbs: ['*'], + }], + }, + externalMetricsClusterRoleBindingServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'external-metrics-server-resources', + }, + + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'external-metrics-server-resources', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.prometheusAdapter.serviceAccount.metadata.name, + namespace: $.values.prometheusAdapter.namespace, + }], + }, + externalMetricsClusterRoleBindingHPA: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'hpa-controller-external-metrics', + }, + + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'external-metrics-server-resources', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'horizontal-pod-autoscaler', + namespace: 'kube-system', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet similarity index 98% rename from jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet rename to jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet index 73d0b9d7..ab6f2943 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet +++ b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet @@ -1,5 +1,5 @@ { - prometheus+:: { + prometheus+: { serviceMonitorKubelet+: { spec+: { diff --git a/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet b/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet new file mode 100644 index 00000000..fa2caf0e --- /dev/null +++ b/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet @@ -0,0 +1,136 @@ +{ + values+:: { + clusterVerticalAutoscaler: { + version: '0.8.1', + image: 'gcr.io/google_containers/cpvpa-amd64:v0.8.1', + baseCPU: '1m', + stepCPU: '1m', + baseMemory: '1Mi', + stepMemory: '2Mi', + }, + }, + ksmAutoscaler+: { + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { name: 'ksm-autoscaler' }, + rules: [{ + apiGroups: [''], + resources: ['nodes'], + verbs: ['list', 'watch'], + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { name: 'ksm-autoscaler' }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'ksm-autoscaler', + }, + subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $.values.common.namespace }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'ksm-autoscaler', + }, + subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }], + }, + + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + rules: [ + { + apiGroups: ['extensions'], + resources: ['deployments'], + verbs: ['patch'], + resourceNames: ['kube-state-metrics'], + }, + { + apiGroups: ['apps'], + resources: ['deployments'], + verbs: ['patch'], + resourceNames: ['kube-state-metrics'], + }, + ], + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + }, + + deployment: + local podLabels = { app: 'ksm-autoscaler' }; + local c = { + name: 'ksm-autoscaler', + image: $.values.clusterVerticalAutoscaler.image, + args: [ + '/cpvpa', + '--target=deployment/kube-state-metrics', + '--namespace=' + $.values.common.namespace, + '--logtostderr=true', + '--poll-period-seconds=10', + '--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU + + '","step":"' + $.values.clusterVerticalAutoscaler.stepCPU + + '","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory + + '","step":"' + $.values.clusterVerticalAutoscaler.stepMemory + + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU + + '","step":"' + $.values.clusterVerticalAutoscaler.stepCPU + + '","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory + + '","step":"' + $.values.clusterVerticalAutoscaler.stepMemory + '","nodesPerStep":1}}}}', + ], + resources: { + requests: { cpu: '20m', memory: '10Mi' }, + }, + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + labels: podLabels, + }, + spec: { + replicas: 1, + selector: { matchLabels: podLabels }, + template: { + metadata: { + labels: podLabels, + }, + spec: { + containers: [c], + serviceAccount: 'ksm-autoscaler', + nodeSelector: { 'kubernetes.io/os': 'linux' }, + securityContext: { + runAsNonRoot: true, + runAsUser: 65534, + }, + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet b/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet new file mode 100644 index 00000000..cf06ebdc --- /dev/null +++ b/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet @@ -0,0 +1,39 @@ +local addArgs(args, name, containers) = std.map( + function(c) if c.name == name then + c { + args+: args, + } + else c, + containers, +); + +{ + kubeStateMetrics+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: addArgs( + [||| + --metric-denylist= + kube_.+_created, + kube_.+_metadata_resource_version, + kube_replicaset_metadata_generation, + kube_replicaset_status_observed_generation, + kube_pod_restart_policy, + kube_pod_init_container_status_terminated, + kube_pod_init_container_status_running, + kube_pod_container_status_terminated, + kube_pod_container_status_running, + kube_pod_completion_time, + kube_pod_status_scheduled + |||], + 'kube-state-metrics', + super.containers + ), + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet b/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet new file mode 100644 index 00000000..79c464a6 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet @@ -0,0 +1,20 @@ +// On managed Kubernetes clusters some of the control plane components are not exposed to customers. +// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'main.libsonnet' defaults + +{ + kubernetesControlPlane+: { + serviceMonitorKubeControllerManager:: null, + serviceMonitorKubeScheduler:: null, + } + { + prometheusRule+: { + spec+: { + local g = super.groups, + groups: [ + h + for h in g + if !std.setMember(h.name, ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler']) + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet b/jsonnet/kube-prometheus/addons/node-ports.libsonnet similarity index 95% rename from jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet rename to jsonnet/kube-prometheus/addons/node-ports.libsonnet index b9369011..405a70ce 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet +++ b/jsonnet/kube-prometheus/addons/node-ports.libsonnet @@ -1,6 +1,6 @@ local patch(ports) = { spec+: { - ports+: ports, + ports: ports, type: 'NodePort', }, }; diff --git a/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet b/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet new file mode 100644 index 00000000..61439b59 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet @@ -0,0 +1,264 @@ +local restrictedPodSecurityPolicy = { + apiVersion: 'policy/v1beta1', + kind: 'PodSecurityPolicy', + metadata: { + name: 'kube-prometheus-restricted', + }, + spec: { + privileged: false, + // Required to prevent escalations to root. + allowPrivilegeEscalation: false, + // This is redundant with non-root + disallow privilege escalation, + // but we can provide it for defense in depth. + requiredDropCapabilities: ['ALL'], + // Allow core volume types. + volumes: [ + 'configMap', + 'emptyDir', + 'secret', + // Assume that persistentVolumes set up by the cluster admin are safe to use. + 'persistentVolumeClaim', + ], + hostNetwork: false, + hostIPC: false, + hostPID: false, + runAsUser: { + // Require the container to run without root privileges. + rule: 'MustRunAsNonRoot', + }, + seLinux: { + // This policy assumes the nodes are using AppArmor rather than SELinux. + rule: 'RunAsAny', + }, + supplementalGroups: { + rule: 'MustRunAs', + ranges: [{ + // Forbid adding the root group. + min: 1, + max: 65535, + }], + }, + fsGroup: { + rule: 'MustRunAs', + ranges: [{ + // Forbid adding the root group. + min: 1, + max: 65535, + }], + }, + readOnlyRootFilesystem: false, + }, +}; + +{ + restrictedPodSecurityPolicy: restrictedPodSecurityPolicy, + + alertmanager+: { + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'alertmanager-' + $.values.alertmanager.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.alertmanager.namespace, + }], + }, + }, + + blackboxExporter+: { + clusterRole+: { + rules+: [ + { + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['blackbox-exporter-psp'], + }, + ], + }, + + podSecurityPolicy: + local blackboxExporterPspPrivileged = + if $.blackboxExporter._config.privileged then + { + metadata+: { + name: 'blackbox-exporter-psp', + }, + spec+: { + privileged: true, + allowedCapabilities: ['NET_RAW'], + runAsUser: { + rule: 'RunAsAny', + }, + }, + } + else + { + metadata+: { + name: 'blackbox-exporter-psp', + }, + }; + + restrictedPodSecurityPolicy + blackboxExporterPspPrivileged, + }, + + grafana+: { + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'grafana', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.grafana.serviceAccount.metadata.name, + namespace: $.grafana.serviceAccount.metadata.namespace, + }], + }, + }, + + kubeStateMetrics+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['kube-state-metrics-psp'], + }], + }, + + podSecurityPolicy: restrictedPodSecurityPolicy { + metadata+: { + name: 'kube-state-metrics-psp', + }, + spec+: { + runAsUser: { + rule: 'RunAsAny', + }, + }, + }, + }, + + nodeExporter+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['node-exporter-psp'], + }], + }, + + podSecurityPolicy: restrictedPodSecurityPolicy { + metadata+: { + name: 'node-exporter-psp', + }, + spec+: { + allowedHostPaths+: [ + { + pathPrefix: '/proc', + readOnly: true, + }, + { + pathPrefix: '/sys', + readOnly: true, + }, + { + pathPrefix: '/', + readOnly: true, + }, + ], + hostNetwork: true, + hostPID: true, + hostPorts: [ + { + max: $.nodeExporter._config.port, + min: $.nodeExporter._config.port, + }, + ], + readOnlyRootFilesystem: true, + volumes+: [ + 'hostPath', + ], + }, + }, + }, + + prometheusAdapter+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, + + prometheusOperator+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, + + prometheus+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet b/jsonnet/kube-prometheus/addons/static-etcd.libsonnet similarity index 55% rename from jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet rename to jsonnet/kube-prometheus/addons/static-etcd.libsonnet index 9bc77385..4f11a076 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet +++ b/jsonnet/kube-prometheus/addons/static-etcd.libsonnet @@ -1,7 +1,5 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -(import 'github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet') + { - _config+:: { +(import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + values+:: { etcd: { ips: [], clientCA: null, @@ -11,14 +9,14 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; insecureSkipVerify: null, }, }, - prometheus+:: { + prometheus+: { serviceEtcd: { apiVersion: 'v1', kind: 'Service', metadata: { name: 'etcd', namespace: 'kube-system', - labels: { 'k8s-app': 'etcd' }, + labels: { 'app.kubernetes.io/name': 'etcd' }, }, spec: { ports: [ @@ -28,23 +26,23 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; }, }, endpointsEtcd: { - apiVersion: 'v1', - kind: 'Endpoints', - metadata: { - name: 'etcd', - namespace: 'kube-system', - labels: { 'k8s-app': 'etcd' }, - }, - subsets: [{ - addresses: [ - { ip: etcdIP } - for etcdIP in $._config.etcd.ips - ], - ports: [ - { name: 'metrics', port: 2379, protocol: 'TCP' }, - ], - }], + apiVersion: 'v1', + kind: 'Endpoints', + metadata: { + name: 'etcd', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'etcd' }, }, + subsets: [{ + addresses: [ + { ip: etcdIP } + for etcdIP in $.values.etcd.ips + ], + ports: [ + { name: 'metrics', port: 2379, protocol: 'TCP' }, + ], + }], + }, serviceMonitorEtcd: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -52,11 +50,11 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; name: 'etcd', namespace: 'kube-system', labels: { - 'k8s-app': 'etcd', + 'app.kubernetes.io/name': 'etcd', }, }, spec: { - jobLabel: 'k8s-app', + jobLabel: 'app.kubernetes.io/name', endpoints: [ { port: 'metrics', @@ -67,14 +65,14 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt', keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key', certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt', - [if $._config.etcd.serverName != null then 'serverName']: $._config.etcd.serverName, - [if $._config.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $._config.etcd.insecureSkipVerify, + [if $.values.etcd.serverName != null then 'serverName']: $.values.etcd.serverName, + [if $.values.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $.values.etcd.insecureSkipVerify, }, }, ], selector: { matchLabels: { - 'k8s-app': 'etcd', + 'app.kubernetes.io/name': 'etcd', }, }, }, @@ -86,20 +84,19 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; type: 'Opaque', metadata: { name: 'kube-etcd-client-certs', - namespace: $._config.namespace, + namespace: $.values.common.namespace, }, data: { - 'etcd-client-ca.crt': std.base64($._config.etcd.clientCA), - 'etcd-client.key': std.base64($._config.etcd.clientKey), - 'etcd-client.crt': std.base64($._config.etcd.clientCert), + 'etcd-client-ca.crt': std.base64($.values.etcd.clientCA), + 'etcd-client.key': std.base64($.values.etcd.clientKey), + 'etcd-client.crt': std.base64($.values.etcd.clientCert), }, }, - prometheus+: - { - // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec - spec+: { - secrets+: [$.prometheus.secretEtcdCerts.metadata.name], - }, + prometheus+: { + // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec + spec+: { + secrets+: [$.prometheus.secretEtcdCerts.metadata.name], }, + }, }, } diff --git a/jsonnet/kube-prometheus/addons/strip-limits.libsonnet b/jsonnet/kube-prometheus/addons/strip-limits.libsonnet new file mode 100644 index 00000000..6faec944 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/strip-limits.libsonnet @@ -0,0 +1,48 @@ +// Strips spec.containers[].limits for certain containers +// https://github.com/prometheus-operator/kube-prometheus/issues/72 + +{ + local noLimit(c) = + //if std.objectHas(c, 'resources') && c.name != 'kube-state-metrics' + if c.name != 'kube-state-metrics' + then c { resources+: { limits: {} } } + else c, + + nodeExporter+: { + daemonset+: { + spec+: { + template+: { + spec+: { + containers: std.map(noLimit, super.containers), + }, + }, + }, + }, + }, + kubeStateMetrics+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: std.map(noLimit, super.containers), + }, + }, + }, + }, + }, + prometheusOperator+: { + deployment+: { + spec+: { + template+: { + spec+: { + local addArgs(c) = + if c.name == 'prometheus-operator' + then c { args+: ['--config-reloader-cpu-limit=0', '--config-reloader-memory-limit=0'] } + else c, + containers: std.map(addArgs, super.containers), + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet b/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet new file mode 100644 index 00000000..c0ca940a --- /dev/null +++ b/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet @@ -0,0 +1,134 @@ +[ + { + alert: 'WeaveNetIPAMSplitBrain', + expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.', + description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMUnreachable', + expr: 'weave_ipam_unreachable_percentage > 25', + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMPendingAllocates', + expr: 'sum(weave_ipam_pending_allocates) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of pending allocates is above the threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMPendingClaims', + expr: 'sum(weave_ipam_pending_claims) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of pending claims is above the threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetFastDPFlowsLow', + expr: 'sum(weave_flows) < 15000', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of FastDP flows is below the threshold.', + description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.', + }, + }, + { + alert: 'WeaveNetFastDPFlowsOff', + expr: 'sum(weave_flows == bool 0) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'FastDP flows is zero.', + description: 'actionable: Please find the reason for FastDP flows to be off and fix it.', + }, + }, + { + alert: 'WeaveNetHighConnectionTerminationRate', + expr: 'rate(weave_connection_terminations_total[5m]) > 0.1', + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are getting terminated.', + description: 'actionable: Please find the reason for the high connection termination rate and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsConnecting', + expr: 'sum(weave_connections{state="connecting"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in connecting state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsRetying', + expr: 'sum(weave_connections{state="retrying"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in retrying state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsPending', + expr: 'sum(weave_connections{state="pending"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in pending state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsFailed', + expr: 'sum(weave_connections{state="failed"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in failed state.', + description: 'actionable: Please find the reason and fix it.', + }, + }, +] diff --git a/jsonnet/kube-prometheus/grafana-weave-net-cluster.json b/jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net-cluster.json similarity index 100% rename from jsonnet/kube-prometheus/grafana-weave-net-cluster.json rename to jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net-cluster.json diff --git a/jsonnet/kube-prometheus/grafana-weave-net.json b/jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net.json similarity index 100% rename from jsonnet/kube-prometheus/grafana-weave-net.json rename to jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net.json diff --git a/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet b/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet new file mode 100644 index 00000000..d5cc9ead --- /dev/null +++ b/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet @@ -0,0 +1,73 @@ +{ + prometheus+: { + local p = self, + serviceWeaveNet: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'weave-net', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'weave-net' }, + }, + spec: { + ports: [ + { name: 'weave-net-metrics', targetPort: 6782, port: 6782 }, + ], + selector: { name: 'weave-net' }, + clusterIP: 'None', + }, + }, + serviceMonitorWeaveNet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'weave-net', + labels: { + 'app.kubernetes.io/name': 'weave-net', + }, + namespace: 'monitoring', + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [ + { + port: 'weave-net-metrics', + path: '/metrics', + interval: '15s', + }, + ], + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'weave-net', + }, + }, + }, + }, + prometheusRuleWeaveNet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.mixin.ruleLabels, + name: 'weave-net-rules', + namespace: p._config.namespace, + }, + spec: { + groups: [{ + name: 'weave-net', + rules: (import './alerts.libsonnet'), + }], + }, + }, + mixin+:: { + grafanaDashboards+:: { + 'weave-net.json': (import './grafana-weave-net.json'), + 'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'), + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/windows.libsonnet b/jsonnet/kube-prometheus/addons/windows.libsonnet new file mode 100644 index 00000000..d97e8ffe --- /dev/null +++ b/jsonnet/kube-prometheus/addons/windows.libsonnet @@ -0,0 +1,70 @@ +local windowsdashboards = import 'github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet'; +local windowsrules = import 'github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet'; + +{ + values+:: { + // This needs to follow prometheus naming convention and not prometheus-operator one + windowsScrapeConfig+:: { + job_name: 'windows-exporter', + static_configs: [ + { + targets: [error 'must provide targets array'], + }, + ], + relabel_configs: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + source_labels: [ + '__meta_kubernetes_endpoint_address_target_name', + ], + target_label: 'instance', + }, + ], + }, + + grafana+:: { + dashboards+:: windowsdashboards { + _config: $.kubernetesControlPlane.mixin._config { + wmiExporterSelector: 'job="' + $.values.windowsScrapeConfig.job_name + '"', + }, + }.grafanaDashboards, + }, + }, + kubernetesControlPlane+: { + mixin+:: { + prometheusRules+:: { + groups+: windowsrules { + _config: $.kubernetesControlPlane.mixin._config { + wmiExporterSelector: 'job="' + $.values.windowsScrapeConfig.job_name + '"', + }, + }.prometheusRules.groups, + }, + }, + }, + prometheus+: { + local p = self, + local sc = [$.values.windowsScrapeConfig], + prometheus+: { + spec+: { + additionalScrapeConfigs: { + name: 'prometheus-' + p._config.name + '-additional-scrape-config', + key: 'prometheus-additional.yaml', + }, + }, + + }, + windowsConfig: { + apiVersion: 'v1', + kind: 'Secret', + metadata: { + name: 'prometheus-' + p._config.name + '-additional-scrape-config', + namespace: p._config.namespace, + }, + stringData: { + 'prometheus-additional.yaml': std.manifestYamlDoc(sc), + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet deleted file mode 100644 index 677c7889..00000000 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ /dev/null @@ -1,149 +0,0 @@ -{ - _config+:: { - namespace: 'default', - - versions+:: { - alertmanager: 'v0.21.0', - }, - - imageRepos+:: { - alertmanager: 'quay.io/prometheus/alertmanager', - }, - - alertmanager+:: { - name: 'main', - config: { - global: { - resolve_timeout: '5m', - }, - inhibit_rules: [{ - source_match: { - severity: 'critical', - }, - target_match_re: { - severity: 'warning|info', - }, - equal: ['namespace', 'alertname'], - }, { - source_match: { - severity: 'warning', - }, - target_match_re: { - severity: 'info', - }, - equal: ['namespace', 'alertname'], - }], - route: { - group_by: ['namespace'], - group_wait: '30s', - group_interval: '5m', - repeat_interval: '12h', - receiver: 'Default', - routes: [ - { receiver: 'Watchdog', match: { alertname: 'Watchdog' } }, - { receiver: 'Critical', match: { severity: 'critical' } }, - ], - }, - receivers: [ - { name: 'Default' }, - { name: 'Watchdog' }, - { name: 'Critical' }, - ], - }, - replicas: 3, - }, - }, - - alertmanager+:: { - secret: { - apiVersion: 'v1', - kind: 'Secret', - type: 'Opaque', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - }, - stringData: { - 'alertmanager.yaml': if std.type($._config.alertmanager.config) == 'object' - then - std.manifestYamlDoc($._config.alertmanager.config) - else - $._config.alertmanager.config, - }, - }, - - serviceAccount: { - apiVersion: 'v1', - kind: 'ServiceAccount', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - }, - }, - - service: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - labels: { alertmanager: $._config.alertmanager.name }, - }, - spec: { - ports: [ - { name: 'web', targetPort: 'web', port: 9093 }, - ], - selector: { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, - sessionAffinity: 'ClientIP', - }, - }, - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'alertmanager', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'alertmanager', - }, - }, - spec: { - selector: { - matchLabels: { - alertmanager: $._config.alertmanager.name, - }, - }, - endpoints: [ - { port: 'web', interval: '30s' }, - ], - }, - }, - - alertmanager: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'Alertmanager', - metadata: { - name: $._config.alertmanager.name, - namespace: $._config.namespace, - labels: { - alertmanager: $._config.alertmanager.name, - }, - }, - spec: { - replicas: $._config.alertmanager.replicas, - version: $._config.versions.alertmanager, - image: $._config.imageRepos.alertmanager + ':' + $._config.versions.alertmanager, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - serviceAccountName: 'alertmanager-' + $._config.alertmanager.name, - securityContext: { - runAsUser: 1000, - runAsNonRoot: true, - fsGroup: 2000, - }, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet deleted file mode 100644 index bcabf4d9..00000000 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ /dev/null @@ -1,57 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'alertmanager.rules', - rules: [ - { - alert: 'AlertmanagerConfigInconsistent', - annotations: { - message: ||| - The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. - {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} - Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" - {{ end }} - |||, - }, - expr: ||| - count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - }, - { - alert: 'AlertmanagerFailedReload', - annotations: { - message: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", - }, - expr: ||| - alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - }, - { - alert: 'AlertmanagerMembersInconsistent', - annotations: { - message: 'Alertmanager has not found all other members of the cluster.', - }, - expr: ||| - alertmanager_cluster_members{%(alertmanagerSelector)s} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s}) - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - }, - ], - }, - ], - }, -} diff --git a/jsonnet/kube-prometheus/alerts/tests.yaml b/jsonnet/kube-prometheus/alerts/tests.yaml deleted file mode 100644 index 532bb895..00000000 --- a/jsonnet/kube-prometheus/alerts/tests.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# TODO(metalmatze): This file is temporarily saved here for later reference -# until we find out how to integrate the tests into our jsonnet stack. - -rule_files: - - rules.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' diff --git a/jsonnet/kube-prometheus/components/alertmanager.libsonnet b/jsonnet/kube-prometheus/components/alertmanager.libsonnet new file mode 100644 index 00000000..bda39ec4 --- /dev/null +++ b/jsonnet/kube-prometheus/components/alertmanager.libsonnet @@ -0,0 +1,213 @@ +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + image: error 'must provide image', + version: error 'must provide version', + resources: { + limits: { cpu: '100m', memory: '100Mi' }, + requests: { cpu: '4m', memory: '100Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': 'alertmanager', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'alert-router', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + name: error 'must provide name', + config: { + global: { + resolve_timeout: '5m', + }, + inhibit_rules: [{ + source_match: { + severity: 'critical', + }, + target_match_re: { + severity: 'warning|info', + }, + equal: ['namespace', 'alertname'], + }, { + source_match: { + severity: 'warning', + }, + target_match_re: { + severity: 'info', + }, + equal: ['namespace', 'alertname'], + }], + route: { + group_by: ['namespace'], + group_wait: '30s', + group_interval: '5m', + repeat_interval: '12h', + receiver: 'Default', + routes: [ + { receiver: 'Watchdog', match: { alertname: 'Watchdog' } }, + { receiver: 'Critical', match: { severity: 'critical' } }, + ], + }, + receivers: [ + { name: 'Default' }, + { name: 'Watchdog' }, + { name: 'Critical' }, + ], + }, + replicas: 3, + mixin: { + ruleLabels: {}, + _config: { + alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}', + alertmanagerClusterLabels: 'namespace,service', + alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/alertmanager/%s', + }, + }, +}; + + +function(params) { + local am = self, + _config:: defaults + params, + // Safety check + assert std.isObject(am._config.resources), + assert std.isObject(am._config.mixin._config), + + mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: am._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: am._config.commonLabels + am._config.mixin.ruleLabels, + name: 'alertmanager-' + am._config.name + '-rules', + namespace: am._config.namespace, + }, + spec: { + local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + secret: { + apiVersion: 'v1', + kind: 'Secret', + type: 'Opaque', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + stringData: { + 'alertmanager.yaml': if std.type(am._config.config) == 'object' + then + std.manifestYamlDoc(am._config.config) + else + am._config.config, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + spec: { + ports: [ + { name: 'web', targetPort: 'web', port: 9093 }, + ], + selector: { + app: 'alertmanager', + alertmanager: am._config.name, + } + am._config.selectorLabels, + sessionAffinity: 'ClientIP', + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'alertmanager', + namespace: am._config.namespace, + labels: am._config.commonLabels, + }, + spec: { + selector: { + matchLabels: { + alertmanager: am._config.name, + } + am._config.selectorLabels, + }, + endpoints: [ + { port: 'web', interval: '30s' }, + ], + }, + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: am._config.commonLabels, + }, + spec: { + maxUnavailable: 1, + selector: { + matchLabels: { + alertmanager: am._config.name, + } + am._config.selectorLabels, + }, + }, + }, + + alertmanager: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Alertmanager', + metadata: { + name: am._config.name, + namespace: am._config.namespace, + labels: { + alertmanager: am._config.name, + } + am._config.commonLabels, + }, + spec: { + replicas: am._config.replicas, + version: am._config.version, + image: am._config.image, + podMetadata: { + labels: am._config.commonLabels, + }, + resources: am._config.resources, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + serviceAccountName: 'alertmanager-' + am._config.name, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet b/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet new file mode 100644 index 00000000..cb4dcd9a --- /dev/null +++ b/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet @@ -0,0 +1,290 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + resources: { + requests: { cpu: '10m', memory: '20Mi' }, + limits: { cpu: '20m', memory: '40Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': 'blackbox-exporter', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + configmapReloaderImage: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + + port: 9115, + internalPort: 19115, + replicas: 1, + modules: { + http_2xx: { + prober: 'http', + http: { + preferred_ip_protocol: 'ip4', + }, + }, + http_post_2xx: { + prober: 'http', + http: { + method: 'POST', + preferred_ip_protocol: 'ip4', + }, + }, + tcp_connect: { + prober: 'tcp', + tcp: { + preferred_ip_protocol: 'ip4', + }, + }, + pop3s_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { expect: '^+OK' }, + ], + tls: true, + tls_config: { + insecure_skip_verify: false, + }, + preferred_ip_protocol: 'ip4', + }, + }, + ssh_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { expect: '^SSH-2.0-' }, + ], + preferred_ip_protocol: 'ip4', + }, + }, + irc_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { send: 'NICK prober' }, + { send: 'USER prober prober prober :prober' }, + { expect: 'PING :([^ ]+)', send: 'PONG ${1}' }, + { expect: '^:[^ ]+ 001' }, + ], + preferred_ip_protocol: 'ip4', + }, + }, + }, + privileged: + local icmpModules = [self.modules[m] for m in std.objectFields(self.modules) if self.modules[m].prober == 'icmp']; + std.length(icmpModules) > 0, +}; + + +function(params) { + local bb = self, + _config:: defaults + params, + // Safety check + assert std.isObject(bb._config.resources), + + configuration: { + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: { + name: 'blackbox-exporter-configuration', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + data: { + 'config.yml': std.manifestYamlDoc({ modules: bb._config.modules }), + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + }, + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'blackbox-exporter', + }, + rules: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'blackbox-exporter', + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'blackbox-exporter', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'blackbox-exporter', + namespace: bb._config.namespace, + }], + }, + + deployment: + local blackboxExporter = { + name: 'blackbox-exporter', + image: bb._config.image, + args: [ + '--config.file=/etc/blackbox_exporter/config.yml', + '--web.listen-address=:%d' % bb._config.internalPort, + ], + ports: [{ + name: 'http', + containerPort: bb._config.internalPort, + }], + resources: bb._config.resources, + securityContext: if bb._config.privileged then { + runAsNonRoot: false, + capabilities: { drop: ['ALL'], add: ['NET_RAW'] }, + } else { + runAsNonRoot: true, + runAsUser: 65534, + }, + volumeMounts: [{ + mountPath: '/etc/blackbox_exporter/', + name: 'config', + readOnly: true, + }], + }; + + local reloader = { + name: 'module-configmap-reloader', + image: bb._config.configmapReloaderImage, + args: [ + '--webhook-url=http://localhost:%d/-/reload' % bb._config.internalPort, + '--volume-dir=/etc/blackbox_exporter/', + ], + resources: bb._config.resources, + securityContext: { runAsNonRoot: true, runAsUser: 65534 }, + terminationMessagePath: '/dev/termination-log', + terminationMessagePolicy: 'FallbackToLogsOnError', + volumeMounts: [{ + mountPath: '/etc/blackbox_exporter/', + name: 'config', + readOnly: true, + }], + }; + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + upstream: 'http://127.0.0.1:' + bb._config.internalPort + '/', + resources: bb._config.resources, + secureListenAddress: ':' + bb._config.port, + ports: [ + { name: 'https', containerPort: bb._config.port }, + ], + image: bb._config.kubeRbacProxyImage, + }); + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + replicas: bb._config.replicas, + selector: { matchLabels: bb._config.selectorLabels }, + template: { + metadata: { + labels: bb._config.commonLabels, + annotations: { + 'kubectl.kubernetes.io/default-container': blackboxExporter.name, + }, + }, + spec: { + containers: [blackboxExporter, reloader, kubeRbacProxy], + nodeSelector: { 'kubernetes.io/os': 'linux' }, + serviceAccountName: 'blackbox-exporter', + volumes: [{ + name: 'config', + configMap: { name: 'blackbox-exporter-configuration' }, + }], + }, + }, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + ports: [{ + name: 'https', + port: bb._config.port, + targetPort: 'https', + }, { + name: 'probe', + port: bb._config.internalPort, + targetPort: 'http', + }], + selector: bb._config.selectorLabels, + }, + }, + + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + endpoints: [{ + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + interval: '30s', + path: '/metrics', + port: 'https', + scheme: 'https', + tlsConfig: { + insecureSkipVerify: true, + }, + }], + selector: { + matchLabels: bb._config.selectorLabels, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/grafana.libsonnet b/jsonnet/kube-prometheus/components/grafana.libsonnet new file mode 100644 index 00000000..cf2c7ea2 --- /dev/null +++ b/jsonnet/kube-prometheus/components/grafana.libsonnet @@ -0,0 +1,105 @@ +local defaults = { + local defaults = self, + name: 'grafana', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { cpu: '100m', memory: '100Mi' }, + limits: { cpu: '200m', memory: '200Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + prometheusName: error 'must provide prometheus name', + dashboards: {}, + // TODO(paulfantom): expose those to have a stable API. After kubernetes-grafana refactor those could probably be removed. + rawDashboards: {}, + folderDashboards: {}, + containers: [], + datasources: [], + config: {}, + plugins: [], + env: [], +}; + +function(params) { + local g = self, + _config:: defaults + params, + // Safety check + assert std.isObject(g._config.resources), + + local glib = (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + { + _config+:: { + namespace: g._config.namespace, + versions+:: { + grafana: g._config.version, + }, + imageRepos+:: { + grafana: std.split(g._config.image, ':')[0], + }, + prometheus+:: { + name: g._config.prometheusName, + }, + grafana+:: { + labels: g._config.commonLabels, + dashboards: g._config.dashboards, + resources: g._config.resources, + rawDashboards: g._config.rawDashboards, + folderDashboards: g._config.folderDashboards, + containers: g._config.containers, + config+: g._config.config, + plugins+: g._config.plugins, + env: g._config.env, + } + ( + // Conditionally overwrite default setting. + if std.length(g._config.datasources) > 0 then + { datasources: g._config.datasources } + else {} + ), + }, + }, + + config: glib.grafana.config, + service: glib.grafana.service, + serviceAccount: glib.grafana.serviceAccount, + deployment: glib.grafana.deployment, + dashboardDatasources: glib.grafana.dashboardDatasources, + dashboardSources: glib.grafana.dashboardSources, + + dashboardDefinitions: if std.length(g._config.dashboards) > 0 || + std.length(g._config.rawDashboards) > 0 || + std.length(g._config.folderDashboards) > 0 then { + apiVersion: 'v1', + kind: 'ConfigMapList', + items: glib.grafana.dashboardDefinitions, + }, + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'grafana', + namespace: g._config.namespace, + labels: g._config.commonLabels, + }, + spec: { + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'grafana', + }, + }, + endpoints: [{ + port: 'http', + interval: '15s', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet new file mode 100644 index 00000000..08cdfb20 --- /dev/null +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -0,0 +1,319 @@ +local relabelings = import '../addons/dropping-deprecated-metrics-relabelings.libsonnet'; + +local defaults = { + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', + kubeletSelector: 'job="kubelet", metrics_path="/metrics"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s', + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + }, + }, + kubeProxy: false, +}; + +function(params) { + local k8s = self, + _config:: defaults + params, + + mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') { + _config+:: k8s._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: k8s._config.commonLabels + k8s._config.mixin.ruleLabels, + name: 'kubernetes-monitoring-rules', + namespace: k8s._config.namespace, + }, + spec: { + local r = if std.objectHasAll(k8s.mixin, 'prometheusRules') then k8s.mixin.prometheusRules.groups else {}, + local a = if std.objectHasAll(k8s.mixin, 'prometheusAlerts') then k8s.mixin.prometheusAlerts.groups else {}, + groups: a + r, + }, + }, + + serviceMonitorKubeScheduler: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-scheduler', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kube-scheduler' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [{ + port: 'https-metrics', + interval: '30s', + scheme: 'https', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { insecureSkipVerify: true }, + }], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kube-scheduler' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorKubelet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kubelet', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kubelet' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [ + { + port: 'https-metrics', + scheme: 'https', + interval: '30s', + honorLabels: true, + tlsConfig: { insecureSkipVerify: true }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: relabelings, + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/cadvisor', + interval: '30s', + honorLabels: true, + honorTimestamps: false, + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + metricRelabelings: [ + // Drop a bunch of metrics which are disabled but still sent, see + // https://github.com/google/cadvisor/issues/1925. + { + sourceLabels: ['__name__'], + regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', + action: 'drop', + }, + // Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation) + { + sourceLabels: ['__name__', 'pod', 'namespace'], + action: 'drop', + regex: '(' + std.join('|', + [ + 'container_fs_.*', // add filesystem read/write data (nodes*disks*services*4) + 'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5) + 'container_blkio_device_usage_total', // useful for containers, but not for system services (nodes*disks*services*operations*2) + 'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services) + 'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services) + 'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services) + 'container_threads', // used threads in cgroup. Usually not important for system services (nodes*services) + 'container_start_time_seconds', // container start. Possibly not needed for system services (nodes*services) + 'container_last_seen', // not needed as system services are always running (nodes*services) + ]) + ');;', + }, + ], + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/probes', + interval: '30s', + honorLabels: true, + tlsConfig: { insecureSkipVerify: true }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + }, + ], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kubelet' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorKubeControllerManager: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-controller-manager', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kube-controller-manager' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [{ + port: 'https-metrics', + interval: '30s', + scheme: 'https', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + metricRelabelings: relabelings + [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|request|server).*', + action: 'drop', + }, + ], + }], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kube-controller-manager' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorApiserver: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-apiserver', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'apiserver' }, + }, + spec: { + jobLabel: 'component', + selector: { + matchLabels: { + component: 'apiserver', + provider: 'kubernetes', + }, + }, + namespaceSelector: { + matchNames: ['default'], + }, + endpoints: [{ + port: 'https', + interval: '30s', + scheme: 'https', + tlsConfig: { + caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', + serverName: 'kubernetes', + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: relabelings + [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|server).*', + action: 'drop', + }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_controller_admission_latencies_seconds_.*', + action: 'drop', + }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_step_admission_latencies_seconds_.*', + action: 'drop', + }, + { + sourceLabels: ['__name__', 'le'], + regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)', + action: 'drop', + }, + ], + }], + }, + }, + + [if (defaults + params).kubeProxy then 'podMonitorKubeProxy']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PodMonitor', + metadata: { + labels: { + 'k8s-app': 'kube-proxy', + }, + name: 'kube-proxy', + namespace: k8s._config.namespace, + }, + spec: { + jobLabel: 'k8s-app', + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + selector: { + matchLabels: { + 'k8s-app': 'kube-proxy', + }, + }, + podMetricsEndpoints: [{ + honorLabels: true, + targetPort: 10249, + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + }], + }, + }, + + + serviceMonitorCoreDNS: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'coredns', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'coredns' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: { 'k8s-app': 'kube-dns' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + endpoints: [{ + port: 'metrics', + interval: '15s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }], + }, + }, + + +} diff --git a/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet b/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet new file mode 100644 index 00000000..534a2eed --- /dev/null +++ b/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet @@ -0,0 +1,63 @@ +local defaults = { + namespace: error 'must provide namespace', + image: error 'must provide image', + ports: error 'must provide ports', + secureListenAddress: error 'must provide secureListenAddress', + upstream: error 'must provide upstream', + resources: { + requests: { cpu: '10m', memory: '20Mi' }, + limits: { cpu: '20m', memory: '40Mi' }, + }, + tlsCipherSuites: [ + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + + // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + // 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + // 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + + // disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go + + 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + ], +}; + + +function(params) { + local krp = self, + _config:: defaults + params, + // Safety check + assert std.isObject(krp._config.resources), + + name: krp._config.name, + image: krp._config.image, + args: [ + '--logtostderr', + '--secure-listen-address=' + krp._config.secureListenAddress, + '--tls-cipher-suites=' + std.join(',', krp._config.tlsCipherSuites), + '--upstream=' + krp._config.upstream, + ], + resources: krp._config.resources, + ports: krp._config.ports, + securityContext: { + runAsUser: 65532, + runAsGroup: 65532, + runAsNonRoot: true, + }, +} diff --git a/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet new file mode 100644 index 00000000..b39a605c --- /dev/null +++ b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet @@ -0,0 +1,176 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + name: 'kube-state-metrics', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + resources: { + requests: { cpu: '10m', memory: '190Mi' }, + limits: { cpu: '100m', memory: '250Mi' }, + }, + + kubeRbacProxyMain: { + resources+: { + limits+: { cpu: '40m' }, + requests+: { cpu: '20m' }, + }, + }, + scrapeInterval: '30s', + scrapeTimeout: '30s', + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: {}, + _config: { + kubeStateMetricsSelector: 'job="' + defaults.name + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/%s', + }, + }, +}; + +function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') { + local ksm = self, + _config:: defaults + params, + // Safety check + assert std.isObject(ksm._config.resources), + assert std.isObject(ksm._config.mixin._config), + + name:: ksm._config.name, + namespace:: ksm._config.namespace, + version:: ksm._config.version, + image:: ksm._config.image, + commonLabels:: ksm._config.commonLabels, + podLabels:: ksm._config.selectorLabels, + + mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: ksm._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ksm._config.commonLabels + ksm._config.mixin.ruleLabels, + name: ksm._config.name + '-rules', + namespace: ksm._config.namespace, + }, + spec: { + local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + service+: { + spec+: { + ports: [ + { + name: 'https-main', + port: 8443, + targetPort: 'https-main', + }, + { + name: 'https-self', + port: 9443, + targetPort: 'https-self', + }, + ], + }, + }, + + local kubeRbacProxyMain = krp(ksm._config.kubeRbacProxyMain { + name: 'kube-rbac-proxy-main', + upstream: 'http://127.0.0.1:8081/', + secureListenAddress: ':8443', + ports: [ + { name: 'https-main', containerPort: 8443 }, + ], + image: ksm._config.kubeRbacProxyImage, + }), + + local kubeRbacProxySelf = krp({ + name: 'kube-rbac-proxy-self', + upstream: 'http://127.0.0.1:8082/', + secureListenAddress: ':9443', + ports: [ + { name: 'https-self', containerPort: 9443 }, + ], + image: ksm._config.kubeRbacProxyImage, + }), + + deployment+: { + spec+: { + template+: { + metadata+: { + annotations+: { + 'kubectl.kubernetes.io/default-container': 'kube-state-metrics', + }, + }, + spec+: { + containers: std.map(function(c) c { + ports:: null, + livenessProbe:: null, + readinessProbe:: null, + args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], + resources: ksm._config.resources, + }, super.containers) + [kubeRbacProxyMain, kubeRbacProxySelf], + }, + }, + }, + }, + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: ksm.name, + namespace: ksm._config.namespace, + labels: ksm._config.commonLabels, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { matchLabels: ksm._config.selectorLabels }, + endpoints: [ + { + port: 'https-main', + scheme: 'https', + interval: ksm._config.scrapeInterval, + scrapeTimeout: ksm._config.scrapeTimeout, + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [ + { + regex: '(pod|service|endpoint|namespace)', + action: 'labeldrop', + }, + ], + tlsConfig: { + insecureSkipVerify: true, + }, + }, + { + port: 'https-self', + scheme: 'https', + interval: ksm._config.scrapeInterval, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, +} diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet similarity index 61% rename from jsonnet/kube-prometheus/alerts/alerts.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet index adc46130..8733ae44 100644 --- a/jsonnet/kube-prometheus/alerts/alerts.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet @@ -1,3 +1,2 @@ -(import 'alertmanager.libsonnet') + (import 'general.libsonnet') + (import 'node.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet similarity index 75% rename from jsonnet/kube-prometheus/alerts/general.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet index 16f3e39c..cd5c7165 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet @@ -7,7 +7,8 @@ { alert: 'TargetDown', annotations: { - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.', + summary: 'One or more targets are unreachable.', + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.', }, expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10', 'for': '10m', @@ -18,7 +19,8 @@ { alert: 'Watchdog', annotations: { - message: ||| + summary: 'An alert that should always be firing to certify that Alertmanager is working properly.', + description: ||| This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet similarity index 68% rename from jsonnet/kube-prometheus/alerts/node.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet index d1b9caf6..5bad9bf8 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet @@ -7,7 +7,8 @@ { alert: 'NodeNetworkInterfaceFlapping', annotations: { - message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + summary: 'Network interface is often changing its status', + description: 'Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', }, expr: ||| changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2 diff --git a/jsonnet/kube-prometheus/components/mixin/custom.libsonnet b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet new file mode 100644 index 00000000..c8f43b03 --- /dev/null +++ b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet @@ -0,0 +1,44 @@ +local defaults = { + name: 'kube-prometheus', + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="node-exporter"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/general/%s', + }, + }, +}; + +function(params) { + local m = self, + _config:: defaults + params, + + local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'), + + mixin:: alertsandrules + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: m._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m._config.commonLabels + m._config.mixin.ruleLabels, + name: m._config.name + '-rules', + namespace: m._config.namespace, + }, + spec: { + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, +} diff --git a/jsonnet/kube-prometheus/rules/general.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/general.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/general.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/general.libsonnet diff --git a/jsonnet/kube-prometheus/rules/node-rules.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet similarity index 81% rename from jsonnet/kube-prometheus/rules/node-rules.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet index acd05424..8cdee479 100644 --- a/jsonnet/kube-prometheus/rules/node-rules.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet @@ -5,7 +5,7 @@ name: 'kube-prometheus-node-recording.rules', rules: [ { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)', record: 'instance:node_cpu:rate:sum', }, { @@ -17,11 +17,11 @@ record: 'instance:node_network_transmit_bytes:rate:sum', }, { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)', record: 'instance:node_cpu:ratio', }, { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))', record: 'cluster:node_cpu:sum_rate5m', }, { diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/rules.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/rules.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/rules.libsonnet diff --git a/jsonnet/kube-prometheus/components/node-exporter.libsonnet b/jsonnet/kube-prometheus/components/node-exporter.libsonnet new file mode 100644 index 00000000..1452174a --- /dev/null +++ b/jsonnet/kube-prometheus/components/node-exporter.libsonnet @@ -0,0 +1,259 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + name: 'node-exporter', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + resources: { + requests: { cpu: '102m', memory: '180Mi' }, + limits: { cpu: '250m', memory: '180Mi' }, + }, + listenAddress: '127.0.0.1', + port: 9100, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="' + defaults.name + '"', + // Adjust NodeFilesystemSpaceFillingUp warning and critical thresholds according to the following default kubelet + // GC values, + // imageGCLowThresholdPercent: 80 + // imageGCHighThresholdPercent: 85 + // See https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/ for more details. + fsSpaceFillingUpWarningThreshold: 20, + fsSpaceFillingUpCriticalThreshold: 15, + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/node/%s', + }, + }, +}; + + +function(params) { + local ne = self, + _config:: defaults + params, + // Safety check + assert std.isObject(ne._config.resources), + assert std.isObject(ne._config.mixin._config), + + mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: ne._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ne._config.commonLabels + ne._config.mixin.ruleLabels, + name: ne._config.name + '-rules', + namespace: ne._config.namespace, + }, + spec: { + local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: ne._config.name, + labels: ne._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: ne._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: ne._config.name, + namespace: ne._config.namespace, + }], + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: ne._config.name, + labels: ne._config.commonLabels, + }, + rules: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + ports: [ + { name: 'https', targetPort: 'https', port: ne._config.port }, + ], + selector: ne._config.selectorLabels, + clusterIP: 'None', + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: ne._config.selectorLabels, + }, + endpoints: [{ + port: 'https', + scheme: 'https', + interval: '15s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + tlsConfig: { + insecureSkipVerify: true, + }, + }], + }, + }, + + daemonset: + local nodeExporter = { + name: ne._config.name, + image: ne._config.image, + args: [ + '--web.listen-address=' + std.join(':', [ne._config.listenAddress, std.toString(ne._config.port)]), + '--path.sysfs=/host/sys', + '--path.rootfs=/host/root', + '--no-collector.wifi', + '--no-collector.hwmon', + '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)', + // NOTE: ignore veth network interface associated with containers. + // OVN renames veth.* to @if where X is /sys/class/net//ifindex + // thus [a-z0-9] regex below + '--collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$', + '--collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$', + ], + volumeMounts: [ + { name: 'sys', mountPath: '/host/sys', mountPropagation: 'HostToContainer', readOnly: true }, + { name: 'root', mountPath: '/host/root', mountPropagation: 'HostToContainer', readOnly: true }, + ], + resources: ne._config.resources, + }; + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + //image: krpImage, + upstream: 'http://127.0.0.1:' + ne._config.port + '/', + secureListenAddress: '[$(IP)]:' + ne._config.port, + // Keep `hostPort` here, rather than in the node-exporter container + // because Kubernetes mandates that if you define a `hostPort` then + // `containerPort` must match. In our case, we are splitting the + // host port and container port between the two containers. + // We'll keep the port specification here so that the named port + // used by the service is tied to the proxy container. We *could* + // forgo declaring the host port, however it is important to declare + // it so that the scheduler can decide if the pod is schedulable. + ports: [ + { name: 'https', containerPort: ne._config.port, hostPort: ne._config.port }, + ], + image: ne._config.kubeRbacProxyImage, + }) + { + env: [ + { name: 'IP', valueFrom: { fieldRef: { fieldPath: 'status.podIP' } } }, + ], + }; + + { + apiVersion: 'apps/v1', + kind: 'DaemonSet', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + selector: { matchLabels: ne._config.selectorLabels }, + updateStrategy: { + type: 'RollingUpdate', + rollingUpdate: { maxUnavailable: '10%' }, + }, + template: { + metadata: { labels: ne._config.commonLabels }, + spec: { + nodeSelector: { 'kubernetes.io/os': 'linux' }, + tolerations: [{ + operator: 'Exists', + }], + containers: [nodeExporter, kubeRbacProxy], + volumes: [ + { name: 'sys', hostPath: { path: '/sys' } }, + { name: 'root', hostPath: { path: '/' } }, + ], + serviceAccountName: ne._config.name, + securityContext: { + runAsUser: 65534, + runAsNonRoot: true, + }, + hostPID: true, + hostNetwork: true, + }, + }, + }, + }, + + +} diff --git a/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet new file mode 100644 index 00000000..86d0475a --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet @@ -0,0 +1,380 @@ +local defaults = { + local defaults = self, + name: 'prometheus-adapter', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { cpu: '102m', memory: '180Mi' }, + limits: { cpu: '250m', memory: '180Mi' }, + }, + replicas: 2, + listenAddress: '127.0.0.1', + port: 9100, + commonLabels:: { + 'app.kubernetes.io/name': 'prometheus-adapter', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'metrics-adapter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + // Default range intervals are equal to 4 times the default scrape interval. + // This is done in order to follow Prometheus rule of thumb with irate(). + rangeIntervals: { + kubelet: '4m', + nodeExporter: '4m', + windowsExporter: '4m', + }, + + prometheusURL: error 'must provide prometheusURL', + config: { + resourceRules: { + cpu: { + containerQuery: ||| + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[%(kubelet)s] + ) + ) + ||| % $.rangeIntervals, + nodeQuery: ||| + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[%(nodeExporter)s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[%(windowsExporter)s] + ) + ) + ||| % $.rangeIntervals, + resources: { + overrides: { + node: { resource: 'node' }, + namespace: { resource: 'namespace' }, + pod: { resource: 'pod' }, + }, + }, + containerLabel: 'container', + }, + memory: { + containerQuery: ||| + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + |||, + nodeQuery: ||| + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) + |||, + resources: { + overrides: { + instance: { resource: 'node' }, + namespace: { resource: 'namespace' }, + pod: { resource: 'pod' }, + }, + }, + containerLabel: 'container', + }, + window: '5m', + }, + }, + tlsCipherSuites: [ + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', + 'TLS_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_RSA_WITH_AES_128_CBC_SHA', + 'TLS_RSA_WITH_AES_256_CBC_SHA', + ], +}; + +function(params) { + local pa = self, + _config:: defaults + params, + // Safety check + assert std.isObject(pa._config.resources), + + apiService: { + apiVersion: 'apiregistration.k8s.io/v1', + kind: 'APIService', + metadata: { + name: 'v1beta1.metrics.k8s.io', + labels: pa._config.commonLabels, + }, + spec: { + service: { + name: $.service.metadata.name, + namespace: pa._config.namespace, + }, + group: 'metrics.k8s.io', + version: 'v1beta1', + insecureSkipTLSVerify: true, + groupPriorityMinimum: 100, + versionPriority: 100, + }, + }, + + configMap: { + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: { + name: 'adapter-config', + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + data: { 'config.yaml': std.manifestYamlDoc(pa._config.config) }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + selector: { + matchLabels: pa._config.selectorLabels, + }, + endpoints: [ + { + port: 'https', + interval: '30s', + scheme: 'https', + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + ports: [ + { name: 'https', targetPort: 6443, port: 443 }, + ], + selector: pa._config.selectorLabels, + }, + }, + + deployment: + local c = { + name: pa._config.name, + image: pa._config.image, + args: [ + '--cert-dir=/var/run/serving-cert', + '--config=/etc/adapter/config.yaml', + '--logtostderr=true', + '--metrics-relist-interval=1m', + '--prometheus-url=' + pa._config.prometheusURL, + '--secure-port=6443', + '--tls-cipher-suites=' + std.join(',', pa._config.tlsCipherSuites), + ], + resources: pa._config.resources, + ports: [{ containerPort: 6443 }], + volumeMounts: [ + { name: 'tmpfs', mountPath: '/tmp', readOnly: false }, + { name: 'volume-serving-cert', mountPath: '/var/run/serving-cert', readOnly: false }, + { name: 'config', mountPath: '/etc/adapter', readOnly: false }, + ], + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + replicas: pa._config.replicas, + selector: { matchLabels: pa._config.selectorLabels }, + strategy: { + rollingUpdate: { + maxSurge: 1, + maxUnavailable: 1, + }, + }, + template: { + metadata: { labels: pa._config.commonLabels }, + spec: { + containers: [c], + serviceAccountName: $.serviceAccount.metadata.name, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + volumes: [ + { name: 'tmpfs', emptyDir: {} }, + { name: 'volume-serving-cert', emptyDir: {} }, + { name: 'config', configMap: { name: 'adapter-config' } }, + ], + }, + }, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: pa._config.name, + labels: pa._config.commonLabels, + }, + rules: [{ + apiGroups: [''], + resources: ['nodes', 'namespaces', 'pods', 'services'], + verbs: ['get', 'list', 'watch'], + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: pa._config.name, + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: $.clusterRole.metadata.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + clusterRoleBindingDelegator: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'resource-metrics:system:auth-delegator', + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'system:auth-delegator', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + clusterRoleServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'resource-metrics-server-resources', + labels: pa._config.commonLabels, + }, + rules: [{ + apiGroups: ['metrics.k8s.io'], + resources: ['*'], + verbs: ['*'], + }], + }, + + clusterRoleAggregatedMetricsReader: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'system:aggregated-metrics-reader', + labels: { + 'rbac.authorization.k8s.io/aggregate-to-admin': 'true', + 'rbac.authorization.k8s.io/aggregate-to-edit': 'true', + 'rbac.authorization.k8s.io/aggregate-to-view': 'true', + } + pa._config.commonLabels, + }, + rules: [{ + apiGroups: ['metrics.k8s.io'], + resources: ['pods', 'nodes'], + verbs: ['get', 'list', 'watch'], + }], + }, + + roleBindingAuthReader: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'resource-metrics-auth-reader', + namespace: 'kube-system', + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'extension-apiserver-authentication-reader', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + minAvailable: 1, + selector: { + matchLabels: pa._config.selectorLabels, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet new file mode 100644 index 00000000..b0a78e06 --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet @@ -0,0 +1,130 @@ +local krp = import './kube-rbac-proxy.libsonnet'; +local prometheusOperator = import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet'; + +local defaults = { + local defaults = self, + name: 'prometheus-operator', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + configReloaderImage: error 'must provide config reloader image', + resources: { + limits: { cpu: '200m', memory: '200Mi' }, + requests: { cpu: '100m', memory: '100Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'controller', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: { + role: 'alert-rules', + prometheus: defaults.name, + }, + _config: { + prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s', + }, + }, +}; + +function(params) + local config = defaults + params; + // Safety check + assert std.isObject(config.resources); + + prometheusOperator(config) { + local po = self, + // declare variable as a field to allow overriding options and to have unified API across all components + _config:: config, + mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: po._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: po._config.commonLabels + po._config.mixin.ruleLabels, + name: po._config.name + '-rules', + namespace: po._config.namespace, + }, + spec: { + local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + service+: { + spec+: { + ports: [ + { + name: 'https', + port: 8443, + targetPort: 'https', + }, + ], + }, + }, + + serviceMonitor+: { + spec+: { + endpoints: [ + { + port: 'https', + scheme: 'https', + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + + clusterRole+: { + rules+: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + upstream: 'http://127.0.0.1:8080/', + secureListenAddress: ':8443', + ports: [ + { name: 'https', containerPort: 8443 }, + ], + image: po._config.kubeRbacProxyImage, + }), + + deployment+: { + spec+: { + template+: { + spec+: { + containers+: [kubeRbacProxy], + }, + }, + }, + }, + } diff --git a/jsonnet/kube-prometheus/components/prometheus.libsonnet b/jsonnet/kube-prometheus/components/prometheus.libsonnet new file mode 100644 index 00000000..2df12b96 --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus.libsonnet @@ -0,0 +1,394 @@ +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { memory: '400Mi' }, + }, + + name: error 'must provide name', + alertmanagerName: error 'must provide alertmanagerName', + namespaces: ['default', 'kube-system', defaults.namespace], + replicas: 2, + externalLabels: {}, + enableFeatures: [], + commonLabels:: { + 'app.kubernetes.io/name': 'prometheus', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + } + { prometheus: defaults.name }, + ruleSelector: {}, + mixin: { + ruleLabels: {}, + _config: { + prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"', + prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', + thanosSelector: 'job="thanos-sidecar"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus/%s', + }, + }, + thanos: null, +}; + + +function(params) { + local p = self, + _config:: defaults + params, + // Safety check + assert std.isObject(p._config.resources), + assert std.isObject(p._config.mixin._config), + + mixin:: + (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') + { + _config+:: p._config.mixin._config, + }, + + mixinThanos:: + (import 'github.com/thanos-io/thanos/mixin/alerts/sidecar.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') + { + _config+:: p._config.mixin._config, + targetGroups: {}, + sidecar: { + selector: p._config.mixin._config.thanosSelector, + dimensions: std.join(', ', ['job', 'instance']), + }, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.commonLabels + p._config.mixin.ruleLabels, + name: 'prometheus-' + p._config.name + '-prometheus-rules', + namespace: p._config.namespace, + }, + spec: { + local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: { prometheus: p._config.name } + p._config.commonLabels, + }, + spec: { + ports: [ + { name: 'web', targetPort: 'web', port: 9090 }, + ] + + ( + if p._config.thanos != null then + [{ name: 'grpc', port: 10901, targetPort: 10901 }] + else [] + ), + selector: { app: 'prometheus' } + p._config.selectorLabels, + sessionAffinity: 'ClientIP', + }, + }, + + roleBindingSpecificNamespaces: + local newSpecificRoleBinding(namespace) = { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: namespace, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'prometheus-' + p._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }; + { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBindingList', + items: [newSpecificRoleBinding(x) for x in p._config.namespaces], + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'prometheus-' + p._config.name, + labels: p._config.commonLabels, + }, + rules: [ + { + apiGroups: [''], + resources: ['nodes/metrics'], + verbs: ['get'], + }, + { + nonResourceURLs: ['/metrics'], + verbs: ['get'], + }, + ], + }, + + roleConfig: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'prometheus-' + p._config.name + '-config', + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + rules: [{ + apiGroups: [''], + resources: ['configmaps'], + verbs: ['get'], + }], + }, + + roleBindingConfig: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'prometheus-' + p._config.name + '-config', + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'prometheus-' + p._config.name + '-config', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'prometheus-' + p._config.name, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'prometheus-' + p._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }, + + roleSpecificNamespaces: + local newSpecificRole(namespace) = { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: namespace, + labels: p._config.commonLabels, + }, + rules: [ + { + apiGroups: [''], + resources: ['services', 'endpoints', 'pods'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['extensions'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['networking.k8s.io'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + ], + }; + { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleList', + items: [newSpecificRole(x) for x in p._config.namespaces], + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + spec: { + minAvailable: 1, + selector: { + matchLabels: { + prometheus: p._config.name, + } + p._config.selectorLabels, + }, + }, + }, + + prometheus: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Prometheus', + metadata: { + name: p._config.name, + namespace: p._config.namespace, + labels: { prometheus: p._config.name } + p._config.commonLabels, + }, + spec: { + replicas: p._config.replicas, + version: p._config.version, + image: p._config.image, + podMetadata: { + labels: p._config.commonLabels, + }, + externalLabels: p._config.externalLabels, + enableFeatures: p._config.enableFeatures, + serviceAccountName: 'prometheus-' + p._config.name, + podMonitorSelector: {}, + podMonitorNamespaceSelector: {}, + probeSelector: {}, + probeNamespaceSelector: {}, + ruleNamespaceSelector: {}, + ruleSelector: p._config.ruleSelector, + serviceMonitorSelector: {}, + serviceMonitorNamespaceSelector: {}, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + resources: p._config.resources, + alerting: { + alertmanagers: [{ + namespace: p._config.namespace, + name: 'alertmanager-' + p._config.alertmanagerName, + port: 'web', + apiVersion: 'v2', + }], + }, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, + [if std.objectHas(params, 'thanos') then 'thanos']: p._config.thanos, + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + spec: { + selector: { + matchLabels: p._config.selectorLabels, + }, + endpoints: [{ + port: 'web', + interval: '30s', + }], + }, + }, + + // Include thanos sidecar PrometheusRule only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'prometheusRuleThanosSidecar']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.commonLabels + p._config.mixin.ruleLabels, + name: 'prometheus-' + p._config.name + '-thanos-sidecar-rules', + namespace: p._config.namespace, + }, + spec: { + local r = if std.objectHasAll(p.mixinThanos, 'prometheusRules') then p.mixinThanos.prometheusRules.groups else [], + local a = if std.objectHasAll(p.mixinThanos, 'prometheusAlerts') then p.mixinThanos.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + // Include thanos sidecar Service only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'serviceThanosSidecar']: { + apiVersion: 'v1', + kind: 'Service', + metadata+: { + name: 'prometheus-' + p._config.name + '-thanos-sidecar', + namespace: p._config.namespace, + labels+: p._config.commonLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + spec+: { + ports: [ + { name: 'grpc', port: 10901, targetPort: 10901 }, + { name: 'http', port: 10902, targetPort: 10902 }, + ], + selector: p._config.selectorLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'prometheus', + }, + clusterIP: 'None', + }, + }, + + // Include thanos sidecar ServiceMonitor only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'serviceMonitorThanosSidecar']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata+: { + name: 'thanos-sidecar', + namespace: p._config.namespace, + labels: p._config.commonLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + spec+: { + jobLabel: 'app.kubernetes.io/component', + selector: { + matchLabels: { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + endpoints: [{ + port: 'http', + interval: '30s', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet b/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet deleted file mode 100644 index d83368cf..00000000 --- a/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet +++ /dev/null @@ -1,50 +0,0 @@ -[ - // Drop all kubelet metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)', - action: 'drop', - }, - // Drop all scheduler metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)', - action: 'drop', - }, - // Drop all apiserver metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)', - action: 'drop', - }, - // Drop all docker metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)', - action: 'drop', - }, - // Drop all reflector metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)', - action: 'drop', - }, - // Drop all etcd metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)', - action: 'drop', - }, - // Drop all transformation metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'transformation_(transformation_latencies_microseconds|failures_total)', - action: 'drop', - }, - // Drop all other metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)', - action: 'drop', - }, -] diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 7c74b15e..232ef3f1 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -14,10 +14,10 @@ "source": { "git": { "remote": "https://github.com/etcd-io/etcd", - "subdir": "Documentation/etcd-mixin" + "subdir": "contrib/mixin" } }, - "version": "master" + "version": "main" }, { "source": { @@ -26,7 +26,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "release-0.43" + "version": "master" }, { "source": { @@ -35,17 +35,8 @@ "subdir": "jsonnet/mixin" } }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib", - "subdir": "" - } - }, "version": "master", - "name": "ksonnet" + "name": "prometheus-operator-mixin" }, { "source": { @@ -63,7 +54,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "release-1.9" + "version": "master" }, { "source": { @@ -90,8 +81,28 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "release-2.22", + "version": "main", "name": "prometheus" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "main", + "name": "alertmanager" + }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos", + "subdir": "mixin" + } + }, + "version": "main", + "name": "thanos-mixin" } ], "legacyImports": true diff --git a/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet b/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet deleted file mode 100644 index 1fed631d..00000000 --- a/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet +++ /dev/null @@ -1,118 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - versions+:: { - clusterVerticalAutoscaler: "v0.8.1" - }, - - imageRepos+:: { - clusterVerticalAutoscaler: 'gcr.io/google_containers/cpvpa-amd64' - }, - - kubeStateMetrics+:: { - stepCPU: '1m', - stepMemory: '2Mi', - }, - }, - ksmAutoscaler+:: { - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local rulesType = clusterRole.rulesType; - - local rules = [ - rulesType.new() + - rulesType.withApiGroups(['']) + - rulesType.withResources([ - 'nodes', - ]) + - rulesType.withVerbs(['list', 'watch']), - ]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('ksm-autoscaler') + - clusterRole.withRules(rules), - - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('ksm-autoscaler') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('ksm-autoscaler') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $._config.namespace }]), - - roleBinding: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('ksm-autoscaler') + - roleBinding.mixin.metadata.withNamespace($._config.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('ksm-autoscaler') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }]), - - role: - local role = k.rbac.v1.role; - local rulesType = role.rulesType; - - local extensionsRule = rulesType.new() + - rulesType.withApiGroups(['extensions']) + - rulesType.withResources([ - 'deployments', - ]) + - rulesType.withVerbs(['patch']) + - rulesType.withResourceNames(['kube-state-metrics']); - - local appsRule = rulesType.new() + - rulesType.withApiGroups(['apps']) + - rulesType.withResources([ - 'deployments', - ]) + - rulesType.withVerbs(['patch']) + - rulesType.withResourceNames(['kube-state-metrics']); - - local rules = [extensionsRule, appsRule]; - - role.new() + - role.mixin.metadata.withName('ksm-autoscaler') + - role.mixin.metadata.withNamespace($._config.namespace) + - role.withRules(rules), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('ksm-autoscaler') + - serviceAccount.mixin.metadata.withNamespace($._config.namespace), - deployment: - local deployment = k.apps.v1.deployment; - local container = deployment.mixin.spec.template.spec.containersType; - local podSelector = deployment.mixin.spec.template.spec.selectorType; - local podLabels = { app: 'ksm-autoscaler' }; - - local kubeStateMetricsAutoscaler = - container.new('ksm-autoscaler', $._config.imageRepos.clusterVerticalAutoscaler + ':' + $._config.versions.clusterVerticalAutoscaler) + - container.withArgs([ - '/cpvpa', - '--target=deployment/kube-state-metrics', - '--namespace=' + $._config.namespace, - '--logtostderr=true', - '--poll-period-seconds=10', - '--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}}}}' - ]) + - container.mixin.resources.withRequests({cpu: '20m', memory: '10Mi'}); - - local c = [kubeStateMetricsAutoscaler]; - - deployment.new('ksm-autoscaler', 1, c, podLabels) + - deployment.mixin.metadata.withNamespace($._config.namespace) + - deployment.mixin.metadata.withLabels(podLabels) + - deployment.mixin.spec.selector.withMatchLabels(podLabels) + - deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - deployment.mixin.spec.template.spec.withServiceAccountName('ksm-autoscaler'), - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet deleted file mode 100644 index e6ab5548..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet +++ /dev/null @@ -1,20 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - prometheus+:: { - clusterRole+: { - rules+: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - local rule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - [rule] - }, - } -} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet deleted file mode 100644 index 59014d55..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet +++ /dev/null @@ -1,41 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; -local statefulSet = k.apps.v1.statefulSet; -local affinity = statefulSet.mixin.spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecutionType; -local matchExpression = affinity.mixin.podAffinityTerm.labelSelector.matchExpressionsType; - -{ - local antiaffinity(key, values, namespace) = { - affinity: { - podAntiAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - affinity.new() + - affinity.withWeight(100) + - affinity.mixin.podAffinityTerm.withNamespaces(namespace) + - affinity.mixin.podAffinityTerm.withTopologyKey('kubernetes.io/hostname') + - affinity.mixin.podAffinityTerm.labelSelector.withMatchExpressions([ - matchExpression.new() + - matchExpression.withKey(key) + - matchExpression.withOperator('In') + - matchExpression.withValues(values), - ]), - ], - }, - }, - }, - - alertmanager+:: { - alertmanager+: { - spec+: - antiaffinity('alertmanager', [$._config.alertmanager.name], $._config.namespace), - }, - }, - - prometheus+: { - local p = self, - - prometheus+: { - spec+: - antiaffinity('prometheus', [p.name], p.namespace), - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet deleted file mode 100644 index ad278407..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet +++ /dev/null @@ -1,20 +0,0 @@ -local l = import 'lib/lib.libsonnet'; - -// withImageRepository is a mixin that replaces all images prefixes by repository. eg. -// quay.io/coreos/addon-resizer -> $repository/addon-resizer -// grafana/grafana -> grafana $repository/grafana -local withImageRepository(repository) = { - local oldRepos = super._config.imageRepos, - local substituteRepository(image, repository) = - if repository == null then image else repository + '/' + l.imageName(image), - _config+:: { - imageRepos:: { - [field]: substituteRepository(oldRepos[field], repository), - for field in std.objectFields(oldRepos) - } - }, -}; - -{ - withImageRepository:: withImageRepository, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet deleted file mode 100644 index c4230e19..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet +++ /dev/null @@ -1,89 +0,0 @@ -{ - _config+:: { - eks: { - minimumAvailableIPs: 10, - minimumAvailableIPsTime: '10m', - }, - }, - prometheus+: { - serviceMonitorCoreDNS+: { - spec+: { - endpoints: [ - { - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - interval: '15s', - targetPort: 9153, - }, - ], - }, - }, - AwsEksCniMetricService: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'aws-node', - namespace: 'kube-system', - labels: { 'k8s-app': 'aws-node' }, - }, - spec: { - ports: [ - { name: 'cni-metrics-port', port: 61678, targetPort: 61678 }, - ], - selector: { 'k8s-app': 'aws-node' }, - clusterIP: 'None', - }, - }, - - serviceMonitorAwsEksCNI: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'eks-cni', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, - }, - }, - prometheusRules+: { - groups+: [ - { - name: 'kube-prometheus-eks.rules', - rules: [ - { - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs, - labels: { - severity: 'critical', - }, - annotations: { - message: 'Instance {{ $labels.instance }} has less than 10 IPs available.', - }, - 'for': $._config.eks.minimumAvailableIPsTime, - alert: 'EksAvailableIPs', - }, - ], - }, - ], - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet deleted file mode 100644 index c1e7682d..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet +++ /dev/null @@ -1,56 +0,0 @@ -local service(name, namespace, labels, selector, ports) = { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: name, - namespace: namespace, - labels: labels, - }, - spec: { - ports+: ports, - selector: selector, - clusterIP: 'None', - }, -}; - -{ - - prometheus+: { - kubeControllerManagerPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', - 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, - [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] - ), - - kubeSchedulerPrometheusDiscoveryService: service( - 'kube-scheduler-prometheus-discovery', - 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, - [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], - ), - - serviceMonitorKubeScheduler+: { - spec+: { - selector+: { - matchLabels: { - 'k8s-app': 'kube-scheduler', - }, - }, - }, - }, - - serviceMonitorKubeControllerManager+: { - spec+: { - selector+: { - matchLabels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - }, - }, - - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet deleted file mode 100644 index 9b4e1a8c..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -// On managed Kubernetes clusters some of the control plane components are not exposed to customers. -// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'kube-prometheus.libsonnet' defaults - -{ - _config+:: { - // This snippet walks the original object (super.jobs, set as temp var j) and creates a replacement jobs object - // excluding any members of the set specified (eg: controller and scheduler). - local j = super.jobs, - jobs: { - [k]: j[k] - for k in std.objectFields(j) - if !std.setMember(k, ['KubeControllerManager', 'KubeScheduler']) - }, - - // Skip alerting rules too - prometheus+:: { - rules+:: { - local g = super.groups, - groups: [ - h - for h in g - if !std.setMember(h.name, ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler']) - ], - }, - }, - }, - - // Same as above but for ServiceMonitor's - local p = super.prometheus, - prometheus: { - [q]: p[q] - for q in std.objectFields(p) - if !std.setMember(q, ['serviceMonitorKubeControllerManager', 'serviceMonitorKubeScheduler']) - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet deleted file mode 100644 index fbd40200..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -// Strips spec.containers[].limits for certain containers -// https://github.com/prometheus-operator/kube-prometheus/issues/72 -{ - _config+:: { - resources+:: { - 'addon-resizer'+: { - limits: {}, - }, - 'kube-rbac-proxy'+: { - limits: {}, - }, - 'kube-state-metrics'+: { - limits: {}, - }, - 'node-exporter'+: { - limits: {}, - }, - }, - }, - prometheusOperator+: { - deployment+: { - spec+: { - template+: { - spec+: { - local addArgs(c) = - if c.name == 'prometheus-operator' - then c { args+: ['--config-reloader-cpu=0'] } - else c, - containers: std.map(addArgs, super.containers), - }, - }, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet deleted file mode 100644 index 7d98c309..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet +++ /dev/null @@ -1,77 +0,0 @@ -{ - _config+:: { - versions+:: { thanos: 'v0.14.0' }, - imageRepos+:: { thanos: 'quay.io/thanos/thanos' }, - thanos+:: { - objectStorageConfig: { - key: 'thanos.yaml', // How the file inside the secret is called - name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config - }, - }, - }, - prometheus+:: { - // Add the grpc port to the Prometheus service to be able to query it with the Thanos Querier - service+: { - spec+: { - ports+: [ - { name: 'grpc', port: 10901, targetPort: 10901 }, - ], - }, - }, - // Create a new service that exposes both sidecar's HTTP metrics port and gRPC StoreAPI - serviceThanosSidecar: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'prometheus-' + $._config.prometheus.name + '-thanos-sidecar', - namespace: $._config.namespace, - labels: { prometheus: $._config.prometheus.name, app: 'thanos-sidecar' }, - }, - spec: { - ports: [ - { name: 'grpc', port: 10901, targetPort: 10901 }, - { name: 'http', port: 10902, targetPort: 10902 }, - ], - selector: { app: 'prometheus', prometheus: $._config.prometheus.name }, - clusterIP: 'None', - }, - }, - prometheus+: { - spec+: { - thanos+: { - version: $._config.versions.thanos, - image: $._config.imageRepos.thanos + ':' + $._config.versions.thanos, - objectStorageConfig: $._config.thanos.objectStorageConfig, - }, - }, - }, - serviceMonitorThanosSidecar: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'thanos-sidecar', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'prometheus', - }, - }, - spec: { - // Use the service's app label (thanos-sidecar) as the value for the job label. - jobLabel: 'app', - selector: { - matchLabels: { - prometheus: $._config.prometheus.name, - app: 'thanos-sidecar', - }, - }, - endpoints: [ - { - port: 'http', - interval: '30s', - }, - ], - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet deleted file mode 100644 index 19e7b934..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet +++ /dev/null @@ -1,196 +0,0 @@ -{ - prometheus+: { - serviceWeaveNet: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'weave-net', - namespace: 'kube-system', - labels: { 'k8s-app': 'weave-net' }, - }, - spec: { - ports: [ - { name: 'weave-net-metrics', targetPort: 6782, port: 6782 }, - ], - selector: { name: 'weave-net' }, - clusterIP: 'None', - }, - }, - serviceMonitorWeaveNet: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'weave-net', - labels: { - 'k8s-app': 'weave-net', - }, - namespace: 'monitoring', - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'weave-net-metrics', - path: '/metrics', - interval: '15s', - }, - ], - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - selector: { - matchLabels: { - 'k8s-app': 'weave-net', - }, - }, - }, - }, - }, - prometheusRules+: { - groups+: [ - { - name: 'weave-net', - rules: [ - { - alert: 'WeaveNetIPAMSplitBrain', - expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.', - description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMUnreachable', - expr: 'weave_ipam_unreachable_percentage > 25', - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMPendingAllocates', - expr: 'sum(weave_ipam_pending_allocates) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of pending allocates is above the threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMPendingClaims', - expr: 'sum(weave_ipam_pending_claims) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of pending claims is above the threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetFastDPFlowsLow', - expr: 'sum(weave_flows) < 15000', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of FastDP flows is below the threshold.', - description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.', - }, - }, - { - alert: 'WeaveNetFastDPFlowsOff', - expr: 'sum(weave_flows == bool 0) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'FastDP flows is zero.', - description: 'actionable: Please find the reason for FastDP flows to be off and fix it.', - }, - }, - { - alert: 'WeaveNetHighConnectionTerminationRate', - expr: 'rate(weave_connection_terminations_total[5m]) > 0.1', - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are getting terminated.', - description: 'actionable: Please find the reason for the high connection termination rate and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsConnecting', - expr: 'sum(weave_connections{state="connecting"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in connecting state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsRetying', - expr: 'sum(weave_connections{state="retrying"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in retrying state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsPending', - expr: 'sum(weave_connections{state="pending"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in pending state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsFailed', - expr: 'sum(weave_connections{state="failed"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in failed state.', - description: 'actionable: Please find the reason and fix it.', - }, - }, - ], - }, - ], - }, - grafanaDashboards+:: { - 'weave-net.json': (import './grafana-weave-net.json'), - 'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'), - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet deleted file mode 100644 index 46deacc4..00000000 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ /dev/null @@ -1,203 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; -local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; -local configMapList = k3.core.v1.configMapList; -local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet'; - -(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + -(import './kube-state-metrics/kube-state-metrics.libsonnet') + -(import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + -(import './node-exporter/node-exporter.libsonnet') + -(import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + -(import './alertmanager/alertmanager.libsonnet') + -(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') + -(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') + -(import './prometheus/prometheus.libsonnet') + -(import './prometheus-adapter/prometheus-adapter.libsonnet') + -(import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + -(import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') + -(import './alerts/alerts.libsonnet') + -(import './rules/rules.libsonnet') + { - kubePrometheus+:: { - namespace: k.core.v1.namespace.new($._config.namespace), - }, - prometheusOperator+:: { - service+: { - spec+: { - ports: [ - { - name: 'https', - port: 8443, - targetPort: 'https', - }, - ], - }, - }, - serviceMonitor+: { - spec+: { - endpoints: [ - { - port: 'https', - scheme: 'https', - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ] - }, - }, - clusterRole+: { - rules+: [ - { - apiGroups: ['authentication.k8s.io'], - resources: ['tokenreviews'], - verbs: ['create'], - }, - { - apiGroups: ['authorization.k8s.io'], - resources: ['subjectaccessreviews'], - verbs: ['create'], - }, - ], - }, - } + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy', - securePortName: 'https', - securePort: 8443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8080/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin, - - grafana+:: { - dashboardDefinitions: configMapList.new(super.dashboardDefinitions), - serviceMonitor: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'grafana', - namespace: $._config.namespace, - }, - spec: { - selector: { - matchLabels: { - app: 'grafana', - }, - }, - endpoints: [ - { - port: 'http', - interval: '15s', - }, - ], - }, - }, - }, -} + { - _config+:: { - namespace: 'default', - - versions+:: { - grafana: '7.1.0', - kubeRbacProxy: 'v0.8.0', - }, - - imageRepos+:: { - kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy', - }, - - tlsCipherSuites: [ - 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 - 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 - - // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - // 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - // 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2 - // 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - - // disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go - - 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', - 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', - ], - - cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', - kubeletSelector: 'job="kubelet", metrics_path="/metrics"', - kubeStateMetricsSelector: 'job="kube-state-metrics"', - nodeExporterSelector: 'job="node-exporter"', - fsSpaceFillingUpCriticalThreshold: 15, - notKubeDnsSelector: 'job!="kube-dns"', - kubeSchedulerSelector: 'job="kube-scheduler"', - kubeControllerManagerSelector: 'job="kube-controller-manager"', - kubeApiserverSelector: 'job="apiserver"', - coreDNSSelector: 'job="kube-dns"', - podLabel: 'pod', - - alertmanagerSelector: 'job="alertmanager-' + $._config.alertmanager.name + '",namespace="' + $._config.namespace + '"', - prometheusSelector: 'job="prometheus-' + $._config.prometheus.name + '",namespace="' + $._config.namespace + '"', - prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', - prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + $._config.namespace + '"', - - jobs: { - Kubelet: $._config.kubeletSelector, - KubeScheduler: $._config.kubeSchedulerSelector, - KubeControllerManager: $._config.kubeControllerManagerSelector, - KubeAPI: $._config.kubeApiserverSelector, - KubeStateMetrics: $._config.kubeStateMetricsSelector, - NodeExporter: $._config.nodeExporterSelector, - Alertmanager: $._config.alertmanagerSelector, - Prometheus: $._config.prometheusSelector, - PrometheusOperator: $._config.prometheusOperatorSelector, - CoreDNS: $._config.coreDNSSelector, - }, - - resources+:: { - 'addon-resizer': { - requests: { cpu: '10m', memory: '30Mi' }, - limits: { cpu: '50m', memory: '30Mi' }, - }, - 'kube-rbac-proxy': { - requests: { cpu: '10m', memory: '20Mi' }, - limits: { cpu: '20m', memory: '40Mi' }, - }, - 'kube-state-metrics': { - requests: { cpu: '100m', memory: '150Mi' }, - limits: { cpu: '100m', memory: '150Mi' }, - }, - 'node-exporter': { - requests: { cpu: '102m', memory: '180Mi' }, - limits: { cpu: '250m', memory: '180Mi' }, - }, - }, - prometheus+:: { - rules: $.prometheusRules + $.prometheusAlerts, - }, - - grafana+:: { - dashboards: $.grafanaDashboards, - }, - - }, -} diff --git a/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet b/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet deleted file mode 100644 index fa85f0cf..00000000 --- a/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet +++ /dev/null @@ -1,91 +0,0 @@ -{ - local krp = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - - specMixin:: { - local sm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - spec+: { - template+: { - spec+: { - containers+: [{ - name: krp.config.kubeRbacProxy.name, - image: krp.config.kubeRbacProxy.image, - args: [ - '--logtostderr', - '--secure-listen-address=' + krp.config.kubeRbacProxy.secureListenAddress, - '--tls-cipher-suites=' + std.join(',', krp.config.kubeRbacProxy.tlsCipherSuites), - '--upstream=' + krp.config.kubeRbacProxy.upstream, - ], - ports: [ - { name: krp.config.kubeRbacProxy.securePortName, containerPort: krp.config.kubeRbacProxy.securePort }, - ], - securityContext: { - runAsUser: 65534, - }, - }], - }, - }, - }, - }, - - deploymentMixin:: { - local dm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - deployment+: krp.specMixin { - config+:: { - kubeRbacProxy+: dm.config.kubeRbacProxy, - }, - }, - }, - - statefulSetMixin:: { - local sm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - statefulSet+: krp.specMixin { - config+:: { - kubeRbacProxy+: sm.config.kubeRbacProxy, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet deleted file mode 100644 index 7fae2be2..00000000 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ /dev/null @@ -1,132 +0,0 @@ -local kubeRbacProxyContainer = import '../kube-rbac-proxy/container.libsonnet'; -local ksm = import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet'; - -{ - _config+:: { - versions+:: { - kubeStateMetrics: '1.9.7', - }, - imageRepos+:: { - kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', - }, - kubeStateMetrics+:: { - scrapeInterval: '30s', - scrapeTimeout: '30s', - }, - }, - kubeStateMetrics+:: - ksm + { - local version = self.version, - name:: 'kube-state-metrics', - namespace:: $._config.namespace, - version:: $._config.versions.kubeStateMetrics, - image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics, - service+: { - spec+: { - ports: [ - { - name: 'https-main', - port: 8443, - targetPort: 'https-main', - }, - { - name: 'https-self', - port: 9443, - targetPort: 'https-self', - }, - ], - }, - }, - deployment+: { - spec+: { - template+: { - spec+: { - containers: std.map(function(c) c { - ports:: null, - livenessProbe:: null, - readinessProbe:: null, - args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], - }, super.containers), - }, - }, - }, - }, - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-state-metrics', - namespace: $._config.namespace, - labels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - 'app.kubernetes.io/version': version, - }, - }, - spec: { - jobLabel: 'app.kubernetes.io/name', - selector: { - matchLabels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - }, - }, - endpoints: [ - { - port: 'https-main', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - regex: '(pod|service|endpoint|namespace)', - action: 'labeldrop', - }, - ], - tlsConfig: { - insecureSkipVerify: true, - }, - }, - { - port: 'https-self', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - }, - }, - } + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-main', - securePortName: 'https-main', - securePort: 8443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8081/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-self', - securePortName: 'https-self', - securePort: 9443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8082/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin, -} diff --git a/jsonnet/kube-prometheus/lib/image.libsonnet b/jsonnet/kube-prometheus/lib/image.libsonnet deleted file mode 100644 index 0561e33c..00000000 --- a/jsonnet/kube-prometheus/lib/image.libsonnet +++ /dev/null @@ -1,21 +0,0 @@ -// imageName extracts the image name from a fully qualified image string. eg. -// quay.io/coreos/addon-resizer -> addon-resizer -// grafana/grafana -> grafana -local imageName(image) = - local parts = std.split(image, '/'); - local len = std.length(parts); - if len == 3 then - # registry.com/org/image - parts[2] - else if len == 2 then - # org/image - parts[1] - else if len == 1 then - # image, ie. busybox - parts[0] - else - error 'unknown image format: ' + image; - -{ - imageName:: imageName, -} diff --git a/jsonnet/kube-prometheus/lib/lib.libsonnet b/jsonnet/kube-prometheus/lib/lib.libsonnet deleted file mode 100644 index c30f976f..00000000 --- a/jsonnet/kube-prometheus/lib/lib.libsonnet +++ /dev/null @@ -1 +0,0 @@ -(import 'image.libsonnet') diff --git a/jsonnet/kube-prometheus/lib/mixin.libsonnet b/jsonnet/kube-prometheus/lib/mixin.libsonnet new file mode 100644 index 00000000..4c0665ea --- /dev/null +++ b/jsonnet/kube-prometheus/lib/mixin.libsonnet @@ -0,0 +1,38 @@ +local defaults = { + name: error 'provide name', + namespace: 'monitoring', + labels: { + prometheus: 'k8s', + }, + mixin: error 'provide a mixin', +}; + +function(params) { + _config:: defaults + params, + + local m = self, + + local prometheusRules = if std.objectHasAll(m._config.mixin, 'prometheusRules') || std.objectHasAll(m._config.mixin, 'prometheusAlerts') then { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m._config.labels, + name: m._config.name, + namespace: m._config.namespace, + }, + spec: { + local r = if std.objectHasAll(m._config.mixin, 'prometheusRules') then m._config.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(m._config.mixin, 'prometheusAlerts') then m._config.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + local grafanaDashboards = if std.objectHasAll(m._config.mixin, 'grafanaDashboards') then ( + if std.objectHas(m._config, 'dashboardFolder') then { + [m._config.dashboardFolder]+: m._config.mixin.grafanaDashboards, + } else (m._config.mixin.grafanaDashboards) + ), + + prometheusRules: prometheusRules, + grafanaDashboards: grafanaDashboards, +} diff --git a/jsonnet/kube-prometheus/lib/utils.libsonnet b/jsonnet/kube-prometheus/lib/utils.libsonnet new file mode 100644 index 00000000..b5d29825 --- /dev/null +++ b/jsonnet/kube-prometheus/lib/utils.libsonnet @@ -0,0 +1,7 @@ +{ + // rangeInterval takes a scrape interval and convert its to a range interval + // following Prometheus rule of thumb for rate() and irate(). + rangeInterval(i='1m'): + local interval = std.parseInt(std.substr(i, 0, std.length(i) - 1)); + interval * 4 + i[std.length(i) - 1], +} diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet new file mode 100644 index 00000000..877619b2 --- /dev/null +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -0,0 +1,144 @@ +local alertmanager = import './components/alertmanager.libsonnet'; +local blackboxExporter = import './components/blackbox-exporter.libsonnet'; +local grafana = import './components/grafana.libsonnet'; +local kubernetesControlPlane = import './components/k8s-control-plane.libsonnet'; +local kubeStateMetrics = import './components/kube-state-metrics.libsonnet'; +local customMixin = import './components/mixin/custom.libsonnet'; +local nodeExporter = import './components/node-exporter.libsonnet'; +local prometheusAdapter = import './components/prometheus-adapter.libsonnet'; +local prometheusOperator = import './components/prometheus-operator.libsonnet'; +local prometheus = import './components/prometheus.libsonnet'; + +local platformPatch = import './platforms/platforms.libsonnet'; + +local utils = import './lib/utils.libsonnet'; + +{ + // using `values` as this is similar to helm + values:: { + common: { + namespace: 'default', + platform: null, + ruleLabels: { + role: 'alert-rules', + prometheus: $.values.prometheus.name, + }, + // to allow automatic upgrades of components, we store versions in autogenerated `versions.json` file and import it here + versions: { + alertmanager: error 'must provide version', + blackboxExporter: error 'must provide version', + grafana: error 'must provide version', + kubeStateMetrics: error 'must provide version', + nodeExporter: error 'must provide version', + prometheus: error 'must provide version', + prometheusAdapter: error 'must provide version', + prometheusOperator: error 'must provide version', + kubeRbacProxy: error 'must provide version', + configmapReload: error 'must provide version', + } + (import 'versions.json'), + images: { + alertmanager: 'quay.io/prometheus/alertmanager:v' + $.values.common.versions.alertmanager, + blackboxExporter: 'quay.io/prometheus/blackbox-exporter:v' + $.values.common.versions.blackboxExporter, + grafana: 'grafana/grafana:v' + $.values.common.versions.grafana, + kubeStateMetrics: 'k8s.gcr.io/kube-state-metrics/kube-state-metrics:v' + $.values.common.versions.kubeStateMetrics, + nodeExporter: 'quay.io/prometheus/node-exporter:v' + $.values.common.versions.nodeExporter, + prometheus: 'quay.io/prometheus/prometheus:v' + $.values.common.versions.prometheus, + prometheusAdapter: 'k8s.gcr.io/prometheus-adapter/prometheus-adapter:v' + $.values.common.versions.prometheusAdapter, + prometheusOperator: 'quay.io/prometheus-operator/prometheus-operator:v' + $.values.common.versions.prometheusOperator, + prometheusOperatorReloader: 'quay.io/prometheus-operator/prometheus-config-reloader:v' + $.values.common.versions.prometheusOperator, + kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v' + $.values.common.versions.kubeRbacProxy, + configmapReload: 'jimmidyson/configmap-reload:v' + $.values.common.versions.configmapReload, + }, + }, + alertmanager: { + name: 'main', + namespace: $.values.common.namespace, + version: $.values.common.versions.alertmanager, + image: $.values.common.images.alertmanager, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + blackboxExporter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.blackboxExporter, + image: $.values.common.images.blackboxExporter, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + configmapReloaderImage: $.values.common.images.configmapReload, + }, + grafana: { + namespace: $.values.common.namespace, + version: $.values.common.versions.grafana, + image: $.values.common.images.grafana, + prometheusName: $.values.prometheus.name, + // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards + dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesControlPlane.mixin.grafanaDashboards + $.alertmanager.mixin.grafanaDashboards, + }, + kubeStateMetrics: { + namespace: $.values.common.namespace, + version: $.values.common.versions.kubeStateMetrics, + image: $.values.common.images.kubeStateMetrics, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + nodeExporter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.nodeExporter, + image: $.values.common.images.nodeExporter, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + prometheus: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheus, + image: $.values.common.images.prometheus, + name: 'k8s', + alertmanagerName: $.values.alertmanager.name, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + prometheusAdapter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheusAdapter, + image: $.values.common.images.prometheusAdapter, + prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.common.namespace + '.svc.cluster.local:9090/', + rangeIntervals+: { + kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval), + nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval), + }, + }, + prometheusOperator: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheusOperator, + image: $.values.common.images.prometheusOperator, + configReloaderImage: $.values.common.images.prometheusOperatorReloader, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + kubernetesControlPlane: { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + }, + + alertmanager: alertmanager($.values.alertmanager), + blackboxExporter: blackboxExporter($.values.blackboxExporter), + grafana: grafana($.values.grafana), + kubeStateMetrics: kubeStateMetrics($.values.kubeStateMetrics), + nodeExporter: nodeExporter($.values.nodeExporter), + prometheus: prometheus($.values.prometheus), + prometheusAdapter: prometheusAdapter($.values.prometheusAdapter), + prometheusOperator: prometheusOperator($.values.prometheusOperator), + kubernetesControlPlane: kubernetesControlPlane($.values.kubernetesControlPlane), + kubePrometheus: customMixin( + { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + } + ) + { + namespace: { + apiVersion: 'v1', + kind: 'Namespace', + metadata: { + name: $.values.common.namespace, + }, + }, + }, +} + platformPatch diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet deleted file mode 100644 index e0326b88..00000000 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ /dev/null @@ -1,208 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - nodeExporter: 'v1.0.1', - }, - - imageRepos+:: { - nodeExporter: 'quay.io/prometheus/node-exporter', - }, - - nodeExporter+:: { - listenAddress: '127.0.0.1', - port: 9100, - labels: { - 'app.kubernetes.io/name': 'node-exporter', - 'app.kubernetes.io/version': $._config.versions.nodeExporter, - }, - selectorLabels: { - [labelName]: $._config.nodeExporter.labels[labelName] - for labelName in std.objectFields($._config.nodeExporter.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - }, - }, - - nodeExporter+:: { - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('node-exporter') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('node-exporter') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter', namespace: $._config.namespace }]), - - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local authenticationRole = policyRule.new() + - policyRule.withApiGroups(['authentication.k8s.io']) + - policyRule.withResources([ - 'tokenreviews', - ]) + - policyRule.withVerbs(['create']); - - local authorizationRole = policyRule.new() + - policyRule.withApiGroups(['authorization.k8s.io']) + - policyRule.withResources([ - 'subjectaccessreviews', - ]) + - policyRule.withVerbs(['create']); - - local rules = [authenticationRole, authorizationRole]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('node-exporter') + - clusterRole.withRules(rules), - - daemonset: - local daemonset = k.apps.v1.daemonSet; - local container = daemonset.mixin.spec.template.spec.containersType; - local volume = daemonset.mixin.spec.template.spec.volumesType; - local containerPort = container.portsType; - local containerVolumeMount = container.volumeMountsType; - local podSelector = daemonset.mixin.spec.template.spec.selectorType; - local toleration = daemonset.mixin.spec.template.spec.tolerationsType; - local containerEnv = container.envType; - - local podLabels = $._config.nodeExporter.labels; - local selectorLabels = $._config.nodeExporter.selectorLabels; - - local existsToleration = toleration.new() + - toleration.withOperator('Exists'); - local procVolumeName = 'proc'; - local procVolume = volume.fromHostPath(procVolumeName, '/proc'); - local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local sysVolumeName = 'sys'; - local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); - local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local rootVolumeName = 'root'; - local rootVolume = volume.fromHostPath(rootVolumeName, '/'); - local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local nodeExporter = - container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + - container.withArgs([ - '--web.listen-address=' + std.join(':', [$._config.nodeExporter.listenAddress, std.toString($._config.nodeExporter.port)]), - '--path.procfs=/host/proc', - '--path.sysfs=/host/sys', - '--path.rootfs=/host/root', - '--no-collector.wifi', - '--no-collector.hwmon', - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)', - ]) + - container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) + - container.mixin.resources.withRequests($._config.resources['node-exporter'].requests) + - container.mixin.resources.withLimits($._config.resources['node-exporter'].limits); - - local ip = containerEnv.fromFieldPath('IP', 'status.podIP'); - local proxy = - container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + - container.withArgs([ - '--logtostderr', - '--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port, - '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), - '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/', - ]) + - // Keep `hostPort` here, rather than in the node-exporter container - // because Kubernetes mandates that if you define a `hostPort` then - // `containerPort` must match. In our case, we are splitting the - // host port and container port between the two containers. - // We'll keep the port specification here so that the named port - // used by the service is tied to the proxy container. We *could* - // forgo declaring the host port, however it is important to declare - // it so that the scheduler can decide if the pod is schedulable. - container.withPorts(containerPort.new($._config.nodeExporter.port) + containerPort.withHostPort($._config.nodeExporter.port) + containerPort.withName('https')) + - container.mixin.resources.withRequests($._config.resources['kube-rbac-proxy'].requests) + - container.mixin.resources.withLimits($._config.resources['kube-rbac-proxy'].limits) + - container.withEnv([ip]); - - local c = [nodeExporter, proxy]; - - daemonset.new() + - daemonset.mixin.metadata.withName('node-exporter') + - daemonset.mixin.metadata.withNamespace($._config.namespace) + - daemonset.mixin.metadata.withLabels(podLabels) + - daemonset.mixin.spec.selector.withMatchLabels(selectorLabels) + - daemonset.mixin.spec.updateStrategy.rollingUpdate.withMaxUnavailable('10%') + - daemonset.mixin.spec.template.metadata.withLabels(podLabels) + - daemonset.mixin.spec.template.spec.withTolerations([existsToleration]) + - daemonset.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - daemonset.mixin.spec.template.spec.withContainers(c) + - daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') + - daemonset.mixin.spec.template.spec.withHostPid(true) + - daemonset.mixin.spec.template.spec.withHostNetwork(true), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('node-exporter') + - serviceAccount.mixin.metadata.withNamespace($._config.namespace), - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'node-exporter', - namespace: $._config.namespace, - labels: $._config.nodeExporter.labels, - }, - spec: { - jobLabel: 'app.kubernetes.io/name', - selector: { - matchLabels: $._config.nodeExporter.selectorLabels, - }, - endpoints: [ - { - port: 'https', - scheme: 'https', - interval: '15s', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - action: 'replace', - regex: '(.*)', - replacement: '$1', - sourceLabels: ['__meta_kubernetes_pod_node_name'], - targetLabel: 'instance', - }, - ], - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - }, - }, - - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https'); - - service.new('node-exporter', $._config.nodeExporter.selectorLabels, nodeExporterPort) + - service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.metadata.withLabels($._config.nodeExporter.labels) + - service.mixin.spec.withClusterIp('None'), - }, -} diff --git a/jsonnet/kube-prometheus/platforms/README.md b/jsonnet/kube-prometheus/platforms/README.md new file mode 100644 index 00000000..45eb76d1 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/README.md @@ -0,0 +1,3 @@ +# Adding a new platform specific configuration + +Adding a new platform specific configuration requires to update the [README](../../../README.md#cluster-creation-tools) and the [platforms.jsonnet](./platform.jsonnet) file by adding the platform to the list of existing ones. This allow the new platform to be discoverable and easily configurable by the users. diff --git a/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet b/jsonnet/kube-prometheus/platforms/aws.libsonnet similarity index 72% rename from jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet rename to jsonnet/kube-prometheus/platforms/aws.libsonnet index ae8d364d..27a61c2b 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet +++ b/jsonnet/kube-prometheus/platforms/aws.libsonnet @@ -14,19 +14,19 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }], ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], ), }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet b/jsonnet/kube-prometheus/platforms/bootkube.libsonnet similarity index 70% rename from jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet rename to jsonnet/kube-prometheus/platforms/bootkube.libsonnet index 284544c1..e4651ae9 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet +++ b/jsonnet/kube-prometheus/platforms/bootkube.libsonnet @@ -14,28 +14,28 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+:: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }] ), kubeDnsPrometheusDiscoveryService: service( 'kube-dns-prometheus-discovery', - 'kube-syste', - { 'k8s-app': 'kube-dns' }, - { 'k8s-app': 'kube-dns' }, + 'kube-system', + { 'app.kubernetes.io/name': 'kube-dns' }, + { 'app.kubernetes.io/name': 'kube-dns' }, [{ name: 'http-metrics-skydns', port: 10055, targetPort: 10055 }, { name: 'http-metrics-dnsmasq', port: 10054, targetPort: 10054 }] ), }, diff --git a/jsonnet/kube-prometheus/platforms/eks.libsonnet b/jsonnet/kube-prometheus/platforms/eks.libsonnet new file mode 100644 index 00000000..f46709f2 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/eks.libsonnet @@ -0,0 +1,16 @@ +(import '../addons/aws-vpc-cni.libsonnet') + +(import '../addons/managed-cluster.libsonnet') + { + kubernetesControlPlane+: { + serviceMonitorCoreDNS+: { + spec+: { + endpoints: [ + { + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + interval: '15s', + targetPort: 9153, + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/platforms/gke.libsonnet b/jsonnet/kube-prometheus/platforms/gke.libsonnet new file mode 100644 index 00000000..973eeffb --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/gke.libsonnet @@ -0,0 +1,13 @@ +(import '../addons/managed-cluster.libsonnet') + { + values+:: { + prometheusAdapter+: { + config+: { + resourceRules:: null, + }, + }, + }, + + prometheusAdapter+:: { + apiService:: null, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet b/jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet similarity index 65% rename from jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet rename to jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet index 7311c366..b9688173 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet @@ -1,19 +1,19 @@ { - prometheus+:: { + kubernetesControlPlane+: { kubeDnsPrometheusDiscoveryService: { apiVersion: 'v1', kind: 'Service', metadata: { name: 'kube-dns-prometheus-discovery', namespace: 'kube-system', - labels: { 'k8s-app': 'kube-dns' }, + labels: { 'app.kubernetes.io/name': 'kube-dns' }, }, spec: { ports: [ { name: 'metrics', port: 9153, targetPort: 9153 }, ], - selector: { 'k8s-app': 'kube-dns' }, - cluserAPI: 'None', + selector: { 'app.kubernetes.io/name': 'kube-dns' }, + clusterIP: 'None', }, }, }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet b/jsonnet/kube-prometheus/platforms/kops.libsonnet similarity index 70% rename from jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet rename to jsonnet/kube-prometheus/platforms/kops.libsonnet index 8db8c299..52eac362 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kops.libsonnet @@ -14,25 +14,25 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+:: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-controller-manager' }, { 'k8s-app': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', + 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-scheduler' }, { 'k8s-app': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }] ), kubeDnsPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', + 'kube-dns-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-dns' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-dns' }, { 'k8s-app': 'kube-dns' }, [{ name: 'metrics', port: 10055, targetPort: 10055 }, { name: 'http-metrics-dnsmasq', port: 10054, targetPort: 10054 }] ), diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet b/jsonnet/kube-prometheus/platforms/kubeadm.libsonnet similarity index 74% rename from jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet rename to jsonnet/kube-prometheus/platforms/kubeadm.libsonnet index 0eccc939..dec785d9 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kubeadm.libsonnet @@ -14,19 +14,19 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { component: 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { component: 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], ), }, diff --git a/jsonnet/kube-prometheus/platforms/kubespray.libsonnet b/jsonnet/kube-prometheus/platforms/kubespray.libsonnet new file mode 100644 index 00000000..dabee251 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/kubespray.libsonnet @@ -0,0 +1 @@ +(import './kubeadm.libsonnet') diff --git a/jsonnet/kube-prometheus/platforms/platforms.libsonnet b/jsonnet/kube-prometheus/platforms/platforms.libsonnet new file mode 100644 index 00000000..a3978a6c --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/platforms.libsonnet @@ -0,0 +1,41 @@ +local platforms = { + aws: import './aws.libsonnet', + bootkube: import './bootkube.libsonnet', + gke: import './gke.libsonnet', + eks: import './eks.libsonnet', + kops: import './kops.libsonnet', + kops_coredns: (import './kops.libsonnet') + (import './kops-coredns.libsonnet'), + kubeadm: import './kubeadm.libsonnet', + kubespray: import './kubespray.libsonnet', +}; + +// platformPatch returns the platform specific patch associated to the given +// platform. +local platformPatch(p) = if p != null && std.objectHas(platforms, p) then platforms[p] else {}; + +{ + // initialize the object to prevent "Indexed object has no field" lint errors + local p = { + alertmanager: {}, + blackboxExporter: {}, + grafana: {}, + kubeStateMetrics: {}, + nodeExporter: {}, + prometheus: {}, + prometheusAdapter: {}, + prometheusOperator: {}, + kubernetesControlPlane: {}, + kubePrometheus: {}, + } + platformPatch($.values.common.platform), + + alertmanager+: p.alertmanager, + blackboxExporter+: p.blackboxExporter, + grafana+: p.grafana, + kubeStateMetrics+: p.kubeStateMetrics, + nodeExporter+: p.nodeExporter, + prometheus+: p.prometheus, + prometheusAdapter+: p.prometheusAdapter, + prometheusOperator+: p.prometheusOperator, + kubernetesControlPlane+: p.kubernetesControlPlane, + kubePrometheus+: p.kubePrometheus, +} diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet deleted file mode 100644 index 2b4b3748..00000000 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ /dev/null @@ -1,262 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - prometheusAdapter: 'v0.8.2', - }, - - imageRepos+:: { - prometheusAdapter: 'directxman12/k8s-prometheus-adapter', - }, - - prometheusAdapter+:: { - name: 'prometheus-adapter', - namespace: $._config.namespace, - labels: { name: $._config.prometheusAdapter.name }, - prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/', - config: { - resourceRules: { - cpu: { - containerQuery: 'sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)', - nodeQuery: 'sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)', - resources: { - overrides: { - node: { - resource: 'node' - }, - namespace: { - resource: 'namespace' - }, - pod: { - resource: 'pod' - }, - }, - }, - containerLabel: 'container' - }, - memory: { - containerQuery: 'sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)', - nodeQuery: 'sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)', - resources: { - overrides: { - instance: { - resource: 'node' - }, - namespace: { - resource: 'namespace' - }, - pod: { - resource: 'pod' - }, - }, - }, - containerLabel: 'container' - }, - window: '5m', - }, - } - }, - }, - - prometheusAdapter+:: { - apiService: - { - apiVersion: 'apiregistration.k8s.io/v1', - kind: 'APIService', - metadata: { - name: 'v1beta1.metrics.k8s.io', - }, - spec: { - service: { - name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }, - group: 'metrics.k8s.io', - version: 'v1beta1', - insecureSkipTLSVerify: true, - groupPriorityMinimum: 100, - versionPriority: 100, - }, - }, - - configMap: - local configmap = k.core.v1.configMap; - configmap.new('adapter-config', { 'config.yaml': std.manifestYamlDoc($._config.prometheusAdapter.config) }) + - - configmap.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace), - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: $._config.prometheusAdapter.name, - namespace: $._config.prometheusAdapter.namespace, - labels: $._config.prometheusAdapter.labels, - }, - spec: { - selector: { - matchLabels: $._config.prometheusAdapter.labels, - }, - endpoints: [ - { - port: 'https', - interval: '30s', - scheme: 'https', - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - }, - ], - }, - }, - - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - service.new( - $._config.prometheusAdapter.name, - $._config.prometheusAdapter.labels, - servicePort.newNamed('https', 443, 6443), - ) + - service.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace) + - service.mixin.metadata.withLabels($._config.prometheusAdapter.labels), - - deployment: - local deployment = k.apps.v1.deployment; - local volume = deployment.mixin.spec.template.spec.volumesType; - local container = deployment.mixin.spec.template.spec.containersType; - local containerVolumeMount = container.volumeMountsType; - - local c = - container.new($._config.prometheusAdapter.name, $._config.imageRepos.prometheusAdapter + ':' + $._config.versions.prometheusAdapter) + - container.withArgs([ - '--cert-dir=/var/run/serving-cert', - '--config=/etc/adapter/config.yaml', - '--logtostderr=true', - '--metrics-relist-interval=1m', - '--prometheus-url=' + $._config.prometheusAdapter.prometheusURL, - '--secure-port=6443', - ]) + - container.withPorts([{ containerPort: 6443 }]) + - container.withVolumeMounts([ - containerVolumeMount.new('tmpfs', '/tmp'), - containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'), - containerVolumeMount.new('config', '/etc/adapter'), - ],); - - deployment.new($._config.prometheusAdapter.name, 1, c, $._config.prometheusAdapter.labels) + - deployment.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace) + - deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + - deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + - deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) + - deployment.mixin.spec.template.spec.withVolumes([ - volume.fromEmptyDir(name='tmpfs'), - volume.fromEmptyDir(name='volume-serving-cert'), - { name: 'config', configMap: { name: 'adapter-config' } }, - ]), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new($._config.prometheusAdapter.name) + - serviceAccount.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace), - - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources(['nodes', 'namespaces', 'pods', 'services']) + - policyRule.withVerbs(['get', 'list', 'watch']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName($._config.prometheusAdapter.name) + - clusterRole.withRules(rules), - - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - - clusterRoleBindingDelegator: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('resource-metrics:system:auth-delegator') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('system:auth-delegator') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - - clusterRoleServerResources: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['metrics.k8s.io']) + - policyRule.withResources(['*']) + - policyRule.withVerbs(['*']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName('resource-metrics-server-resources') + - clusterRole.withRules(rules), - - clusterRoleAggregatedMetricsReader: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['metrics.k8s.io']) + - policyRule.withResources(['pods', 'nodes']) + - policyRule.withVerbs(['get','list','watch']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName('system:aggregated-metrics-reader') + - clusterRole.mixin.metadata.withLabels({ - "rbac.authorization.k8s.io/aggregate-to-admin": "true", - "rbac.authorization.k8s.io/aggregate-to-edit": "true", - "rbac.authorization.k8s.io/aggregate-to-view": "true", - }) + - clusterRole.withRules(rules), - - roleBindingAuthReader: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('resource-metrics-auth-reader') + - roleBinding.mixin.metadata.withNamespace('kube-system') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('extension-apiserver-authentication-reader') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - }, -} diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet deleted file mode 100644 index 9ffd5f1f..00000000 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ /dev/null @@ -1,502 +0,0 @@ -local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - prometheus: 'v2.22.1', - }, - - imageRepos+:: { - prometheus: 'quay.io/prometheus/prometheus', - }, - - alertmanager+:: { - name: 'main', - }, - - prometheus+:: { - name: 'k8s', - replicas: 2, - rules: {}, - namespaces: ['default', 'kube-system', $._config.namespace], - }, - }, - - prometheus+:: { - local p = self, - - name:: $._config.prometheus.name, - namespace:: $._config.namespace, - roleBindingNamespaces:: $._config.prometheus.namespaces, - replicas:: $._config.prometheus.replicas, - prometheusRules:: $._config.prometheus.rules, - alertmanagerName:: $.alertmanager.service.metadata.name, - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('prometheus-' + p.name) + - serviceAccount.mixin.metadata.withNamespace(p.namespace), - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - local prometheusPort = servicePort.newNamed('web', 9090, 'web'); - - service.new('prometheus-' + p.name, { app: 'prometheus', prometheus: p.name }, prometheusPort) + - service.mixin.spec.withSessionAffinity('ClientIP') + - service.mixin.metadata.withNamespace(p.namespace) + - service.mixin.metadata.withLabels({ prometheus: p.name }), - - rules: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'PrometheusRule', - metadata: { - labels: { - prometheus: p.name, - role: 'alert-rules', - }, - name: 'prometheus-' + p.name + '-rules', - namespace: p.namespace, - }, - spec: { - groups: p.prometheusRules.groups, - }, - }, - - roleBindingSpecificNamespaces: - local roleBinding = k.rbac.v1.roleBinding; - - local newSpecificRoleBinding(namespace) = - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + p.name) + - roleBinding.mixin.metadata.withNamespace(namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + p.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]); - - local roleBindingList = k3.rbac.v1.roleBindingList; - roleBindingList.new([newSpecificRoleBinding(x) for x in p.roleBindingNamespaces]), - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local nodeMetricsRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources(['nodes/metrics']) + - policyRule.withVerbs(['get']); - - local metricsRule = policyRule.new() + - policyRule.withNonResourceUrls('/metrics') + - policyRule.withVerbs(['get']); - - local rules = [nodeMetricsRule, metricsRule]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('prometheus-' + p.name) + - clusterRole.withRules(rules), - roleConfig: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local configmapRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'configmaps', - ]) + - policyRule.withVerbs(['get']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + p.name + '-config') + - role.mixin.metadata.withNamespace(p.namespace) + - role.withRules(configmapRule), - roleBindingConfig: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + p.name + '-config') + - roleBinding.mixin.metadata.withNamespace(p.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + p.name + '-config') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]), - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('prometheus-' + p.name) + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('prometheus-' + p.name) + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]), - roleSpecificNamespaces: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - local ingressRule = policyRule.new() + - policyRule.withApiGroups(['extensions']) + - policyRule.withResources([ - 'ingresses', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - local newSpecificRole(namespace) = - role.new() + - role.mixin.metadata.withName('prometheus-' + p.name) + - role.mixin.metadata.withNamespace(namespace) + - role.withRules([coreRule, ingressRule]); - - local roleList = k3.rbac.v1.roleList; - roleList.new([newSpecificRole(x) for x in p.roleBindingNamespaces]), - prometheus: - local statefulSet = k.apps.v1.statefulSet; - local container = statefulSet.mixin.spec.template.spec.containersType; - local resourceRequirements = container.mixin.resourcesType; - local selector = statefulSet.mixin.spec.selectorType; - - - local resources = - resourceRequirements.new() + - resourceRequirements.withRequests({ memory: '400Mi' }); - - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'Prometheus', - metadata: { - name: p.name, - namespace: p.namespace, - labels: { - prometheus: p.name, - }, - }, - spec: { - replicas: p.replicas, - version: $._config.versions.prometheus, - image: $._config.imageRepos.prometheus + ':' + $._config.versions.prometheus, - serviceAccountName: 'prometheus-' + p.name, - serviceMonitorSelector: {}, - podMonitorSelector: {}, - probeSelector: {}, - serviceMonitorNamespaceSelector: {}, - podMonitorNamespaceSelector: {}, - probeNamespaceSelector: {}, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - ruleSelector: selector.withMatchLabels({ - role: 'alert-rules', - prometheus: p.name, - }), - resources: resources, - alerting: { - alertmanagers: [ - { - namespace: p.namespace, - name: p.alertmanagerName, - port: 'web', - }, - ], - }, - securityContext: { - runAsUser: 1000, - runAsNonRoot: true, - fsGroup: 2000, - }, - }, - }, - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'prometheus', - namespace: p.namespace, - labels: { - 'k8s-app': 'prometheus', - }, - }, - spec: { - selector: { - matchLabels: { - prometheus: p.name, - }, - }, - endpoints: [ - { - port: 'web', - interval: '30s', - }, - ], - }, - }, - serviceMonitorKubeScheduler: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-scheduler', - namespace: p.namespace, - labels: { - 'k8s-app': 'kube-scheduler', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - interval: '30s', - scheme: 'https', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kube-scheduler', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorKubelet: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kubelet', - namespace: p.namespace, - labels: { - 'k8s-app': 'kubelet', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - scheme: 'https', - interval: '30s', - honorLabels: true, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'), - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - }, - { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/cadvisor', - interval: '30s', - honorLabels: true, - honorTimestamps: false, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - metricRelabelings: [ - // Drop a bunch of metrics which are disabled but still sent, see - // https://github.com/google/cadvisor/issues/1925. - { - sourceLabels: ['__name__'], - regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', - action: 'drop', - }, - ], - }, - { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/probes', - interval: '30s', - honorLabels: true, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kubelet', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorKubeControllerManager: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-controller-manager', - namespace: p.namespace, - labels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - interval: '30s', - scheme: 'https', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [ - { - sourceLabels: ['__name__'], - regex: 'etcd_(debugging|disk|request|server).*', - action: 'drop', - }, - ], - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorApiserver: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-apiserver', - namespace: p.namespace, - labels: { - 'k8s-app': 'apiserver', - }, - }, - spec: { - jobLabel: 'component', - selector: { - matchLabels: { - component: 'apiserver', - provider: 'kubernetes', - }, - }, - namespaceSelector: { - matchNames: [ - 'default', - ], - }, - endpoints: [ - { - port: 'https', - interval: '30s', - scheme: 'https', - tlsConfig: { - caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', - serverName: 'kubernetes', - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [ - { - sourceLabels: ['__name__'], - regex: 'etcd_(debugging|disk|server).*', - action: 'drop', - }, - { - sourceLabels: ['__name__'], - regex: 'apiserver_admission_controller_admission_latencies_seconds_.*', - action: 'drop', - }, - { - sourceLabels: ['__name__'], - regex: 'apiserver_admission_step_admission_latencies_seconds_.*', - action: 'drop', - }, - { - sourceLabels: ['__name__', 'le'], - regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)', - action: 'drop', - }, - ], - }, - ], - }, - }, - serviceMonitorCoreDNS: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'coredns', - namespace: p.namespace, - labels: { - 'k8s-app': 'coredns', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'kube-dns', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'metrics', - interval: '15s', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - }, - ], - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/versions.json b/jsonnet/kube-prometheus/versions.json new file mode 100644 index 00000000..d27a9628 --- /dev/null +++ b/jsonnet/kube-prometheus/versions.json @@ -0,0 +1,12 @@ +{ + "alertmanager": "0.23.0", + "blackboxExporter": "0.19.0", + "grafana": "8.1.3", + "kubeStateMetrics": "2.2.0", + "nodeExporter": "1.2.2", + "prometheus": "2.29.2", + "prometheusAdapter": "0.9.0", + "prometheusOperator": "0.50.0", + "kubeRbacProxy": "0.11.0", + "configmapReload": "0.5.0" +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 883c61e7..9880c654 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,18 +8,18 @@ "subdir": "grafana" } }, - "version": "02ac326459f46d6f30a766ce2f9a45337a745db0", - "sum": "r7kj5f5w7aVB7vO++dI9vbHhzoW8PpyVLSA7gOiouZ0=" + "version": "c3b14b24b83cfe9abf1064649d19e2d679f033fb", + "sum": "YrE4DNQsWgYWs6h0j/FjQETt8xDXdYdsslb1WK7xQEk=" }, { "source": { "git": { "remote": "https://github.com/etcd-io/etcd.git", - "subdir": "Documentation/etcd-mixin" + "subdir": "contrib/mixin" } }, - "version": "7da5182f1d02c0baaefd52f361fd0459d5b6703e", - "sum": "L+PGlPK9mykGCJ9TIoEWdhMBjz+9lKuQ4YZ8fOeP9sk=" + "version": "c2937d78d2722d774f69dbf91a956f382d32f4d3", + "sum": "5XhYOigrKipOWDbIn9hlrz7JcbelzvJnormxSaup9JI=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "grafonnet" } }, - "version": "356bd73e4792ffe107725776ca8946895969c191", - "sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU=" + "version": "05fb200ee1a1816fc1b4c522071d5606d8dd71c1", + "sum": "mEoObbqbyVaXrHFEJSM2Nad31tOvadzIevWuyNHHBgI=" }, { "source": { @@ -38,19 +38,8 @@ "subdir": "grafana-builder" } }, - "version": "fe3e027c5a0d8311e1bd6cd9de2c295707c3ae76", - "sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc=" - }, - { - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib.git", - "subdir": "" - } - }, - "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f", - "sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=", - "name": "ksonnet" + "version": "746874e4836a4bfbb7034d32de0c98ab1282aaae", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" }, { "source": { @@ -59,8 +48,8 @@ "subdir": "" } }, - "version": "8a98e9c6fab000ef090b8d313292043696a8b3bb", - "sum": "btFPZfE2paWZdvLtFwv4gfDoygj1axt7Q4ACGSdeuJ8=" + "version": "2b27a09a667091cef74776b690ccceaf55995e29", + "sum": "j2jPdrcM3iuaUK+6V9jWn2M3Fapr0KtI8FZ1KQoHIGA=" }, { "source": { @@ -69,7 +58,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "8a98e9c6fab000ef090b8d313292043696a8b3bb", + "version": "2b27a09a667091cef74776b690ccceaf55995e29", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -79,8 +68,8 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "89aaf6c524ee891140c4c8f2a05b1b16f5847309", - "sum": "zD/pbQLnQq+5hegEelaheHS8mn1h09GTktFO74iwlBI=" + "version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734", + "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE=" }, { "source": { @@ -89,8 +78,8 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "a4867d8809ba60a59013034646d0a4bc89576b9c", - "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" + "version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734", + "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" }, { "source": { @@ -99,8 +88,9 @@ "subdir": "jsonnet/mixin" } }, - "version": "7134bc321bda7e69db0aacc9a41949167de7a56f", - "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=" + "version": "2c81b0cf6a5673e08057499a08ddce396b19dda4", + "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=", + "name": "prometheus-operator-mixin" }, { "source": { @@ -109,8 +99,19 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "b86ab77239f2a11ee69ad05b24122958d8b2df5b", - "sum": "Zof470kQY377VxlEH5MQJUSbtViNEdLyLPv/P7fX8QQ=" + "version": "2c81b0cf6a5673e08057499a08ddce396b19dda4", + "sum": "WUuFzKqxzxmTWLeic/IU1SMjdCV/zClt11MHucJ9MSc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager.git", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "44011410d7065487789c447ce55157ae6e0b917d", + "sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=", + "name": "alertmanager" }, { "source": { @@ -119,8 +120,8 @@ "subdir": "docs/node-mixin" } }, - "version": "f645d4924224f1f3abab7b20798ca8e24957724c", - "sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8=" + "version": "dc68e035a5b37a9a3b47e1547f07d96df29ba575", + "sum": "OFNs9Te1QMqSscXqNqMv0zwaJoJxaEg7NyQVNyT4VeA=" }, { "source": { @@ -129,10 +130,21 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "00f16d1ac3a4c94561e5133b821d8e4d9ef78ec2", - "sum": "CGxvaHkP7z/gnsLB/8Imvt/AnW+9nJUnTcL+fvIAZUs=", + "version": "46286cb6abfff961e8c257de091443e835ec444f", + "sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=", "name": "prometheus" }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos.git", + "subdir": "mixin" + } + }, + "version": "2dd8c22e8c15f5ec0daaa07ae20be44bed419aa5", + "sum": "X+060DnePPeN/87fgj0SrfxVitywTk8hZA9V4nHxl1g=", + "name": "thanos-mixin" + }, { "source": { "local": { diff --git a/kustomization.yaml b/kustomization.yaml index b067b22f..ffea5b17 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -2,10 +2,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ./manifests/alertmanager-alertmanager.yaml +- ./manifests/alertmanager-podDisruptionBudget.yaml +- ./manifests/alertmanager-prometheusRule.yaml - ./manifests/alertmanager-secret.yaml - ./manifests/alertmanager-service.yaml - ./manifests/alertmanager-serviceAccount.yaml - ./manifests/alertmanager-serviceMonitor.yaml +- ./manifests/blackbox-exporter-clusterRole.yaml +- ./manifests/blackbox-exporter-clusterRoleBinding.yaml +- ./manifests/blackbox-exporter-configuration.yaml +- ./manifests/blackbox-exporter-deployment.yaml +- ./manifests/blackbox-exporter-service.yaml +- ./manifests/blackbox-exporter-serviceAccount.yaml +- ./manifests/blackbox-exporter-serviceMonitor.yaml +- ./manifests/grafana-config.yaml - ./manifests/grafana-dashboardDatasources.yaml - ./manifests/grafana-dashboardDefinitions.yaml - ./manifests/grafana-dashboardSources.yaml @@ -13,15 +23,24 @@ resources: - ./manifests/grafana-service.yaml - ./manifests/grafana-serviceAccount.yaml - ./manifests/grafana-serviceMonitor.yaml +- ./manifests/kube-prometheus-prometheusRule.yaml - ./manifests/kube-state-metrics-clusterRole.yaml - ./manifests/kube-state-metrics-clusterRoleBinding.yaml - ./manifests/kube-state-metrics-deployment.yaml +- ./manifests/kube-state-metrics-prometheusRule.yaml - ./manifests/kube-state-metrics-service.yaml - ./manifests/kube-state-metrics-serviceAccount.yaml - ./manifests/kube-state-metrics-serviceMonitor.yaml +- ./manifests/kubernetes-prometheusRule.yaml +- ./manifests/kubernetes-serviceMonitorApiserver.yaml +- ./manifests/kubernetes-serviceMonitorCoreDNS.yaml +- ./manifests/kubernetes-serviceMonitorKubeControllerManager.yaml +- ./manifests/kubernetes-serviceMonitorKubeScheduler.yaml +- ./manifests/kubernetes-serviceMonitorKubelet.yaml - ./manifests/node-exporter-clusterRole.yaml - ./manifests/node-exporter-clusterRoleBinding.yaml - ./manifests/node-exporter-daemonset.yaml +- ./manifests/node-exporter-prometheusRule.yaml - ./manifests/node-exporter-service.yaml - ./manifests/node-exporter-serviceAccount.yaml - ./manifests/node-exporter-serviceMonitor.yaml @@ -33,27 +52,25 @@ resources: - ./manifests/prometheus-adapter-clusterRoleServerResources.yaml - ./manifests/prometheus-adapter-configMap.yaml - ./manifests/prometheus-adapter-deployment.yaml +- ./manifests/prometheus-adapter-podDisruptionBudget.yaml - ./manifests/prometheus-adapter-roleBindingAuthReader.yaml - ./manifests/prometheus-adapter-service.yaml - ./manifests/prometheus-adapter-serviceAccount.yaml - ./manifests/prometheus-adapter-serviceMonitor.yaml - ./manifests/prometheus-clusterRole.yaml - ./manifests/prometheus-clusterRoleBinding.yaml +- ./manifests/prometheus-operator-prometheusRule.yaml - ./manifests/prometheus-operator-serviceMonitor.yaml +- ./manifests/prometheus-podDisruptionBudget.yaml - ./manifests/prometheus-prometheus.yaml +- ./manifests/prometheus-prometheusRule.yaml - ./manifests/prometheus-roleBindingConfig.yaml - ./manifests/prometheus-roleBindingSpecificNamespaces.yaml - ./manifests/prometheus-roleConfig.yaml - ./manifests/prometheus-roleSpecificNamespaces.yaml -- ./manifests/prometheus-rules.yaml - ./manifests/prometheus-service.yaml - ./manifests/prometheus-serviceAccount.yaml - ./manifests/prometheus-serviceMonitor.yaml -- ./manifests/prometheus-serviceMonitorApiserver.yaml -- ./manifests/prometheus-serviceMonitorCoreDNS.yaml -- ./manifests/prometheus-serviceMonitorKubeControllerManager.yaml -- ./manifests/prometheus-serviceMonitorKubeScheduler.yaml -- ./manifests/prometheus-serviceMonitorKubelet.yaml - ./manifests/setup/0namespace-namespace.yaml - ./manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml - ./manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 55b353a8..42bed1e1 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -3,16 +3,33 @@ kind: Alertmanager metadata: labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: main namespace: monitoring spec: - image: quay.io/prometheus/alertmanager:v0.21.0 + image: quay.io/prometheus/alertmanager:v0.23.0 nodeSelector: kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 replicas: 3 + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 4m + memory: 100Mi securityContext: fsGroup: 2000 runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.21.0 + version: 0.23.0 diff --git a/manifests/alertmanager-podDisruptionBudget.yaml b/manifests/alertmanager-podDisruptionBudget.yaml new file mode 100644 index 00000000..b55b7cdb --- /dev/null +++ b/manifests/alertmanager-podDisruptionBudget.yaml @@ -0,0 +1,18 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 + name: alertmanager-main + namespace: monitoring +spec: + maxUnavailable: 1 + selector: + matchLabels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml new file mode 100644 index 00000000..9f749c99 --- /dev/null +++ b/manifests/alertmanager-prometheusRule.yaml @@ -0,0 +1,138 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 + prometheus: k8s + role: alert-rules + name: alertmanager-main-rules + namespace: monitoring +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 20c205fb..f265e096 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,12 @@ apiVersion: v1 kind: Secret metadata: + labels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring stringData: diff --git a/manifests/alertmanager-service.yaml b/manifests/alertmanager-service.yaml index df4c9ff5..f3f6cf71 100644 --- a/manifests/alertmanager-service.yaml +++ b/manifests/alertmanager-service.yaml @@ -3,6 +3,10 @@ kind: Service metadata: labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring spec: @@ -13,4 +17,7 @@ spec: selector: alertmanager: main app: alertmanager + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus sessionAffinity: ClientIP diff --git a/manifests/alertmanager-serviceAccount.yaml b/manifests/alertmanager-serviceAccount.yaml index 5c06d5e4..ba806b50 100644 --- a/manifests/alertmanager-serviceAccount.yaml +++ b/manifests/alertmanager-serviceAccount.yaml @@ -1,5 +1,11 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring diff --git a/manifests/alertmanager-serviceMonitor.yaml b/manifests/alertmanager-serviceMonitor.yaml index 548af0d6..070ef530 100644 --- a/manifests/alertmanager-serviceMonitor.yaml +++ b/manifests/alertmanager-serviceMonitor.yaml @@ -2,7 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: alertmanager + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager namespace: monitoring spec: @@ -12,3 +15,6 @@ spec: selector: matchLabels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/blackbox-exporter-clusterRole.yaml b/manifests/blackbox-exporter-clusterRole.yaml new file mode 100644 index 00000000..c7824058 --- /dev/null +++ b/manifests/blackbox-exporter-clusterRole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: blackbox-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/manifests/blackbox-exporter-clusterRoleBinding.yaml b/manifests/blackbox-exporter-clusterRoleBinding.yaml new file mode 100644 index 00000000..7b3ae320 --- /dev/null +++ b/manifests/blackbox-exporter-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: blackbox-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: blackbox-exporter +subjects: +- kind: ServiceAccount + name: blackbox-exporter + namespace: monitoring diff --git a/manifests/blackbox-exporter-configuration.yaml b/manifests/blackbox-exporter-configuration.yaml new file mode 100644 index 00000000..35bfad12 --- /dev/null +++ b/manifests/blackbox-exporter-configuration.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +data: + config.yml: |- + "modules": + "http_2xx": + "http": + "preferred_ip_protocol": "ip4" + "prober": "http" + "http_post_2xx": + "http": + "method": "POST" + "preferred_ip_protocol": "ip4" + "prober": "http" + "irc_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "send": "NICK prober" + - "send": "USER prober prober prober :prober" + - "expect": "PING :([^ ]+)" + "send": "PONG ${1}" + - "expect": "^:[^ ]+ 001" + "pop3s_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^+OK" + "tls": true + "tls_config": + "insecure_skip_verify": false + "ssh_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^SSH-2.0-" + "tcp_connect": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter-configuration + namespace: monitoring diff --git a/manifests/blackbox-exporter-deployment.yaml b/manifests/blackbox-exporter-deployment.yaml new file mode 100644 index 00000000..e47166bd --- /dev/null +++ b/manifests/blackbox-exporter-deployment.yaml @@ -0,0 +1,99 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: blackbox-exporter + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + spec: + containers: + - args: + - --config.file=/etc/blackbox_exporter/config.yml + - --web.listen-address=:19115 + image: quay.io/prometheus/blackbox-exporter:v0.19.0 + name: blackbox-exporter + ports: + - containerPort: 19115 + name: http + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --webhook-url=http://localhost:19115/-/reload + - --volume-dir=/etc/blackbox_exporter/ + image: jimmidyson/configmap-reload:v0.5.0 + name: module-configmap-reloader + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --logtostderr + - --secure-listen-address=:9115 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:19115/ + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 + name: kube-rbac-proxy + ports: + - containerPort: 9115 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: blackbox-exporter + volumes: + - configMap: + name: blackbox-exporter-configuration + name: config diff --git a/manifests/blackbox-exporter-service.yaml b/manifests/blackbox-exporter-service.yaml new file mode 100644 index 00000000..58ffb394 --- /dev/null +++ b/manifests/blackbox-exporter-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + ports: + - name: https + port: 9115 + targetPort: https + - name: probe + port: 19115 + targetPort: http + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/blackbox-exporter-serviceAccount.yaml b/manifests/blackbox-exporter-serviceAccount.yaml new file mode 100644 index 00000000..ac2acefb --- /dev/null +++ b/manifests/blackbox-exporter-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: blackbox-exporter + namespace: monitoring diff --git a/manifests/blackbox-exporter-serviceMonitor.yaml b/manifests/blackbox-exporter-serviceMonitor.yaml new file mode 100644 index 00000000..d7c05825 --- /dev/null +++ b/manifests/blackbox-exporter-serviceMonitor.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + path: /metrics + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/grafana-config.yaml b/manifests/grafana-config.yaml new file mode 100644 index 00000000..eeece25e --- /dev/null +++ b/manifests/grafana-config.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-config + namespace: monitoring +stringData: + grafana.ini: | + [date_formats] + default_timezone = UTC +type: Opaque diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 22d47488..c3cce22f 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -1,8 +1,27 @@ apiVersion: v1 -data: - datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0= kind: Secret metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-datasources namespace: monitoring +stringData: + datasources.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access": "proxy", + "editable": false, + "name": "prometheus", + "orgId": 1, + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090", + "version": 1 + } + ] + } type: Opaque diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e11e95af..a80d3de2 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1,5 +1,604 @@ apiVersion: v1 items: +- apiVersion: v1 + data: + alertmanager-overview.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Received", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Invalid", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts receive rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Total", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Failed", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notifications Send Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} 99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notification Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Notifications", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-alertmanager-overview + namespace: monitoring - apiVersion: v1 data: apiserver.json: |- @@ -142,7 +741,7 @@ items: "dashes": false, "datasource": "$datasource", "decimals": 3, - "description": "How much error budget is left looking at our 0.990% availability gurantees?", + "description": "How much error budget is left looking at our 0.990% availability guarantees?", "fill": 10, "fillGradient": 0, "gridPos": { @@ -1729,6 +2328,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-apiserver namespace: monitoring - apiVersion: v1 @@ -1839,7 +2443,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -1942,7 +2546,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2243,7 +2847,7 @@ items: ], "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2252,7 +2856,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2261,7 +2865,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2270,7 +2874,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2279,7 +2883,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2288,7 +2892,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2297,7 +2901,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2306,7 +2910,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "table", "instant": true, "intervalFactor": 2, @@ -2386,7 +2990,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2489,7 +3093,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2620,7 +3224,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2721,7 +3325,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2833,7 +3437,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2934,7 +3538,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -3055,7 +3659,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -3156,7 +3760,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -3261,7 +3865,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(rate(node_netstat_Tcp_RetransSegs[$interval:$resolution]) / rate(node_netstat_Tcp_OutSegs[$interval:$resolution])) by (instance))", + "expr": "sort_desc(sum(rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$interval:$resolution]) / rate(node_netstat_Tcp_OutSegs{cluster=\"$cluster\"}[$interval:$resolution])) by (instance))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -3366,7 +3970,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(rate(node_netstat_TcpExt_TCPSynRetrans[$interval:$resolution]) / rate(node_netstat_Tcp_RetransSegs[$interval:$resolution])) by (instance))", + "expr": "sort_desc(sum(rate(node_netstat_TcpExt_TCPSynRetrans{cluster=\"$cluster\"}[$interval:$resolution]) / rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$interval:$resolution])) by (instance))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -3530,6 +4134,32 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -3569,6 +4199,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-cluster-total namespace: monitoring - apiVersion: v1 @@ -3661,7 +4296,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-controller-manager\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-controller-manager\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -3730,10 +4365,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -3836,10 +4471,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -3942,10 +4577,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -4162,7 +4797,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -4268,7 +4903,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -4374,7 +5009,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4467,7 +5102,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4560,7 +5195,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4644,6 +5279,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kube-controller-manager\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -4654,7 +5315,7 @@ items: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-controller-manager\"}, instance)", + "query": "label_values(up{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -4704,6 +5365,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-controller-manager namespace: monitoring - apiVersion: v1 @@ -4768,7 +5434,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__interval]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -4852,7 +5518,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -4936,7 +5602,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5020,7 +5686,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5104,7 +5770,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5188,7 +5854,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5283,7 +5949,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -5574,7 +6240,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5583,7 +6249,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5592,7 +6258,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5601,7 +6267,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5610,7 +6276,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6010,7 +6676,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6019,7 +6685,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6028,7 +6694,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6037,7 +6703,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6294,7 +6960,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6303,7 +6969,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6312,7 +6978,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6321,7 +6987,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6330,7 +6996,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6339,7 +7005,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6394,7 +7060,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -6434,12 +7100,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6486,19 +7152,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6532,12 +7186,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6590,7 +7244,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -6630,12 +7284,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6682,19 +7336,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6728,12 +7370,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6786,7 +7428,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Namespace", "titleSize": "h6" }, { @@ -6826,12 +7468,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6862,7 +7504,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6878,19 +7520,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6924,12 +7554,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6960,7 +7590,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6982,7 +7612,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -7022,12 +7652,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -7058,7 +7688,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -7074,19 +7704,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -7120,12 +7738,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -7154,6 +7772,191 @@ items: ] }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rate of Packets Dropped", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, "yaxes": [ { "format": "Bps", @@ -7178,7 +7981,312 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -7220,7 +8328,7 @@ items: "options": [ ], - "query": "label_values(node_cpu_seconds_total, cluster)", + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", "refresh": 2, "regex": "", "sort": 1, @@ -7270,6 +8378,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-cluster namespace: monitoring - apiVersion: v1 @@ -7333,7 +8446,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7417,7 +8530,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7501,7 +8614,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7511,7 +8624,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -7585,7 +8698,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7677,8 +8790,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -7687,8 +8801,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -7699,7 +8814,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -7950,7 +9065,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7959,7 +9074,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7968,7 +9083,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7977,7 +9092,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7986,7 +9101,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8083,8 +9198,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -8093,8 +9209,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -8422,7 +9539,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8431,7 +9548,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8440,7 +9557,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8449,7 +9566,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8733,7 +9850,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8742,7 +9859,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8751,7 +9868,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8760,7 +9877,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8769,7 +9886,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8778,7 +9895,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8833,7 +9950,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -8873,12 +9990,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8925,19 +10042,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -8971,12 +10076,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9029,7 +10134,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -9069,12 +10174,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9105,7 +10210,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -9121,19 +10226,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -9167,12 +10260,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9203,7 +10296,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -9225,7 +10318,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -9265,12 +10358,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9301,7 +10394,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -9317,19 +10410,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -9363,12 +10444,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9397,6 +10478,191 @@ items: ] }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rate of Packets Dropped", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, "yaxes": [ { "format": "Bps", @@ -9421,7 +10687,312 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -9464,7 +11035,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -9491,7 +11062,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -9540,6 +11111,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-namespace namespace: monitoring - apiVersion: v1 @@ -9602,7 +11178,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9837,7 +11413,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9846,7 +11422,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9855,7 +11431,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9864,7 +11440,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9873,7 +11449,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10274,7 +11850,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10283,7 +11859,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10292,7 +11868,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10301,7 +11877,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10426,7 +12002,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -10453,7 +12029,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -10502,6 +12078,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-node namespace: monitoring - apiVersion: v1 @@ -10581,7 +12162,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10589,7 +12170,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -10597,7 +12178,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -10695,7 +12276,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10937,7 +12518,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10946,7 +12527,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10955,7 +12536,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10964,7 +12545,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10973,7 +12554,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11071,7 +12652,7 @@ items: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -11081,7 +12662,7 @@ items: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -11092,7 +12673,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -11100,7 +12681,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -11108,7 +12689,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -11121,7 +12702,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -11212,7 +12793,7 @@ items: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -11400,7 +12981,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11409,7 +12990,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11418,7 +12999,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11427,7 +13008,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11436,7 +13017,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11559,12 +13140,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11611,19 +13192,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -11658,12 +13227,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11716,7 +13285,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -11757,12 +13326,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11793,7 +13362,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -11809,19 +13378,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -11856,12 +13413,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11892,7 +13449,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -11914,7 +13471,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -11955,12 +13512,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11991,7 +13548,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -12007,19 +13564,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -12054,12 +13599,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12088,6 +13633,207 @@ items: ] }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rate of Packets Dropped", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, "yaxes": [ { "format": "Bps", @@ -12112,7 +13858,497 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution(Pod - Read & Writes)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution(Containers)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -12155,7 +14391,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -12182,7 +14418,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -12258,6 +14494,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-pod namespace: monitoring - apiVersion: v1 @@ -12320,7 +14561,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12555,7 +14796,7 @@ items: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12564,7 +14805,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12573,7 +14814,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12582,7 +14823,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12591,7 +14832,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12935,7 +15176,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12944,7 +15185,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12953,7 +15194,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12962,7 +15203,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13219,7 +15460,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13228,7 +15469,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13237,7 +15478,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13246,7 +15487,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13255,7 +15496,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13264,7 +15505,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13319,7 +15560,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -13359,12 +15600,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13411,19 +15652,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -13457,12 +15686,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13515,7 +15744,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -13555,12 +15784,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13607,19 +15836,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -13653,12 +15870,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13711,7 +15928,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Pod", "titleSize": "h6" }, { @@ -13751,12 +15968,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13787,7 +16004,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -13803,19 +16020,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -13849,12 +16054,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13885,7 +16090,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -13907,7 +16112,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -13947,12 +16152,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13983,7 +16188,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -13999,19 +16204,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -14045,12 +16238,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -14081,7 +16274,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -14103,7 +16296,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -14146,7 +16339,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14173,7 +16366,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14200,7 +16393,7 @@ items: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14227,7 +16420,7 @@ items: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14276,6 +16469,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-workload namespace: monitoring - apiVersion: v1 @@ -14335,8 +16533,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -14345,8 +16544,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -14357,7 +16557,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -14655,7 +16855,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14664,7 +16864,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14673,7 +16873,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14682,7 +16882,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14691,7 +16891,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14788,8 +16988,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -14798,8 +16999,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -15117,7 +17319,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15126,7 +17328,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15135,7 +17337,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15144,7 +17346,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15420,7 +17622,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15429,7 +17631,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15438,7 +17640,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15447,7 +17649,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15456,7 +17658,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15465,7 +17667,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15520,7 +17722,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -15560,12 +17762,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15612,19 +17814,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -15658,12 +17848,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15716,7 +17906,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -15756,12 +17946,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15808,19 +17998,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -15854,12 +18032,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15912,7 +18090,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Workload", "titleSize": "h6" }, { @@ -15952,12 +18130,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15988,7 +18166,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16004,19 +18182,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -16050,12 +18216,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -16086,7 +18252,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16108,7 +18274,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -16148,12 +18314,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -16184,7 +18350,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16200,19 +18366,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -16246,12 +18400,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -16282,7 +18436,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16304,7 +18458,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -16331,38 +18485,6 @@ items: "regex": "", "type": "datasource" }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "deployment", - "value": "deployment" - }, - "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "type", - "options": [ - - ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "current": { @@ -16379,7 +18501,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -16390,6 +18512,38 @@ items: "type": "query", "useTags": false }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "deployment", + "value": "deployment" + }, + "datasource": "$datasource", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "type", + "options": [ + + ], + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -16406,7 +18560,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -16455,6 +18609,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-workloads-namespace namespace: monitoring - apiVersion: v1 @@ -16479,2385 +18638,2100 @@ items: "id": null, "links": [ + ], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Kubelets", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Config Error Count", + "transparent": false, + "type": "stat" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Request duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 77 + }, + "id": 23, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 77 + }, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } ], "refresh": "10s", "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{verb}} {{url}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 25, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } ], "schemaVersion": 14, "style": "dark", @@ -18896,7 +20770,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, cluster)", + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)", "refresh": 2, "regex": "", "sort": 1, @@ -18972,6 +20846,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-kubelet namespace: monitoring - apiVersion: v1 @@ -19131,7 +21010,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution]))", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution]))", "format": "time_series", "instant": null, "intervalFactor": 1, @@ -19258,7 +21137,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution]))", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution]))", "format": "time_series", "instant": null, "intervalFactor": 1, @@ -19490,7 +21369,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19499,7 +21378,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19508,7 +21387,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19517,7 +21396,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19526,7 +21405,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19535,7 +21414,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -19623,7 +21502,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -19724,7 +21603,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -19836,7 +21715,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -19937,7 +21816,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -20058,7 +21937,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -20159,7 +22038,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -20244,6 +22123,32 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": ".+", "auto": false, @@ -20254,7 +22159,7 @@ items: "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, "includeAll": true, "label": null, @@ -20263,8 +22168,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -20394,6 +22299,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-namespace-by-pod namespace: monitoring - apiVersion: v1 @@ -20504,7 +22414,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ workload }}", @@ -20607,7 +22517,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ workload }}", @@ -20908,7 +22818,7 @@ items: ], "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20917,7 +22827,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20926,7 +22836,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20935,7 +22845,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20944,7 +22854,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20953,7 +22863,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20962,7 +22872,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -20971,7 +22881,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -21051,7 +22961,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ workload }}", @@ -21154,7 +23064,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ workload }}", @@ -21285,7 +23195,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21386,7 +23296,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21498,7 +23408,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21599,7 +23509,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21720,7 +23630,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21821,7 +23731,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{workload}}", @@ -21906,6 +23816,32 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "auto": false, @@ -21916,7 +23852,7 @@ items: "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, "includeAll": false, "label": null, @@ -21925,8 +23861,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -21948,7 +23884,7 @@ items: "value": "deployment" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "hide": 0, "includeAll": false, "label": null, @@ -21957,8 +23893,8 @@ items: "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -22088,29 +24024,41 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-namespace-by-workload namespace: monitoring - apiVersion: v1 data: node-cluster-rsrc-use.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ ] }, - "editable": true, + "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, + "id": null, "links": [ ], - "refresh": "10s", + "refresh": "30s", "rows": [ { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -22121,26 +24069,34 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -22150,12 +24106,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "expr": "((\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n *\n instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}\n) != 0 )\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}))\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 + "legendFormat": "{{ instance }}", + "refId": "A" } ], "thresholds": [ @@ -22165,8 +24120,8 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -22184,17 +24139,17 @@ items: "format": "percentunit", "label": null, "logBase": 1, - "max": 1, - "min": 0, + "max": null, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -22207,875 +24162,20 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (load1 per CPU)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { + "fillGradient": 0, + "gridPos": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, "id": 3, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Saturation (Major Page Faults)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "rps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/ Receive/", - "stack": "A" - }, - { - "alias": "/ Transmit/", - "stack": "B", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Receive", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - }, - { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Transmit", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Utilisation (Bytes Receive/Transmit)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/ Receive/", - "stack": "A" - }, - { - "alias": "/ Transmit/", - "stack": "B", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Receive", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - }, - { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Transmit", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Saturation (Drops Receive/Transmit)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "rps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{device}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{device}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Saturation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk IO", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Space Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk Space", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "USE Method / Cluster", - "uid": "3e97d1d02672cdd0861f4c97c64f89b2", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-node-cluster-rsrc-use - namespace: monitoring -- apiVersion: v1 - data: - node-rsrc-use.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23084,112 +24184,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23199,8 +24213,8 @@ items: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23219,16 +24233,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23238,11 +24252,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "CPU", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23252,14 +24267,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 3, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23268,26 +24290,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\", job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Memory", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23297,8 +24319,8 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23317,16 +24339,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23338,14 +24360,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 4, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23354,26 +24383,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Major page faults", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23383,8 +24412,8 @@ items: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23399,20 +24428,20 @@ items: }, "yaxes": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", + "format": "rds", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true + }, + { + "format": "rds", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ] } @@ -23422,11 +24451,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Memory", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23436,14 +24466,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 5, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23452,11 +24489,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ { "alias": "/Receive/", @@ -23470,24 +24508,22 @@ items: ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Receive", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Receive", + "refId": "A" }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Transmit", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Transmit", + "refId": "B" } ], "thresholds": [ @@ -23495,10 +24531,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Utilisation (Bytes Receive/Transmit)", + "title": "Network Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23521,12 +24557,12 @@ items: "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23538,14 +24574,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 6, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23554,42 +24597,41 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ { - "alias": "/Receive/", + "alias": "/ Receive/", "stack": "A" }, { - "alias": "/Transmit/", + "alias": "/ Transmit/", "stack": "B", "transform": "negative-Y" } ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Receive drops", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Receive", + "refId": "A" }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Transmit drops", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Transmit", + "refId": "B" } ], "thresholds": [ @@ -23597,10 +24639,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Saturation (Drops Receive/Transmit)", + "title": "Network Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23615,7 +24657,7 @@ items: }, "yaxes": [ { - "format": "rps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -23623,12 +24665,12 @@ items: "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23637,12 +24679,13 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Net", - "titleSize": "h6" + "title": "Network", + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23652,14 +24695,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 7, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23668,26 +24718,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} {{device}}", + "refId": "A" } ], "thresholds": [ @@ -23697,8 +24747,8 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23717,16 +24767,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23738,14 +24788,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 8, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23754,26 +24811,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} {{device}}", + "refId": "A" } ], "thresholds": [ @@ -23783,8 +24840,8 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23803,16 +24860,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23822,11 +24879,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Disk IO", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23836,14 +24894,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 9, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23852,26 +24917,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "1 -\n(\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\"})\n/\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\"})\n)\n", + "expr": "sum without (device) (\n max without (fstype, mountpoint) ((\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"}\n ) != 0)\n)\n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"})))\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23881,8 +24946,8 @@ items: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23901,16 +24966,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23920,20 +24985,21 @@ items: "repeatRowId": null, "showTitle": true, "title": "Disk Space", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, "style": "dark", "tags": [ - + "node-exporter-mixin" ], "templating": { "list": [ { "current": { - "text": "default", - "value": "default" + "text": "Prometheus", + "value": "Prometheus" }, "hide": 0, "label": null, @@ -23949,22 +25015,22 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", - "hide": 0, + "hide": 2, "includeAll": false, - "label": "instance", + "label": null, "multi": false, - "name": "instance", + "name": "cluster", "options": [ ], - "query": "label_values(up{job=\"node-exporter\"}, instance)", - "refresh": 1, + "query": "label_values(node_time_seconds, cluster)", + "refresh": 2, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -24004,13 +25070,1095 @@ items: "30d" ] }, - "timezone": "UTC", - "title": "USE Method / Node", - "uid": "fac67cfbe174d3ef53eb473d73d9212f", + "timezone": "utc", + "title": "Node Exporter / USE Method / Cluster", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-node-cluster-rsrc-use + namespace: monitoring +- apiVersion: v1 + data: + node-rsrc-use.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1 per CPU)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Major page Faults", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Major Page Faults)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "rds", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "rds", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/Receive/", + "stack": "A" + }, + { + "alias": "/Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Receive", + "refId": "A" + }, + { + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Transmit", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Utilisation (Bytes Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/ Receive/", + "stack": "A" + }, + { + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Receive", + "refId": "A" + }, + { + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Transmit", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Saturation (Drops Receive/Transmit)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Network", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk IO", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(1 -\n (\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n /\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n ) != 0\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Space Utilisation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk Space", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "node-exporter-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_time_seconds, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(node_exporter_build_info{job=\"node-exporter\", cluster=\"$cluster\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Node Exporter / USE Method / Node", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-node-rsrc-use namespace: monitoring - apiVersion: v1 @@ -24030,13 +26178,13 @@ items: }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "refresh": "", + "refresh": "30s", "rows": [ { "collapse": false, @@ -24088,9 +26236,8 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", "format": "time_series", - "interval": "1m", "intervalFactor": 5, "legendFormat": "{{cpu}}", "refId": "A" @@ -24103,7 +26250,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24217,7 +26364,7 @@ items: "timeShift": null, "title": "Load Average", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24344,7 +26491,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24447,9 +26594,6 @@ items: ], "thresholds": "80, 90", "title": "Memory Usage", - "tooltip": { - "shared": false - }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -24527,25 +26671,22 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} read", "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} written", "refId": "B" }, { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} io time", "refId": "C" @@ -24558,7 +26699,7 @@ items: "timeShift": null, "title": "Disk I/O", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24665,7 +26806,7 @@ items: "timeShift": null, "title": "Disk Space Usage", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24757,9 +26898,8 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -24772,7 +26912,7 @@ items: "timeShift": null, "title": "Network Received", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24851,9 +26991,8 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -24866,7 +27005,7 @@ items: "timeShift": null, "title": "Network Transmitted", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -24912,7 +27051,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "node-exporter-mixin" ], "templating": { "list": [ @@ -24989,13 +27128,17 @@ items: "30d" ] }, - "timezone": "UTC", - "title": "Nodes", - "uid": "fa49a4706d07a042595b664c87fb33ea", + "timezone": "utc", + "title": "Node Exporter / Nodes", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-nodes namespace: monitoring - apiVersion: v1 @@ -25073,14 +27216,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used Space", "refId": "A" }, { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free Space", @@ -25188,7 +27331,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -25270,14 +27413,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used inodes", "refId": "A" }, { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": " Free inodes", @@ -25385,7 +27528,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -25557,6 +27700,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-persistentvolumesusage namespace: monitoring - apiVersion: v1 @@ -25716,7 +27864,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", "format": "time_series", "instant": null, "intervalFactor": 1, @@ -25843,7 +27991,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", "format": "time_series", "instant": null, "intervalFactor": 1, @@ -25940,7 +28088,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26041,7 +28189,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26153,7 +28301,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26254,7 +28402,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26375,7 +28523,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26476,7 +28624,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -26561,6 +28709,32 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": ".+", "auto": false, @@ -26571,7 +28745,7 @@ items: "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, "includeAll": true, "label": null, @@ -26580,8 +28754,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -26603,7 +28777,7 @@ items: "value": "" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", "hide": 0, "includeAll": false, "label": null, @@ -26612,8 +28786,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -26743,6 +28917,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-pod-total namespace: monitoring - apiVersion: v1 @@ -26768,7 +28947,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "60s", "rows": [ { "collapse": false, @@ -26820,7 +28999,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} != 0)\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -26913,7 +29092,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", + "expr": "clamp_min(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n, 0)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27019,7 +29198,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n- \n (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27604,7 +29783,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27909,7 +30088,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28002,7 +30181,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28095,7 +30274,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28248,7 +30427,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -28392,11 +30571,16 @@ items: ] }, "timezone": "browser", - "title": "Prometheus Remote Write", + "title": "Prometheus / Remote Write", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-prometheus-remote-write namespace: monitoring - apiVersion: v1 @@ -28415,7 +30599,7 @@ items: "links": [ ], - "refresh": "10s", + "refresh": "60s", "rows": [ { "collapse": false, @@ -28602,7 +30786,7 @@ items: "timeShift": null, "title": "Prometheus Stats", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -28701,7 +30885,7 @@ items: "timeShift": null, "title": "Target Sync", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -28787,7 +30971,7 @@ items: "timeShift": null, "title": "Targets", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -28885,7 +31069,7 @@ items: "timeShift": null, "title": "Average Scrape Interval Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -28955,6 +31139,14 @@ items: "stack": true, "steppedLine": false, "targets": [ + { + "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "exceeded body size limit: {{job}}", + "legendLink": null, + "step": 10 + }, { "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))", "format": "time_series", @@ -28995,7 +31187,7 @@ items: "timeShift": null, "title": "Scrape failures", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29081,7 +31273,7 @@ items: "timeShift": null, "title": "Appended Samples", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29179,7 +31371,7 @@ items: "timeShift": null, "title": "Head Series", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29265,7 +31457,7 @@ items: "timeShift": null, "title": "Head Chunks", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29363,7 +31555,7 @@ items: "timeShift": null, "title": "Query Rate", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29449,7 +31641,7 @@ items: "timeShift": null, "title": "Stage Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -29494,7 +31686,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -29515,7 +31707,7 @@ items: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -29530,7 +31722,7 @@ items: "options": [ ], - "query": "label_values(prometheus_build_info, job)", + "query": "label_values(prometheus_build_info{job=\"prometheus-k8s\",namespace=\"monitoring\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -29543,7 +31735,7 @@ items: "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -29558,7 +31750,7 @@ items: "options": [ ], - "query": "label_values(prometheus_build_info, instance)", + "query": "label_values(prometheus_build_info{job=~\"$job\"}, instance)", "refresh": 1, "regex": "", "sort": 2, @@ -29602,12 +31794,17 @@ items: ] }, "timezone": "utc", - "title": "Prometheus Overview", + "title": "Prometheus / Overview", "uid": "", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-prometheus namespace: monitoring - apiVersion: v1 @@ -29700,7 +31897,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-proxy\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-proxy\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -29769,7 +31966,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -29862,7 +32059,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -29968,7 +32165,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -30061,7 +32258,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -30167,28 +32364,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -30281,7 +32478,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -30387,7 +32584,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -30493,7 +32690,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -30586,7 +32783,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-proxy\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -30679,7 +32876,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -30763,6 +32960,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -30773,7 +32996,7 @@ items: "options": [ ], - "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\"}, instance)", + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -30823,6 +33046,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-proxy namespace: monitoring - apiVersion: v1 @@ -30915,7 +33143,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-scheduler\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-scheduler\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -30984,31 +33212,31 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "{{cluster}} {{instance}} e2e", "refId": "A" }, { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} binding", + "legendFormat": "{{cluster}} {{instance}} binding", "refId": "B" }, { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", "refId": "C" }, { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} volume", + "legendFormat": "{{cluster}} {{instance}} volume", "refId": "D" } ], @@ -31098,31 +33326,31 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "{{cluster}} {{instance}} e2e", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} binding", + "legendFormat": "{{cluster}} {{instance}} binding", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", "refId": "C" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} volume", + "legendFormat": "{{cluster}} {{instance}} volume", "refId": "D" } ], @@ -31225,28 +33453,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -31339,7 +33567,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -31445,7 +33673,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -31551,7 +33779,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-scheduler\", instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31644,7 +33872,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31737,7 +33965,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-scheduler\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31821,6 +34049,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kube-scheduler\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -31831,7 +34085,7 @@ items: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-scheduler\"}, instance)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -31881,920 +34135,13 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-scheduler namespace: monitoring -- apiVersion: v1 - data: - statefulset.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "CPU", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Memory", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "Bps", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Network", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Replicas", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Replicas of current version", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Observed Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Metadata Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas specified", - "refId": "A" - }, - { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas created", - "refId": "B" - }, - { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "ready", - "refId": "C" - }, - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas of current version", - "refId": "D" - }, - { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "E" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Name", - "multi": false, - "name": "statefulset", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / StatefulSets", - "uid": "a31c1f46e6f727cb37c0d731a7245005", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-statefulset - namespace: monitoring - apiVersion: v1 data: workload-total.json: |- @@ -32903,7 +34250,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33006,7 +34353,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33120,7 +34467,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33223,7 +34570,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33354,7 +34701,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33455,7 +34802,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33567,7 +34914,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33668,7 +35015,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33789,7 +35136,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33890,7 +35237,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33975,6 +35322,32 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": ".+", "auto": false, @@ -33985,7 +35358,7 @@ items: "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, "includeAll": true, "label": null, @@ -33994,8 +35367,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -34017,7 +35390,7 @@ items: "value": "" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\"}, workload)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", "hide": 0, "includeAll": false, "label": null, @@ -34026,8 +35399,8 @@ items: "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\"}, workload)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -34049,7 +35422,7 @@ items: "value": "deployment" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", "hide": 0, "includeAll": false, "label": null, @@ -34058,8 +35431,8 @@ items: "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -34189,6 +35562,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-workload-total namespace: monitoring kind: ConfigMapList diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml index fffec986..f6e281eb 100644 --- a/manifests/grafana-dashboardSources.yaml +++ b/manifests/grafana-dashboardSources.yaml @@ -6,6 +6,7 @@ data: "providers": [ { "folder": "Default", + "folderUid": "", "name": "0", "options": { "path": "/grafana-dashboard-definitions/0" @@ -17,5 +18,10 @@ data: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboards namespace: monitoring diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 589b1ade..4b239280 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -2,24 +2,34 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: replicas: 1 selector: matchLabels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus template: metadata: annotations: - checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a + checksum/grafana-config: e1f5b84a1d40edb8a6527c98d24ff656 + checksum/grafana-dashboardproviders: 2c7c248e5512bb5576d633004725159c + checksum/grafana-datasources: b2cbbea3079b8634b7bdf42cb56c1537 labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 spec: containers: - env: [] - image: grafana/grafana:7.1.0 + image: grafana/grafana:8.1.3 name: grafana ports: - containerPort: 3000 @@ -45,6 +55,9 @@ spec: - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false + - mountPath: /grafana-dashboard-definitions/0/alertmanager-overview + name: grafana-dashboard-alertmanager-overview + readOnly: false - mountPath: /grafana-dashboard-definitions/0/apiserver name: grafana-dashboard-apiserver readOnly: false @@ -108,14 +121,14 @@ spec: - mountPath: /grafana-dashboard-definitions/0/scheduler name: grafana-dashboard-scheduler readOnly: false - - mountPath: /grafana-dashboard-definitions/0/statefulset - name: grafana-dashboard-statefulset - readOnly: false - mountPath: /grafana-dashboard-definitions/0/workload-total name: grafana-dashboard-workload-total readOnly: false + - mountPath: /etc/grafana + name: grafana-config + readOnly: false nodeSelector: - beta.kubernetes.io/os: linux + kubernetes.io/os: linux securityContext: fsGroup: 65534 runAsNonRoot: true @@ -130,6 +143,9 @@ spec: - configMap: name: grafana-dashboards name: grafana-dashboards + - configMap: + name: grafana-dashboard-alertmanager-overview + name: grafana-dashboard-alertmanager-overview - configMap: name: grafana-dashboard-apiserver name: grafana-dashboard-apiserver @@ -193,9 +209,9 @@ spec: - configMap: name: grafana-dashboard-scheduler name: grafana-dashboard-scheduler - - configMap: - name: grafana-dashboard-statefulset - name: grafana-dashboard-statefulset - configMap: name: grafana-dashboard-workload-total name: grafana-dashboard-workload-total + - name: grafana-config + secret: + secretName: grafana-config diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 5e7e1453..258a9720 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -2,7 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: @@ -11,5 +14,6 @@ spec: port: 3000 targetPort: http selector: - app: grafana - type: NodePort + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/grafana-serviceMonitor.yaml b/manifests/grafana-serviceMonitor.yaml index 7ede266a..fedfd40c 100644 --- a/manifests/grafana-serviceMonitor.yaml +++ b/manifests/grafana-serviceMonitor.yaml @@ -1,6 +1,11 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: @@ -9,4 +14,4 @@ spec: port: http selector: matchLabels: - app: grafana + app.kubernetes.io/name: grafana diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml new file mode 100644 index 00000000..f9778154 --- /dev/null +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index c04db290..9b7d81f1 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics rules: - apiGroups: @@ -24,16 +26,6 @@ rules: verbs: - list - watch -- apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - - ingresses - verbs: - - list - - watch - apiGroups: - apps resources: @@ -105,6 +97,14 @@ rules: - networking.k8s.io resources: - networkpolicies + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases verbs: - list - watch diff --git a/manifests/kube-state-metrics-clusterRoleBinding.yaml b/manifests/kube-state-metrics-clusterRoleBinding.yaml index c8f9434d..43243f70 100644 --- a/manifests/kube-state-metrics-clusterRoleBinding.yaml +++ b/manifests/kube-state-metrics-clusterRoleBinding.yaml @@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index b54e6414..caf97030 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -2,20 +2,28 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: replicas: 1 selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus template: metadata: + annotations: + kubectl.kubernetes.io/default-container: kube-state-metrics labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 spec: containers: - args: @@ -23,32 +31,59 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.9.7 + image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.2.0 name: kube-state-metrics + resources: + limits: + cpu: 100m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + runAsUser: 65534 - args: - --logtostderr - --secure-listen-address=:8443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8081/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy-main ports: - containerPort: 8443 name: https-main + resources: + limits: + cpu: 40m + memory: 40Mi + requests: + cpu: 20m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 - args: - --logtostderr - --secure-listen-address=:9443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8082/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy-self ports: - containerPort: 9443 name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 nodeSelector: kubernetes.io/os: linux serviceAccountName: kube-state-metrics diff --git a/manifests/kube-state-metrics-prometheusRule.yaml b/manifests/kube-state-metrics-prometheusRule.yaml new file mode 100644 index 00000000..f90f6343 --- /dev/null +++ b/manifests/kube-state-metrics-prometheusRule.yaml @@ -0,0 +1,65 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: monitoring +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: | + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: | + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical diff --git a/manifests/kube-state-metrics-service.yaml b/manifests/kube-state-metrics-service.yaml index d734a19b..80bf3bdf 100644 --- a/manifests/kube-state-metrics-service.yaml +++ b/manifests/kube-state-metrics-service.yaml @@ -2,8 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: @@ -16,4 +18,6 @@ spec: port: 9443 targetPort: https-self selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/kube-state-metrics-serviceAccount.yaml b/manifests/kube-state-metrics-serviceAccount.yaml index c23b36c1..248654d0 100644 --- a/manifests/kube-state-metrics-serviceAccount.yaml +++ b/manifests/kube-state-metrics-serviceAccount.yaml @@ -2,7 +2,9 @@ apiVersion: v1 kind: ServiceAccount metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index b860f4c3..052e6b22 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -2,8 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: @@ -28,4 +30,6 @@ spec: jobLabel: app.kubernetes.io/name selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml new file mode 100644 index 00000000..82a67f1a --- /dev/null +++ b/manifests/kubernetes-prometheusRule.yaml @@ -0,0 +1,1380 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kubernetes-monitoring-rules + namespace: monitoring +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0 + and + kube_pod_container_status_waiting{job="kube-state-metrics"} == 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod) ( + max by(namespace, pod) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} + ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( + 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion + summary: Job did not complete in time + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 12h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched descired number of replicas. + expr: | + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + and + (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + and + (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(kube_node_status_allocatable{resource="cpu"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + / + sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - alert: AggregatedAPIErrors + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors + summary: An aggregated API has reported errors. + expr: | + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown + summary: An aggregated API is down. + expr: | + (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(node) ( + kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + ) > 0.95 + for: 15m + labels: + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - name: kube-apiserver-burnrate.rules + rules: + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h + - name: kube-apiserver-histogram.rules + rules: + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules + rules: + - expr: | + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h + - expr: | + sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d + - expr: | + sum by (cluster, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h + - expr: | + sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d + - expr: | + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - name: k8s.rules + rules: + - expr: | + sum by (cluster, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + - expr: | + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: | + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules + rules: + - expr: | + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) + )) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/manifests/kubernetes-serviceMonitorApiserver.yaml b/manifests/kubernetes-serviceMonitorApiserver.yaml new file mode 100644 index 00000000..cce20bf7 --- /dev/null +++ b/manifests/kubernetes-serviceMonitorApiserver.yaml @@ -0,0 +1,74 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: apiserver + name: kube-apiserver + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|server).* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) + sourceLabels: + - __name__ + - le + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/kubernetes-serviceMonitorCoreDNS.yaml similarity index 83% rename from manifests/prometheus-serviceMonitorCoreDNS.yaml rename to manifests/kubernetes-serviceMonitorCoreDNS.yaml index 633aa18c..38b602d6 100644 --- a/manifests/prometheus-serviceMonitorCoreDNS.yaml +++ b/manifests/kubernetes-serviceMonitorCoreDNS.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: coredns + app.kubernetes.io/name: coredns name: coredns namespace: monitoring spec: @@ -10,7 +10,7 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 15s port: metrics - jobLabel: k8s-app + jobLabel: app.kubernetes.io/name namespaceSelector: matchNames: - kube-system diff --git a/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml b/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml new file mode 100644 index 00000000..4aab7701 --- /dev/null +++ b/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml @@ -0,0 +1,59 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kube-controller-manager + name: kube-controller-manager + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kube-controller-manager diff --git a/manifests/prometheus-serviceMonitorKubeScheduler.yaml b/manifests/kubernetes-serviceMonitorKubeScheduler.yaml similarity index 76% rename from manifests/prometheus-serviceMonitorKubeScheduler.yaml rename to manifests/kubernetes-serviceMonitorKubeScheduler.yaml index 8073eaca..ca30352e 100644 --- a/manifests/prometheus-serviceMonitorKubeScheduler.yaml +++ b/manifests/kubernetes-serviceMonitorKubeScheduler.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: kube-scheduler + app.kubernetes.io/name: kube-scheduler name: kube-scheduler namespace: monitoring spec: @@ -13,10 +13,10 @@ spec: scheme: https tlsConfig: insecureSkipVerify: true - jobLabel: k8s-app + jobLabel: app.kubernetes.io/name namespaceSelector: matchNames: - kube-system selector: matchLabels: - k8s-app: kube-scheduler + app.kubernetes.io/name: kube-scheduler diff --git a/manifests/kubernetes-serviceMonitorKubelet.yaml b/manifests/kubernetes-serviceMonitorKubelet.yaml new file mode 100644 index 00000000..5c6fc6ff --- /dev/null +++ b/manifests/kubernetes-serviceMonitorKubelet.yaml @@ -0,0 +1,96 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kubelet + name: kubelet + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + - action: drop + regex: (container_fs_.*|container_spec_.*|container_blkio_device_usage_total|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; + sourceLabels: + - __name__ + - pod + - namespace + path: /metrics/cadvisor + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet diff --git a/manifests/node-exporter-clusterRole.yaml b/manifests/node-exporter-clusterRole.yaml index ad783ae9..fe5db25b 100644 --- a/manifests/node-exporter-clusterRole.yaml +++ b/manifests/node-exporter-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter rules: - apiGroups: diff --git a/manifests/node-exporter-clusterRoleBinding.yaml b/manifests/node-exporter-clusterRoleBinding.yaml index a5a20508..be1016be 100644 --- a/manifests/node-exporter-clusterRoleBinding.yaml +++ b/manifests/node-exporter-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 275d2753..69e14810 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -2,30 +2,37 @@ apiVersion: apps/v1 kind: DaemonSet metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus template: metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 spec: containers: - args: - --web.listen-address=127.0.0.1:9100 - - --path.procfs=/host/proc - --path.sysfs=/host/sys - --path.rootfs=/host/root - --no-collector.wifi - --no-collector.hwmon - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - image: quay.io/prometheus/node-exporter:v1.0.1 + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: quay.io/prometheus/node-exporter:v1.2.2 name: node-exporter resources: limits: @@ -35,10 +42,6 @@ spec: cpu: 102m memory: 180Mi volumeMounts: - - mountPath: /host/proc - mountPropagation: HostToContainer - name: proc - readOnly: true - mountPath: /host/sys mountPropagation: HostToContainer name: sys @@ -57,7 +60,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy ports: - containerPort: 9100 @@ -70,6 +73,10 @@ spec: requests: cpu: 10m memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 hostNetwork: true hostPID: true nodeSelector: @@ -81,9 +88,6 @@ spec: tolerations: - operator: Exists volumes: - - hostPath: - path: /proc - name: proc - hostPath: path: /sys name: sys @@ -93,3 +97,4 @@ spec: updateStrategy: rollingUpdate: maxUnavailable: 10% + type: RollingUpdate diff --git a/manifests/node-exporter-prometheusRule.yaml b/manifests/node-exporter-prometheusRule.yaml new file mode 100644 index 00000000..0a39b890 --- /dev/null +++ b/manifests/node-exporter-prometheusRule.yaml @@ -0,0 +1,304 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status[5m]) == 0 + and + node_timex_maxerror_seconds >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + node_memory_MemAvailable_bytes{job="node-exporter"} + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/manifests/node-exporter-service.yaml b/manifests/node-exporter-service.yaml index cb966601..c18c7c75 100644 --- a/manifests/node-exporter-service.yaml +++ b/manifests/node-exporter-service.yaml @@ -2,8 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: @@ -13,4 +15,6 @@ spec: port: 9100 targetPort: https selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/node-exporter-serviceAccount.yaml b/manifests/node-exporter-serviceAccount.yaml index 8a03ac16..343fe6da 100644 --- a/manifests/node-exporter-serviceAccount.yaml +++ b/manifests/node-exporter-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring diff --git a/manifests/node-exporter-serviceMonitor.yaml b/manifests/node-exporter-serviceMonitor.yaml index 8e5a97c9..6d2edd3a 100644 --- a/manifests/node-exporter-serviceMonitor.yaml +++ b/manifests/node-exporter-serviceMonitor.yaml @@ -2,8 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: @@ -24,4 +26,6 @@ spec: jobLabel: app.kubernetes.io/name selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-apiService.yaml b/manifests/prometheus-adapter-apiService.yaml index a215efe4..ea4476dc 100644 --- a/manifests/prometheus-adapter-apiService.yaml +++ b/manifests/prometheus-adapter-apiService.yaml @@ -1,6 +1,11 @@ apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: v1beta1.metrics.k8s.io spec: group: metrics.k8s.io diff --git a/manifests/prometheus-adapter-clusterRole.yaml b/manifests/prometheus-adapter-clusterRole.yaml index a02d2bb0..091d9818 100644 --- a/manifests/prometheus-adapter-clusterRole.yaml +++ b/manifests/prometheus-adapter-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter rules: - apiGroups: diff --git a/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml b/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml index 9f0dbb34..4dc8e7a3 100644 --- a/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml +++ b/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml @@ -2,6 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 rbac.authorization.k8s.io/aggregate-to-admin: "true" rbac.authorization.k8s.io/aggregate-to-edit: "true" rbac.authorization.k8s.io/aggregate-to-view: "true" diff --git a/manifests/prometheus-adapter-clusterRoleBinding.yaml b/manifests/prometheus-adapter-clusterRoleBinding.yaml index 7e8f3da9..dc8bfbb1 100644 --- a/manifests/prometheus-adapter-clusterRoleBinding.yaml +++ b/manifests/prometheus-adapter-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml index 4295b50f..86e7517d 100644 --- a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml +++ b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics:system:auth-delegator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-adapter-clusterRoleServerResources.yaml b/manifests/prometheus-adapter-clusterRoleServerResources.yaml index fcb914c3..655efb9b 100644 --- a/manifests/prometheus-adapter-clusterRoleServerResources.yaml +++ b/manifests/prometheus-adapter-clusterRoleServerResources.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics-server-resources rules: - apiGroups: diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml index b2bde3cd..b5329689 100644 --- a/manifests/prometheus-adapter-configMap.yaml +++ b/manifests/prometheus-adapter-configMap.yaml @@ -4,8 +4,26 @@ data: "resourceRules": "cpu": "containerLabel": "container" - "containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)" - "nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)" + "containerQuery": | + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s] + ) + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[60s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m] + ) + ) "resources": "overrides": "namespace": @@ -16,8 +34,21 @@ data: "resource": "pod" "memory": "containerLabel": "container" - "containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)" - "nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)" + "containerQuery": | + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) "resources": "overrides": "instance": @@ -29,5 +60,10 @@ data: "window": "5m" kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: adapter-config namespace: monitoring diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index bb85e251..e6c97afc 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -1,21 +1,31 @@ apiVersion: apps/v1 kind: Deployment metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: - replicas: 1 + replicas: 2 selector: matchLabels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus strategy: rollingUpdate: maxSurge: 1 - maxUnavailable: 0 + maxUnavailable: 1 template: metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 spec: containers: - args: @@ -25,10 +35,18 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090/ - --secure-port=6443 - image: directxman12/k8s-prometheus-adapter:v0.8.2 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.0 name: prometheus-adapter ports: - containerPort: 6443 + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi volumeMounts: - mountPath: /tmp name: tmpfs diff --git a/manifests/prometheus-adapter-podDisruptionBudget.yaml b/manifests/prometheus-adapter-podDisruptionBudget.yaml new file mode 100644 index 00000000..639eefd4 --- /dev/null +++ b/manifests/prometheus-adapter-podDisruptionBudget.yaml @@ -0,0 +1,17 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 + name: prometheus-adapter + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-roleBindingAuthReader.yaml b/manifests/prometheus-adapter-roleBindingAuthReader.yaml index 48c8f325..3bdf4ad9 100644 --- a/manifests/prometheus-adapter-roleBindingAuthReader.yaml +++ b/manifests/prometheus-adapter-roleBindingAuthReader.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics-auth-reader namespace: kube-system roleRef: diff --git a/manifests/prometheus-adapter-service.yaml b/manifests/prometheus-adapter-service.yaml index e786e01c..be8c44b6 100644 --- a/manifests/prometheus-adapter-service.yaml +++ b/manifests/prometheus-adapter-service.yaml @@ -2,7 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: @@ -11,4 +14,6 @@ spec: port: 443 targetPort: 6443 selector: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-serviceAccount.yaml b/manifests/prometheus-adapter-serviceAccount.yaml index d7e70503..2ddbe460 100644 --- a/manifests/prometheus-adapter-serviceAccount.yaml +++ b/manifests/prometheus-adapter-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring diff --git a/manifests/prometheus-adapter-serviceMonitor.yaml b/manifests/prometheus-adapter-serviceMonitor.yaml index 91a8d51a..a7515852 100644 --- a/manifests/prometheus-adapter-serviceMonitor.yaml +++ b/manifests/prometheus-adapter-serviceMonitor.yaml @@ -2,7 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: @@ -15,4 +18,6 @@ spec: insecureSkipVerify: true selector: matchLabels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-clusterRole.yaml b/manifests/prometheus-clusterRole.yaml index d5c45983..8fac941b 100644 --- a/manifests/prometheus-clusterRole.yaml +++ b/manifests/prometheus-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s rules: - apiGroups: diff --git a/manifests/prometheus-clusterRoleBinding.yaml b/manifests/prometheus-clusterRoleBinding.yaml index 554bb6f8..cafde390 100644 --- a/manifests/prometheus-clusterRoleBinding.yaml +++ b/manifests/prometheus-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-operator-prometheusRule.yaml b/manifests/prometheus-operator-prometheusRule.yaml new file mode 100644 index 00000000..4921fe68 --- /dev/null +++ b/manifests/prometheus-operator-prometheusRule.yaml @@ -0,0 +1,86 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 + prometheus: k8s + role: alert-rules + name: prometheus-operator-rules + namespace: monitoring +spec: + groups: + - name: prometheus-operator + rules: + - alert: PrometheusOperatorListErrors + annotations: + description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors + summary: Errors while performing list operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors + summary: Errors while performing watch operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed + summary: Last controller reconciliation failed + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors + summary: Errors while reconciling controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors + summary: Errors while reconciling Prometheus. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready + summary: Prometheus operator not ready + expr: | + min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) + for: 5m + labels: + severity: warning + - alert: PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources + summary: Resources rejected by Prometheus operator + expr: | + min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + for: 5m + labels: + severity: warning diff --git a/manifests/prometheus-operator-serviceMonitor.yaml b/manifests/prometheus-operator-serviceMonitor.yaml index 9bae609a..dddeb0ac 100644 --- a/manifests/prometheus-operator-serviceMonitor.yaml +++ b/manifests/prometheus-operator-serviceMonitor.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -19,4 +20,5 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 diff --git a/manifests/prometheus-podDisruptionBudget.yaml b/manifests/prometheus-podDisruptionBudget.yaml new file mode 100644 index 00000000..f7adef54 --- /dev/null +++ b/manifests/prometheus-podDisruptionBudget.yaml @@ -0,0 +1,18 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + name: prometheus-k8s + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index e50cc9bf..8731891f 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -2,18 +2,31 @@ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 prometheus: k8s name: k8s namespace: monitoring spec: alerting: alertmanagers: - - name: alertmanager-main + - apiVersion: v2 + name: alertmanager-main namespace: monitoring port: web - image: quay.io/prometheus/prometheus:v2.22.1 + enableFeatures: [] + externalLabels: {} + image: quay.io/prometheus/prometheus:v2.29.2 nodeSelector: kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 podMonitorNamespaceSelector: {} podMonitorSelector: {} probeNamespaceSelector: {} @@ -22,10 +35,8 @@ spec: resources: requests: memory: 400Mi - ruleSelector: - matchLabels: - prometheus: k8s - role: alert-rules + ruleNamespaceSelector: {} + ruleSelector: {} securityContext: fsGroup: 2000 runAsNonRoot: true @@ -33,4 +44,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.22.1 + version: 2.29.2 diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml new file mode 100644 index 00000000..c15e8c13 --- /dev/null +++ b/manifests/prometheus-prometheusRule.yaml @@ -0,0 +1,249 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + prometheus: k8s + role: alert-rules + name: prometheus-k8s-prometheus-rules + namespace: monitoring +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: | + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. + expr: | + increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: | + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. + expr: | + increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 + for: 5m + labels: + severity: critical + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/manifests/prometheus-roleBindingConfig.yaml b/manifests/prometheus-roleBindingConfig.yaml index ec0129db..ca17d7eb 100644 --- a/manifests/prometheus-roleBindingConfig.yaml +++ b/manifests/prometheus-roleBindingConfig.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s-config namespace: monitoring roleRef: diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml index c7527f6a..33309a20 100644 --- a/manifests/prometheus-roleBindingSpecificNamespaces.yaml +++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -3,6 +3,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: default roleRef: @@ -16,6 +21,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: kube-system roleRef: @@ -29,6 +39,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring roleRef: diff --git a/manifests/prometheus-roleConfig.yaml b/manifests/prometheus-roleConfig.yaml index 5f1cd043..0f7129cb 100644 --- a/manifests/prometheus-roleConfig.yaml +++ b/manifests/prometheus-roleConfig.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s-config namespace: monitoring rules: diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml index 689baa93..ffe15628 100644 --- a/manifests/prometheus-roleSpecificNamespaces.yaml +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -3,6 +3,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: default rules: @@ -24,9 +29,22 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: kube-system rules: @@ -48,9 +66,22 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring rules: @@ -72,4 +103,12 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch kind: RoleList diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml deleted file mode 100644 index a46fe442..00000000 --- a/manifests/prometheus-rules.yaml +++ /dev/null @@ -1,2011 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: prometheus-k8s-rules - namespace: monitoring -spec: - groups: - - name: node-exporter.rules - rules: - - expr: | - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job="node-exporter"} - ) - ) - record: instance:node_num_cpu:sum - - expr: | - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) - ) - record: instance:node_cpu_utilisation:rate1m - - expr: | - ( - node_load1{job="node-exporter"} - / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: | - 1 - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: | - rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m - - name: kube-apiserver.rules - rules: - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) - labels: - verb: read - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) - labels: - verb: read - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) - labels: - verb: read - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) - / - sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) - record: cluster:apiserver_request_duration_seconds:mean5m - - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: | - 1 - ( - ( - # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) + - ( - # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - - - ( - ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - ) + - # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d) - labels: - verb: all - record: apiserver_request:availability30d - - expr: | - 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - - - ( - # too slow - ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: | - 1 - ( - ( - # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="write"}) - labels: - verb: write - record: apiserver_request:availability30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) - labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - labels: - verb: write - record: code:apiserver_request_total:increase30d - - name: k8s.rules - rules: - - expr: | - sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace) - record: namespace:container_cpu_usage_seconds_total:sum_rate - - expr: | - sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - - expr: | - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - - expr: | - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - - expr: | - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - - expr: | - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - - expr: | - sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace) - record: namespace:container_memory_usage_bytes:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: statefulset - record: namespace_workload_pod:kube_pod_owner:relabel - - name: kube-scheduler.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - name: node.rules - rules: - - expr: | - sum(min(kube_pod_info{node!=""}) by (cluster, node)) - record: ':kube_pod_info_node_count:' - - expr: | - topk by(namespace, pod) (1, - max by (node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: | - count by (cluster, node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: | - sum( - node_memory_MemAvailable_bytes{job="node-exporter"} or - ( - node_memory_Buffers_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum - - name: kubelet.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.99" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.9" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.5" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: | - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: | - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: | - node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure - summary: Failed device in RAID array - expr: | - node_md_disks{state="fail"} > 0 - labels: - severity: warning - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: | - min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors - summary: Errors while reconciling controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. - expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready - summary: Prometheus operator not ready - expr: | - min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) - for: 5m - labels: - severity: warning - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: | - min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 5m - labels: - severity: warning - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping - summary: Pod is crash looping. - expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. - expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: | - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: | - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobCompletion - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion - summary: Job did not complete in time - expr: | - kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 - for: 12h - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed - summary: Job failed to complete. - expr: | - kube_job_failed{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. - expr: | - (kube_hpa_status_desired_replicas{job="kube-state-metrics"} - != - kube_hpa_status_current_replicas{job="kube-state-metrics"}) - and - changes(kube_hpa_status_current_replicas[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout - summary: HPA is running at max replicas - expr: | - kube_hpa_status_current_replicas{job="kube-state-metrics"} - == - kube_hpa_spec_max_replicas{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) - / - sum(kube_node_status_allocatable_cpu_cores) - > - (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) - for: 5m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) - / - sum(kube_node_status_allocatable_memory_bytes) - > - (count(kube_node_status_allocatable_memory_bytes)-1) - / - count(kube_node_status_allocatable_memory_bytes) - for: 5m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable_cpu_cores) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused - summary: Namespace quota is fully used. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - < 0.03 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - ( - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - ) < 0.15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: | - count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m - labels: - severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors - summary: An aggregated API has reported errors. - expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 - labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable - summary: Node is unreachable. - expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods - summary: Kubelet is running at capacity. - expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) - ) - / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - ) > 0.95 - for: 15m - labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m - labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. - expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-scheduler - rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-controller-manager - rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - summary: Failed Prometheus configuration reload. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. - expr: | - ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - summary: Prometheus is not connected to any Alertmanagers. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - summary: Prometheus has issues reloading blocks from disk. - expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - summary: Prometheus has issues compacting blocks. - expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - summary: Prometheus is not ingesting samples. - expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 - for: 10m - labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - summary: Prometheus is dropping samples with duplicate timestamps. - expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - summary: Prometheus drops samples with out-of-order timestamps. - expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - summary: Prometheus fails to send samples to remote storage. - expr: | - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - summary: Prometheus remote write is behind. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - - on(job, instance) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - summary: Prometheus is failing rule evaluations. - expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. - expr: | - increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - annotations: - message: | - The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. - {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} - Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" - {{ end }} - expr: | - count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1 - for: 5m - labels: - severity: critical - - alert: AlertmanagerFailedReload - annotations: - message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - expr: | - alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0 - for: 10m - labels: - severity: warning - - alert: AlertmanagerMembersInconsistent - annotations: - message: Alertmanager has not found all other members of the cluster. - expr: | - alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}) - for: 5m - labels: - severity: critical - - name: general.rules - rules: - - alert: TargetDown - annotations: - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - message: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. - expr: vector(1) - labels: - severity: none - - name: node-network - rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" - expr: | - changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml index 4f61e88a..0b14d9bb 100644 --- a/manifests/prometheus-service.yaml +++ b/manifests/prometheus-service.yaml @@ -2,6 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 prometheus: k8s name: prometheus-k8s namespace: monitoring @@ -12,5 +16,8 @@ spec: targetPort: web selector: app: prometheus + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus prometheus: k8s sessionAffinity: ClientIP diff --git a/manifests/prometheus-serviceAccount.yaml b/manifests/prometheus-serviceAccount.yaml index 3e55fad6..371b8ec9 100644 --- a/manifests/prometheus-serviceAccount.yaml +++ b/manifests/prometheus-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring diff --git a/manifests/prometheus-serviceMonitor.yaml b/manifests/prometheus-serviceMonitor.yaml index b7605dbe..b5282ebf 100644 --- a/manifests/prometheus-serviceMonitor.yaml +++ b/manifests/prometheus-serviceMonitor.yaml @@ -2,8 +2,11 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: prometheus - name: prometheus + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + name: prometheus-k8s namespace: monitoring spec: endpoints: @@ -11,4 +14,7 @@ spec: port: web selector: matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus prometheus: k8s diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml deleted file mode 100644 index 1ff61fe9..00000000 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: apiserver - name: kube-apiserver - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|server).* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_controller_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_step_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) - sourceLabels: - - __name__ - - le - port: https - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes - jobLabel: component - namespaceSelector: - matchNames: - - default - selector: - matchLabels: - component: apiserver - provider: kubernetes diff --git a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml deleted file mode 100644 index 0f23d84d..00000000 --- a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml +++ /dev/null @@ -1,59 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kube-controller-manager - name: kube-controller-manager - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|request|server).* - sourceLabels: - - __name__ - port: https-metrics - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kube-controller-manager diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml deleted file mode 100644 index 7db47ef0..00000000 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ /dev/null @@ -1,90 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kubelet - name: kubelet - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - honorTimestamps: false - interval: 30s - metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ - path: /metrics/cadvisor - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - path: /metrics/probes - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kubelet diff --git a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml index 9c923d54..7f879959 100644 --- a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: alertmanagerconfigs.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: AlertmanagerConfig listKind: AlertmanagerConfigList plural: alertmanagerconfigs @@ -17,7 +19,7 @@ spec: - name: v1alpha1 schema: openAPIV3Schema: - description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to be aggregated across multiple namespaces configuring one Alertmanager. + description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to be aggregated across multiple namespaces configuring one Alertmanager cluster. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' @@ -28,54 +30,251 @@ spec: metadata: type: object spec: + description: AlertmanagerConfigSpec is a specification of the desired behavior of the Alertmanager configuration. By definition, the Alertmanager configuration only applies to alerts for which the `namespace` label is equal to the namespace of the AlertmanagerConfig resource. properties: inhibitRules: + description: List of inhibition rules. The rules will only apply to alerts matching the resource’s namespace. items: + description: InhibitRule defines an inhibition rule that allows to mute alerts when other alerts are already firing. See https://prometheus.io/docs/alerting/latest/configuration/#inhibit_rule properties: equal: + description: Labels that must have an equal value in the source and target alert for the inhibition to take effect. items: type: string type: array sourceMatch: + description: Matchers for which one or more alerts have to exist for the inhibition to take effect. The operator enforces that the alert matches the resource’s namespace. items: + description: Matcher defines how to match on alert's labels. properties: name: + description: Label to match. + minLength: 1 type: string regex: + description: Whether to match on equality (false) or regular-expression (true). type: boolean value: + description: Label value to match. type: string required: - name - - value type: object type: array targetMatch: + description: Matchers that have to be fulfilled in the alerts to be muted. The operator enforces that the alert matches the resource’s namespace. items: + description: Matcher defines how to match on alert's labels. properties: name: + description: Label to match. + minLength: 1 type: string regex: + description: Whether to match on equality (false) or regular-expression (true). type: boolean value: + description: Label value to match. type: string required: - name - - value type: object type: array type: object type: array receivers: + description: List of receivers. items: + description: Receiver defines one or more notification integrations. properties: + emailConfigs: + description: List of Email configurations. + items: + description: EmailConfig configures notifications via Email. + properties: + authIdentity: + description: The identity to use for authentication. + type: string + authPassword: + description: The secret's key that contains the password to use for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + authSecret: + description: The secret's key that contains the CRAM-MD5 secret. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + authUsername: + description: The username to use for authentication. + type: string + from: + description: The sender address. + type: string + headers: + description: Further headers email header key/value pairs. Overrides any headers previously set by the notification implementation. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + hello: + description: The hostname to identify to the SMTP server. + type: string + html: + description: The HTML body of the email notification. + type: string + requireTLS: + description: The SMTP TLS requirement. Note that Go does not support unencrypted connections to remote SMTP endpoints. + type: boolean + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + smarthost: + description: The SMTP host through which emails are sent. + type: string + text: + description: The text body of the email notification. + type: string + tlsConfig: + description: TLS configuration + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + to: + description: The email address to send notifications to. + type: string + type: object + type: array name: + description: Name of the receiver. Must be unique across all items from the list. + minLength: 1 type: string opsgenieConfigs: + description: List of OpsGenie configurations. items: + description: OpsGenieConfig configures notifications via OpsGenie. See https://prometheus.io/docs/alerting/latest/configuration/#opsgenie_config properties: apiKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the OpsGenie API key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -90,15 +289,22 @@ spec: - key type: object apiURL: + description: The URL to send OpsGenie API requests to. type: string description: + description: Description of the incident. type: string details: + description: A set of arbitrary key/value pairs that provide further detail about the incident. items: + description: KeyValue defines a (key, value) tuple. properties: key: + description: Key of the tuple. + minLength: 1 type: string value: + description: Value of the tuple. type: string required: - key @@ -106,9 +312,32 @@ spec: type: object type: array httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -142,7 +371,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -157,9 +386,10 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -253,51 +483,78 @@ spec: type: object type: object message: + description: Alert text limited to 130 characters. type: string note: + description: Additional alert note. type: string priority: + description: Priority level of alert. Possible values are P1, P2, P3, P4, and P5. type: string responders: + description: List of responders responsible for notifications. items: + description: OpsGenieConfigResponder defines a responder to an incident. One of `id`, `name` or `username` has to be defined. properties: id: + description: ID of the responder. type: string name: + description: Name of the responder. type: string type: + description: Type of responder. + minLength: 1 type: string username: + description: Username of the responder. type: string + required: + - type type: object type: array sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean source: + description: Backlink to the sender of the notification. type: string tags: + description: Comma separated list of tags attached to the notifications. type: string type: object type: array - pagerDutyConfigs: + pagerdutyConfigs: + description: List of PagerDuty configurations. items: + description: PagerDutyConfig configures notifications via PagerDuty. See https://prometheus.io/docs/alerting/latest/configuration/#pagerduty_config properties: class: + description: The class/type of the event. type: string client: + description: Client identification. type: string clientURL: + description: Backlink to the sender of notification. type: string component: + description: The part or component of the affected system that is broken. type: string description: + description: Description of the incident. type: string details: + description: Arbitrary key/value pairs that provide further detail about the incident. items: + description: KeyValue defines a (key, value) tuple. properties: key: + description: Key of the tuple. + minLength: 1 type: string value: + description: Value of the tuple. type: string required: - key @@ -305,11 +562,35 @@ spec: type: object type: array group: + description: A cluster or grouping of sources. type: string httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -343,7 +624,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -358,9 +639,10 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -454,7 +736,7 @@ spec: type: object type: object routingKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the PagerDuty integration key (when using Events API v2). Either this field or `serviceKey` needs to be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -469,9 +751,10 @@ spec: - key type: object sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean serviceKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the PagerDuty service key (when using integration type "Prometheus"). Either this field or `routingKey` needs to be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -486,18 +769,51 @@ spec: - key type: object severity: + description: Severity of the incident. type: string url: + description: The URL to send requests to. type: string type: object type: array - webhookConfigs: + pushoverConfigs: + description: List of Pushover configurations. items: + description: PushoverConfig configures notifications via Pushover. See https://prometheus.io/docs/alerting/latest/configuration/#pushover_config properties: + expire: + description: How long your notification will continue to be retried for, unless the user acknowledges the notification. + type: string + html: + description: Whether notification message is HTML or plain text. + type: boolean httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -531,7 +847,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -546,9 +862,763 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + message: + description: Notification message. + type: string + priority: + description: Priority, see https://pushover.net/api#priority + type: string + retry: + description: How often the Pushover servers will send the same notification to the user. Must be at least 30 seconds. + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + sound: + description: The name of one of the sounds supported by device clients to override the user's default sound choice + type: string + title: + description: Notification title. + type: string + token: + description: The secret's key that contains the registered application’s API token, see https://pushover.net/apps. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + url: + description: A supplementary URL shown alongside the message. + type: string + urlTitle: + description: A title for supplementary URL, otherwise just the URL is shown + type: string + userKey: + description: The secret's key that contains the recipient user’s user key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + type: array + slackConfigs: + description: List of Slack configurations. + items: + description: SlackConfig configures notifications via Slack. See https://prometheus.io/docs/alerting/latest/configuration/#slack_config + properties: + actions: + description: A list of Slack actions that are sent with each notification. + items: + description: SlackAction configures a single Slack action that is sent with each notification. See https://api.slack.com/docs/message-attachments#action_fields and https://api.slack.com/docs/message-buttons for more information. + properties: + confirm: + description: SlackConfirmationField protect users from destructive actions or particularly distinguished decisions by asking them to confirm their button click one more time. See https://api.slack.com/docs/interactive-message-field-guide#confirmation_fields for more information. + properties: + dismissText: + type: string + okText: + type: string + text: + minLength: 1 + type: string + title: + type: string + required: + - text + type: object + name: + type: string + style: + type: string + text: + minLength: 1 + type: string + type: + minLength: 1 + type: string + url: + type: string + value: + type: string + required: + - text + - type + type: object + type: array + apiURL: + description: The secret's key that contains the Slack webhook URL. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + callbackId: + type: string + channel: + description: The channel or user to send notifications to. + type: string + color: + type: string + fallback: + type: string + fields: + description: A list of Slack fields that are sent with each notification. + items: + description: SlackField configures a single Slack field that is sent with each notification. Each field must contain a title, value, and optionally, a boolean value to indicate if the field is short enough to be displayed next to other fields designated as short. See https://api.slack.com/docs/message-attachments#fields for more information. + properties: + short: + type: boolean + title: + minLength: 1 + type: string + value: + minLength: 1 + type: string + required: + - title + - value + type: object + type: array + footer: + type: string + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + iconEmoji: + type: string + iconURL: + type: string + imageURL: + type: string + linkNames: + type: boolean + mrkdwnIn: + items: + type: string + type: array + pretext: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + shortFields: + type: boolean + text: + type: string + thumbURL: + type: string + title: + type: string + titleLink: + type: string + username: + type: string + type: object + type: array + victoropsConfigs: + description: List of VictorOps configurations. + items: + description: VictorOpsConfig configures notifications via VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config + properties: + apiKey: + description: The secret's key that contains the API key to use when talking to the VictorOps API. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + apiUrl: + description: The VictorOps API URL. + type: string + customFields: + description: Additional custom fields for notification. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + entityDisplayName: + description: Contains summary of the alerted problem. + type: string + httpConfig: + description: The HTTP client's configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + messageType: + description: Describes the behavior of the alert (CRITICAL, WARNING, INFO). + type: string + monitoringTool: + description: The monitoring tool the state message is from. + type: string + routingKey: + description: A key used to map the alert to a team. + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + stateMessage: + description: Contains long explanation of the alerted problem. + type: string + type: object + type: array + webhookConfigs: + description: List of webhook configurations. + items: + description: WebhookConfig configures notifications via a generic receiver supporting the webhook payload. See https://prometheus.io/docs/alerting/latest/configuration/#webhook_config + properties: + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -642,14 +1712,18 @@ spec: type: object type: object maxAlerts: + description: Maximum number of alerts to be sent per webhook message. When 0, all alerts are included. format: int32 + minimum: 0 type: integer sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean url: + description: The URL to send HTTP POST requests to. `urlSecret` takes precedence over `url`. One of `urlSecret` and `url` should be defined. type: string urlSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the webhook URL to send HTTP requests to. `urlSecret` takes precedence over `url`. One of `urlSecret` and `url` should be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -665,43 +1739,271 @@ spec: type: object type: object type: array + wechatConfigs: + description: List of WeChat configurations. + items: + description: WeChatConfig configures notifications via WeChat. See https://prometheus.io/docs/alerting/latest/configuration/#wechat_config + properties: + agentID: + type: string + apiSecret: + description: The secret's key that contains the WeChat API key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + apiURL: + description: The WeChat API URL. + type: string + corpID: + description: The corp id for authentication. + type: string + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + message: + description: API request data as defined by the WeChat API. + type: string + messageType: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + toParty: + type: string + toTag: + type: string + toUser: + type: string + type: object + type: array required: - name type: object type: array route: + description: The Alertmanager route definition for alerts matching the resource’s namespace. If present, it will be added to the generated Alertmanager configuration as a first-level route. properties: continue: + description: Boolean indicating whether an alert should continue matching subsequent sibling nodes. It will always be overridden to true for the first-level route by the Prometheus operator. type: boolean groupBy: + description: List of labels to group by. items: type: string type: array groupInterval: + description: How long to wait before sending an updated notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). type: string groupWait: + description: How long to wait before sending the initial notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). type: string matchers: + description: 'List of matchers that the alert’s labels should match. For the first level route, the operator removes any existing equality and regexp matcher on the `namespace` label and adds a `namespace: ` matcher.' items: + description: Matcher defines how to match on alert's labels. properties: name: + description: Label to match. + minLength: 1 type: string regex: + description: Whether to match on equality (false) or regular-expression (true). type: boolean value: + description: Label value to match. type: string required: - name - - value type: object type: array receiver: + description: Name of the receiver for this route. If not empty, it should be listed in the `receivers` field. type: string repeatInterval: + description: How long to wait before repeating the last notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). type: string routes: + description: Child routes. items: - type: object + x-kubernetes-preserve-unknown-fields: true type: array type: object type: object diff --git a/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index e567a350..4bbdb953 100644 --- a/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: alertmanagers.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Alertmanager listKind: AlertmanagerList plural: alertmanagers @@ -454,6 +456,15 @@ spec: clusterAdvertiseAddress: description: 'ClusterAdvertiseAddress is the explicit address to advertise in cluster. Needs to be provided for non RFC1918 [1] (public) addresses. [1] RFC1918: https://tools.ietf.org/html/rfc1918' type: string + clusterGossipInterval: + description: Interval between gossip attempts. + type: string + clusterPeerTimeout: + description: Timeout for cluster peering. + type: string + clusterPushpullInterval: + description: Interval between pushpull attempts. + type: string configMaps: description: ConfigMaps is a list of ConfigMaps in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The ConfigMaps are mounted into /etc/alertmanager/configmaps/. items: @@ -525,8 +536,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -831,12 +846,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -927,12 +947,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1241,8 +1269,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1547,12 +1579,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1643,12 +1680,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1884,13 +1929,17 @@ spec: logLevel: description: Log level for Alertmanager to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string description: Define which Nodes the Pods are scheduled on. type: object paused: - description: If set to true all actions on the underlaying managed objects are not goint to be performed, except for delete actions. + description: If set to true all actions on the underlying managed objects are not goint to be performed, except for delete actions. type: boolean podMetadata: description: PodMetadata configures Labels and Annotations which are propagated to the alertmanager pods. @@ -1924,12 +1973,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2036,8 +2093,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -2094,12 +2155,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2153,7 +2222,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -2515,8 +2588,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2535,8 +2612,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -2843,8 +2924,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3099,7 +3184,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed. + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. type: boolean replicas: description: Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector). diff --git a/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml index 50096e73..927a0ba6 100644 --- a/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: podmonitors.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: PodMonitor listKind: PodMonitorList plural: podmonitors @@ -33,6 +35,18 @@ spec: jobLabel: description: The label to use to retrieve the job name from. type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer namespaceSelector: description: Selector to select which namespaces the Endpoints objects are discovered from. properties: @@ -50,6 +64,28 @@ spec: items: description: PodMetricsEndpoint defines a scrapeable endpoint of a Kubernetes Pod serving Prometheus metrics. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: 'BasicAuth allow an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint' properties: @@ -139,6 +175,77 @@ spec: type: string type: object type: array + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object params: additionalProperties: items: @@ -156,7 +263,7 @@ spec: description: ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint. type: string relabelings: - description: 'RelabelConfigs to apply to samples before ingestion. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + description: 'RelabelConfigs to apply to samples before scraping. Prometheus Operator automatically adds relabelings for a few standard Kubernetes fields and replaces original scrape job name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines ``-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' properties: diff --git a/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml index 691b1e9f..aa511ffb 100644 --- a/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: probes.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Probe listKind: ProbeList plural: probes @@ -30,21 +32,209 @@ spec: spec: description: Specification of desired Ingress selection for target discovery by Prometheus. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint' + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: Secret to mount to read bearer token for scraping targets. The secret needs to be in the same namespace as the probe and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object interval: description: Interval at which targets are probed using the configured prober. If not specified Prometheus' global scrape interval is used. type: string jobName: description: The job name assigned to scraped metrics by default. type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + metricRelabelings: + description: MetricRelabelConfigs to apply to samples before ingestion. + items: + description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines ``-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted value is matched. Default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available. + type: string + type: object + type: array module: description: 'The module to use for probing specifying how to probe the target. Example module configuring in the blackbox exporter: https://github.com/prometheus/blackbox_exporter/blob/master/example.yml' type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object prober: description: Specification for the prober to use for probing targets. The prober.URL parameter is required. Targets cannot be probed if left empty. properties: path: description: Path to collect metrics from. Defaults to `/probe`. type: string + proxyUrl: + description: Optional ProxyURL. + type: string scheme: description: HTTP scheme to use for scraping. Defaults to `http`. type: string @@ -54,9 +244,17 @@ spec: required: - url type: object + sampleLimit: + description: SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + format: int64 + type: integer scrapeTimeout: description: Timeout for scraping metrics from the Prometheus exporter. type: string + targetLimit: + description: TargetLimit defines a limit on the number of scraped targets that will be accepted. + format: int64 + type: integer targets: description: Targets defines a set of static and/or dynamically discovered targets to be probed using the prober. properties: @@ -145,6 +343,37 @@ spec: type: string description: Labels assigned to all metrics scraped from the targets. type: object + relabelingConfigs: + description: 'RelabelConfigs to apply to samples before ingestion. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + items: + description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines ``-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted value is matched. Default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available. + type: string + type: object + type: array static: description: Targets is a list of URLs to probe using the configured prober. items: @@ -152,6 +381,99 @@ spec: type: array type: object type: object + tlsConfig: + description: TLS configuration to use when scraping the endpoint. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object type: object required: - spec diff --git a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml index eb40f0fc..1122aec4 100644 --- a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheuses.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Prometheus listKind: PrometheusList plural: prometheuses @@ -439,6 +441,28 @@ spec: apiVersion: description: Version of the Alertmanager API that Prometheus uses to send alerts. It can be "v1" or "v2". type: string + authorization: + description: Authorization section for this alertmanager endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object bearerTokenFile: description: BearerTokenFile to read from filesystem to use when authenticating to Alertmanager. type: string @@ -580,6 +604,31 @@ spec: apiserverConfig: description: APIServerConfig allows specifying a host and auth methods to access apiserver. If left empty, Prometheus is assumed to run inside of the cluster and will discover API servers automatically and use the pod's CA certificate and bearer token file at /var/run/secrets/kubernetes.io/serviceaccount/. properties: + authorization: + description: Authorization section for accessing apiserver + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth allow an endpoint to authenticate over basic authentication properties: @@ -805,8 +854,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1111,12 +1164,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1207,12 +1265,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1445,19 +1511,36 @@ spec: enableAdminAPI: description: 'Enable access to prometheus web admin API. Defaults to the value of `false`. WARNING: Enabling the admin APIs enables mutating endpoints, to delete data, shutdown Prometheus, and more. Enabling this should be done with care and the user is advised to add additional authentication authorization via a proxy to ensure only clients authorized to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' type: boolean + enableFeatures: + description: Enable access to Prometheus disabled features. By default, no features are enabled. Enabling disabled features is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice. For more information see https://prometheus.io/docs/prometheus/latest/disabled_features/ + items: + type: string + type: array + enforcedLabelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. If more than this number of labels are present post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + enforcedLabelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. If a label name is longer than this number post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + enforcedLabelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. If a label value is longer than this number post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer enforcedNamespaceLabel: - description: EnforcedNamespaceLabel enforces adding a namespace label of origin for each alert and metric that is user created. The label value will always be the namespace of the object that is being created. + description: "EnforcedNamespaceLabel If set, a label will be added to \n 1. all user-metrics (created by `ServiceMonitor`, `PodMonitor` and `ProbeConfig` object) and 2. in all `PrometheusRule` objects (except the ones excluded in `prometheusRulesExcludedFromEnforce`) to * alerting & recording rules and * the metrics used in their expressions (`expr`). \n Label name is this field's value. Label value is the namespace of the created object (mentioned above)." type: string enforcedSampleLimit: description: EnforcedSampleLimit defines global limit on number of scraped samples that will be accepted. This overrides any SampleLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the SampleLimit to keep overall number of samples/series under the desired limit. Note that if SampleLimit is lower that value will be taken instead. format: int64 type: integer enforcedTargetLimit: - description: EnforcedTargetLimit defines a global limit on the number of scraped targets. This overrides any TargetLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the TargetLimit to keep overall number of targets under the desired limit. Note that if TargetLimit is higher that value will be taken instead. + description: EnforcedTargetLimit defines a global limit on the number of scraped targets. This overrides any TargetLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the TargetLimit to keep the overall number of targets under the desired limit. Note that if TargetLimit is lower, that value will be taken instead, except if either value is zero, in which case the non-zero value will be used. If both values are zero, no limit is enforced. format: int64 type: integer evaluationInterval: - description: Interval between consecutive evaluations. + description: 'Interval between consecutive evaluations. Default: `1m`' type: string externalLabels: additionalProperties: @@ -1484,7 +1567,7 @@ spec: type: object type: array initContainers: - description: 'InitContainers allows adding initContainers to the pod definition. Those can be used to e.g. fetch secrets for injection into the Prometheus configuration from external sources. Any errors during the execution of an initContainer will lead to a restart of the Pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ Using initContainers for any use case other then secret fetching is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice.' + description: 'InitContainers allows adding initContainers to the pod definition. Those can be used to e.g. fetch secrets for injection into the Prometheus configuration from external sources. Any errors during the execution of an initContainer will lead to a restart of the Pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ InitContainers described here modify an operator generated init containers if they share the same name and modifications are done via a strategic merge patch. The current init container name is: `init-config-reloader`. Overriding init containers is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice.' items: description: A single application container that you want to run within a pod. properties: @@ -1546,8 +1629,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1852,12 +1939,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1948,12 +2040,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2189,6 +2289,10 @@ spec: logLevel: description: Log level for Prometheus to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string @@ -2221,7 +2325,7 @@ spec: type: string type: object podMonitorNamespaceSelector: - description: Namespaces to be selected for PodMonitor discovery. If nil, only check own namespace. + description: Namespace's labels to match for PodMonitor discovery. If nil, only check own namespace. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. @@ -2391,6 +2495,31 @@ spec: items: description: RemoteReadSpec defines the remote_read configuration for prometheus. properties: + authorization: + description: Authorization section for remote read + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth for the URL. properties: @@ -2426,7 +2555,7 @@ spec: type: object type: object bearerToken: - description: bearer token for remote read. + description: Bearer token for remote read. type: string bearerTokenFile: description: File to read bearer token for remote read. @@ -2434,6 +2563,77 @@ spec: name: description: The name of the remote read queue, must be unique if specified. The name is used in metrics and logging in order to differentiate read configurations. Only valid in Prometheus versions 2.15.0 and newer. type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object proxyUrl: description: Optional ProxyURL type: string @@ -2562,6 +2762,31 @@ spec: items: description: RemoteWriteSpec defines the remote_write configuration for prometheus. properties: + authorization: + description: Authorization section for remote write + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth for the URL. properties: @@ -2597,14 +2822,100 @@ spec: type: object type: object bearerToken: - description: File to read bearer token for remote write. + description: Bearer token for remote write. type: string bearerTokenFile: description: File to read bearer token for remote write. type: string + headers: + additionalProperties: + type: string + description: Custom HTTP headers to be sent along with each remote write request. Be aware that headers that are set by Prometheus itself can't be overwritten. Only valid in Prometheus versions 2.25.0 and newer. + type: object + metadataConfig: + description: MetadataConfig configures the sending of series metadata to remote storage. + properties: + send: + description: Whether metric metadata is sent to remote storage or not. + type: boolean + sendInterval: + description: How frequently metric metadata is sent to remote storage. + type: string + type: object name: description: The name of the remote write queue, must be unique if specified. The name is used in metrics and logging in order to differentiate queues. Only valid in Prometheus versions 2.15.0 and newer. type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object proxyUrl: description: Optional ProxyURL type: string @@ -2639,6 +2950,9 @@ spec: remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string + sendExemplars: + description: Enables sending of exemplars over remote write. Note that exemplar-storage itself must be enabled using the enableFeature option for exemplars to be scraped in the first place. Only valid in Prometheus versions 2.27.0 and newer. + type: boolean tlsConfig: description: TLS Config to use for remote write. properties: @@ -2783,7 +3097,7 @@ spec: description: Name of Prometheus external label used to denote replica name. Defaults to the value of `prometheus_replica`. External label will _not_ be added when value is set to empty string (`""`). type: string replicas: - description: Number of instances to deploy for a Prometheus deployment. + description: Number of replicas of each shard to deploy for a Prometheus deployment. Number of replicas multiplied by shards is the total number of Pods created. format: int32 type: integer resources: @@ -2791,12 +3105,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2804,7 +3126,7 @@ spec: description: Time duration Prometheus shall retain data for. Default is '24h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` (milliseconds seconds minutes hours days weeks years). type: string retentionSize: - description: Maximum amount of disk space used by blocks. + description: 'Maximum amount of disk space used by blocks. Supported units: B, KB, MB, GB, TB, PB, EB. Ex: `512MB`.' type: string routePrefix: description: The route prefix Prometheus registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`. @@ -2887,7 +3209,7 @@ spec: type: object type: object scrapeInterval: - description: Interval between consecutive scrapes. + description: 'Interval between consecutive scrapes. Default: `1m`' type: string scrapeTimeout: description: Number of seconds to wait for target to respond before erroring. @@ -2974,7 +3296,7 @@ spec: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. type: string serviceMonitorNamespaceSelector: - description: Namespaces to be selected for ServiceMonitor discovery. If nil, only check own namespace. + description: Namespace's labels to match for ServiceMonitor discovery. If nil, only check own namespace. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. @@ -3036,6 +3358,10 @@ spec: sha: description: 'SHA of Prometheus container image to be deployed. Defaults to the value of `version`. Similar to a tag, but the SHA explicitly deploys an immutable container image. Version and Tag are ignored if SHA is set. Deprecated: use ''image'' instead. The image digest can be specified as part of the image URL.' type: string + shards: + description: 'EXPERIMENTAL: Number of shards to distribute targets onto. Number of replicas multiplied by shards is the total number of Pods created. Note that scaling down shards will not reshard data onto remaining instances, it must be manually moved. Increasing shards will not reshard data either but it will continue to be available from the same instances. To query globally use Thanos sidecar and Thanos querier or remote write data to a central location. Sharding is done on the content of the `__address__` target meta-label.' + format: int32 + type: integer storage: description: Storage spec to specify how storage shall be used. properties: @@ -3049,8 +3375,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -3107,12 +3437,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -3166,7 +3504,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -3331,7 +3673,7 @@ spec: description: MinTime for Thanos sidecar to be configured with. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. type: string objectStorageConfig: - description: ObjectStorageConfig configures object storage in Thanos. + description: ObjectStorageConfig configures object storage in Thanos. Alternative to ObjectStorageConfigFile, and lower order priority. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -3345,17 +3687,31 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the object storage configuration file. When used alongside with ObjectStorageConfig, ObjectStorageConfigFile takes precedence. + type: string + readyTimeout: + description: ReadyTimeout is the maximum time Thanos sidecar will wait for Prometheus to start. Eg 10m + type: string resources: description: Resources defines the resource requirements for the Thanos sidecar. If not provided, no requests/limits will be set properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -3380,9 +3736,40 @@ spec: required: - key type: object + tracingConfigFile: + description: TracingConfig specifies the path of the tracing configuration file. When used alongside with TracingConfig, TracingConfigFile takes precedence. + type: string version: description: Version describes the version of Thanos to use. type: string + volumeMounts: + description: VolumeMounts allows configuration of additional VolumeMounts on the output StatefulSet definition. VolumeMounts specified will be appended to other VolumeMounts in the thanos-sidecar container. + items: + description: VolumeMount describes a mounting of a Volume within a container. + properties: + mountPath: + description: Path within the container at which the volume should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array type: object tolerations: description: If specified, the pod's tolerations. @@ -3705,8 +4092,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3725,8 +4116,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -4033,8 +4428,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -4289,6 +4688,118 @@ spec: pageTitle: description: The prometheus web page title type: string + tlsConfig: + description: WebTLSConfig defines the TLS parameters for HTTPS. + properties: + cert: + description: Contains the TLS certificate for the server. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cipherSuites: + description: 'List of supported cipher suites for TLS versions up to TLS 1.2. If empty, Go default cipher suites are used. Available cipher suites are documented in the go documentation: https://golang.org/pkg/crypto/tls/#pkg-constants' + items: + type: string + type: array + client_ca: + description: Contains the CA certificate for client certificate authentication to the server. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientAuthType: + description: 'Server policy for client authentication. Maps to ClientAuth Policies. For more detail on clientAuth options: https://golang.org/pkg/crypto/tls/#ClientAuthType' + type: string + curvePreferences: + description: 'Elliptic curves that will be used in an ECDHE handshake, in preference order. Available curves are documented in the go documentation: https://golang.org/pkg/crypto/tls/#CurveID' + items: + type: string + type: array + keySecret: + description: Secret containing the TLS key for the server. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + maxVersion: + description: Maximum TLS version that is acceptable. Defaults to TLS13. + type: string + minVersion: + description: Minimum TLS version that is acceptable. Defaults to TLS12. + type: string + preferServerCipherSuites: + description: Controls whether the server selects the client's most preferred cipher suite, or the server's most preferred cipher suite. If true then the server's preference, as expressed in the order of elements in cipherSuites, is used. + type: boolean + required: + - cert + - keySecret + type: object type: object type: object status: @@ -4299,7 +4810,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed. + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. type: boolean replicas: description: Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector). diff --git a/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index cf990715..a44c7058 100644 --- a/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheusrules.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: PrometheusRule listKind: PrometheusRuleList plural: prometheusrules @@ -17,7 +19,7 @@ spec: - name: v1 schema: openAPIV3Schema: - description: PrometheusRule defines alerting rules for a Prometheus instance + description: PrometheusRule defines recording and alerting rules for a Prometheus instance properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' @@ -43,7 +45,7 @@ spec: type: string rules: items: - description: Rule describes an alerting or recording rule. + description: 'Rule describes an alerting or recording rule See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule' properties: alert: type: string diff --git a/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 6d946998..e5f84771 100644 --- a/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: servicemonitors.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ServiceMonitor listKind: ServiceMonitorList plural: servicemonitors @@ -35,6 +37,28 @@ spec: items: description: Endpoint defines a scrapeable endpoint serving Prometheus metrics. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' properties: @@ -127,6 +151,77 @@ spec: type: string type: object type: array + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object params: additionalProperties: items: @@ -144,7 +239,7 @@ spec: description: ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint. type: string relabelings: - description: 'RelabelConfigs to apply to samples before scraping. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + description: 'RelabelConfigs to apply to samples before scraping. Prometheus Operator automatically adds relabelings for a few standard Kubernetes fields and replaces original scrape job name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines ``-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' properties: @@ -291,10 +386,22 @@ spec: type: object type: array jobLabel: - description: The label to use to retrieve the job name from. + description: "Chooses the label of the Kubernetes `Endpoints`. Its value will be used for the `job`-label's value of the created metrics. \n Default & fallback value: the name of the respective Kubernetes `Endpoint`." type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer namespaceSelector: - description: Selector to select which namespaces the Endpoints objects are discovered from. + description: Selector to select which namespaces the Kubernetes Endpoints objects are discovered from. properties: any: description: Boolean describing whether all namespaces are selected in contrast to a list restricting them. @@ -306,7 +413,7 @@ spec: type: array type: object podTargetLabels: - description: PodTargetLabels transfers labels on the Kubernetes Pod onto the target. + description: PodTargetLabels transfers labels on the Kubernetes `Pod` onto the created metrics. items: type: string type: array @@ -345,7 +452,7 @@ spec: type: object type: object targetLabels: - description: TargetLabels transfers labels on the Kubernetes Service onto the target. + description: TargetLabels transfers labels from the Kubernetes `Service` onto the created metrics. All labels set in `selector.matchLabels` are automatically transferred. items: type: string type: array diff --git a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml index a6c61355..bd9ede43 100644 --- a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: thanosrulers.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ThanosRuler listKind: ThanosRulerList plural: thanosrulers @@ -462,8 +464,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -768,12 +774,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -864,12 +875,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1283,8 +1302,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1589,12 +1612,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1685,12 +1713,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1931,13 +1967,17 @@ spec: logLevel: description: Log level for ThanosRuler to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string description: Define which Nodes the Pods are scheduled on. type: object objectStorageConfig: - description: ObjectStorageConfig configures object storage in Thanos. + description: ObjectStorageConfig configures object storage in Thanos. Alternative to ObjectStorageConfigFile, and lower order priority. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -1951,6 +1991,9 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the object storage configuration file. When used alongside with ObjectStorageConfig, ObjectStorageConfigFile takes precedence. + type: string paused: description: When a ThanosRuler deployment is paused, no actions except for deletion will be performed on the underlying objects. type: boolean @@ -2022,12 +2065,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2186,8 +2237,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -2244,12 +2299,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2303,7 +2366,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -2646,8 +2713,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2666,8 +2737,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -2974,8 +3049,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string diff --git a/manifests/setup/prometheus-operator-clusterRole.yaml b/manifests/setup/prometheus-operator-clusterRole.yaml index 8153e807..5eea2738 100644 --- a/manifests/setup/prometheus-operator-clusterRole.yaml +++ b/manifests/setup/prometheus-operator-clusterRole.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator rules: - apiGroups: diff --git a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml index 00f305a0..3ce2b874 100644 --- a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml +++ b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/setup/prometheus-operator-deployment.yaml b/manifests/setup/prometheus-operator-deployment.yaml index 119f6390..b832acb8 100644 --- a/manifests/setup/prometheus-operator-deployment.yaml +++ b/manifests/setup/prometheus-operator-deployment.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -13,19 +14,22 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus template: metadata: + annotations: + kubectl.kubernetes.io/default-container: prometheus-operator labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 spec: containers: - args: - --kubelet-service=kube-system/kubelet - - --logtostderr=true - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.43.2 - image: quay.io/prometheus-operator/prometheus-operator:v0.43.2 + - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.50.0 + image: quay.io/prometheus-operator/prometheus-operator:v0.50.0 name: prometheus-operator ports: - containerPort: 8080 @@ -44,15 +48,24 @@ spec: - --secure-listen-address=:8443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8080/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy ports: - containerPort: 8443 name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 nodeSelector: - beta.kubernetes.io/os: linux + kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/setup/prometheus-operator-service.yaml b/manifests/setup/prometheus-operator-service.yaml index 37d8f504..a8161b08 100644 --- a/manifests/setup/prometheus-operator-service.yaml +++ b/manifests/setup/prometheus-operator-service.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -16,3 +17,4 @@ spec: selector: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/setup/prometheus-operator-serviceAccount.yaml b/manifests/setup/prometheus-operator-serviceAccount.yaml index cc4a1f8d..9acb906d 100644 --- a/manifests/setup/prometheus-operator-serviceAccount.yaml +++ b/manifests/setup/prometheus-operator-serviceAccount.yaml @@ -4,6 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring diff --git a/scripts/generate-schemas.sh b/scripts/generate-schemas.sh new file mode 100755 index 00000000..06b9bbe3 --- /dev/null +++ b/scripts/generate-schemas.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +DIR="crdschemas" + +# Go to git repository root +cd ./$(git rev-parse --show-cdup) + +rm -rf "$DIR" +mkdir "$DIR" + +for crd in vendor/prometheus-operator/*-crd.libsonnet; do + jq '.spec.versions[0].schema.openAPIV3Schema' < "$crd" > "$DIR/$(basename "$crd" | sed 's/-crd.libsonnet/.json/')" +done diff --git a/scripts/generate-versions.sh b/scripts/generate-versions.sh new file mode 100755 index 00000000..05c48a63 --- /dev/null +++ b/scripts/generate-versions.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -euo pipefail + +# Get component version from GitHub API +get_latest_version() { + echo >&2 "Checking release version for ${1}" + curl --retry 5 --silent --fail -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/repos/${1}/releases/latest" | jq '.tag_name' | tr -d '"v' +} + +# Get component version from version file +get_current_version() { + echo >&2 "Reading currently used version of ${1}" + v=$(jq -r ".${1}" "$VERSION_FILE") + if [ "${v}" == "" ]; then + echo >&2 "Couldn't read version of ${1} from $VERSION_FILE" + exit 1 + fi + echo "$v" +} + +# Get version from online source and filter out unstable releases. In case of unstable release use what is set in version file +get_version() { + component="${1}" + v="$(get_latest_version "${component}")" + + component="$(convert_to_camel_case "$(echo "${component}" | sed 's/^.*\///')")" + cv="$(get_current_version "${component}")" + + # Advanced AI heurestics to filter out common patterns suggesting new version is not stable /s + if [[ "$v" == "" ]] || [[ "$v" == *"alpha"* ]] || [[ "$v" == *"beta"* ]] || [[ "$v" == *"rc"* ]] || [[ "$v" == *"helm"* ]]; then + echo "$cv" + return + fi + + # Use higher version from new version and current version + v=$(printf '%s\n' "$v" "$cv" | sort -r | head -n1) + + echo "$v" +} + +convert_to_camel_case() { + echo "${1}" | sed -E 's/[ _-]([a-z])/\U\1/gi;s/^([A-Z])/\l\1/' +} + +# File is used to read current versions +VERSION_FILE="$(pwd)/jsonnet/kube-prometheus/versions.json" + +# token can be passed as `GITHUB_TOKEN` variable or passed as first argument +GITHUB_TOKEN=${GITHUB_TOKEN:-${1}} + +if [ -z "$GITHUB_TOKEN" ]; then + echo >&2 "GITHUB_TOKEN not set. Exiting" + exit 1 +fi + +cat <<-EOF +{ + "alertmanager": "$(get_version "prometheus/alertmanager")", + "blackboxExporter": "$(get_version "prometheus/blackbox_exporter")", + "grafana": "$(get_version "grafana/grafana")", + "kubeStateMetrics": "$(get_version "kubernetes/kube-state-metrics")", + "nodeExporter": "$(get_version "prometheus/node_exporter")", + "prometheus": "$(get_version "prometheus/prometheus")", + "prometheusAdapter": "$(get_version "kubernetes-sigs/prometheus-adapter")", + "prometheusOperator": "$(get_version "prometheus-operator/prometheus-operator")", + "kubeRbacProxy": "$(get_version "brancz/kube-rbac-proxy")", + "configmapReload": "$(get_version "jimmidyson/configmap-reload")" +} +EOF diff --git a/scripts/go.mod b/scripts/go.mod new file mode 100644 index 00000000..07b37e91 --- /dev/null +++ b/scripts/go.mod @@ -0,0 +1,11 @@ +module _ // go.mod created for tooling dependencies + +go 1.15 + +require ( + github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c + github.com/campoy/embedmd v1.0.0 + github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6 // 7 commits after 0.17.0. Needed by jsonnet linter + github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 + github.com/yannh/kubeconform v0.4.7 +) diff --git a/scripts/go.sum b/scripts/go.sum new file mode 100644 index 00000000..1f96b0ed --- /dev/null +++ b/scripts/go.sum @@ -0,0 +1,74 @@ +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs= +github.com/beevik/etree v1.1.0/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A= +github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c h1:hb6WqfcKQZlNx/vahy51SaIvKnoXD5609Nm0PC4msEM= +github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c/go.mod h1:+00lOjYXPgMfxHVPvg9GDtc3BX5Xh5aFpB4gMB8gfMo= +github.com/campoy/embedmd v1.0.0 h1:V4kI2qTJJLf4J29RzI/MAt2c3Bl4dQSYPuflzwFH2hY= +github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= +github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= +github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/google/go-jsonnet v0.17.0 h1:/9NIEfhK1NQRKl3sP2536b2+x5HnZMdql7x3yK/l8JY= +github.com/google/go-jsonnet v0.17.0/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= +github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6 h1:91EupyycmO5ctzKuWEZ9nX0Cal1NveMiWcXxmRtLyLQ= +github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= +github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 h1:4BKZ6LDqPc2wJDmaKnmYD/vDjUptJtnUpai802MibFc= +github.com/jsonnet-bundler/jsonnet-bundler v0.4.0/go.mod h1:/by7P/OoohkI3q4CgSFqcoFsVY+IaNbzOVDknEsKDeU= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA= +github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM= +github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= +github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw= +github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f h1:J9EGpcZtP0E/raorCMxlFGSTBrsSlaDGf3jU/qvAE2c= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= +github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= +github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= +github.com/yannh/kubeconform v0.4.2 h1:8ve/dz6ns9tT5efR1Qfn8569JkenPFqnWcVWGz3lqPw= +github.com/yannh/kubeconform v0.4.2/go.mod h1:Ysf3RSreh2rX8IJsVt/uT3Um/U3e3ykx6Fcz8nCdskM= +github.com/yannh/kubeconform v0.4.7 h1:ExAjZYd6D0WnG/Eq/IRxvTebPbARh6e6M96Pq8Xy5u0= +github.com/yannh/kubeconform v0.4.7/go.mod h1:lhkEiaDOtSewHGGZ8iR2iiTC0CSnR7xbMEtyL4bm4rE= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= diff --git a/test.sh b/scripts/test.sh similarity index 80% rename from test.sh rename to scripts/test.sh index 58b06fe8..6774c63a 100755 --- a/test.sh +++ b/scripts/test.sh @@ -5,6 +5,8 @@ set -o pipefail # Make sure to use project tooling PATH="$(pwd)/tmp/bin:${PATH}" +TESTFILE="$(pwd)/tmp/test.jsonnet" +mkdir -p "$(pwd)/tmp" for i in examples/jsonnet-snippets/*.jsonnet; do [ -f "$i" ] || break @@ -14,13 +16,13 @@ for i in examples/jsonnet-snippets/*.jsonnet; do snippet="local kp = $fileContent; $( "test.jsonnet" + echo "${snippet}" > "${TESTFILE}" echo "\`\`\`" echo "${snippet}" echo "\`\`\`" echo "" - jsonnet -J vendor "test.jsonnet" > /dev/null - rm -rf "test.jsonnet" + jsonnet -J vendor "${TESTFILE}" > /dev/null + rm -rf "${TESTFILE}" done for i in examples/*.jsonnet; do diff --git a/scripts/tools.go b/scripts/tools.go index b6cba4f2..64813348 100644 --- a/scripts/tools.go +++ b/scripts/tools.go @@ -8,6 +8,8 @@ import ( _ "github.com/brancz/gojsontoyaml" _ "github.com/campoy/embedmd" _ "github.com/google/go-jsonnet/cmd/jsonnet" + _ "github.com/google/go-jsonnet/cmd/jsonnet-lint" _ "github.com/google/go-jsonnet/cmd/jsonnetfmt" + _ "github.com/yannh/kubeconform/cmd/kubeconform" _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" )