# NOTE: These rules were kindly contributed by the SoundCloud engineering team.

### Container resources ###

cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:spec_cpu_shares =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_cpu_shares{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:cpu_usage:rate =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      irate(
        container_cpu_usage_seconds_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_working_set:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_working_set_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_rss:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_rss{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_cache:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_cache{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:disk_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_disk_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_pagefaults:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failures_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_oom:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failcnt{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

### Cluster resources ###

cluster:memory_allocation:percent =
  100 * sum by (cluster) (
    container_spec_memory_limit_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:memory_used:percent =
  100 * sum by (cluster) (
    container_memory_usage_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:cpu_allocation:percent =
  100 * sum by (cluster) (
    container_spec_cpu_shares{pod_name!=""}
  ) / sum by (cluster) (
    container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
  )

cluster:node_cpu_use:percent =
  100 * sum by (cluster) (
    rate(node_cpu{mode!="idle"}[5m])
  ) / sum by (cluster) (
    machine_cpu_cores
  )
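# Example (illustrative, not part of the contributed rules): because the
# container-level records above share the same label set, they can be combined
# directly in ad-hoc queries or dashboards, e.g. working-set memory as a
# percentage of the configured limit, per controller:
#
#   100 * cluster_namespace_controller_pod_container:memory_working_set:bytes
#       / cluster_namespace_controller_pod_container:spec_memory_limit_bytes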
### API latency ###

# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by (le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by (le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by (le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6

### Scheduling latency ###

cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6

cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6

cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
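# Example (illustrative, not part of the contributed rules): the recorded
# quantiles above can be graphed or thresholded directly instead of recomputing
# histogram_quantile over the raw buckets, e.g. resources/verbs whose 99th
# percentile apiserver latency exceeds one second:
#
#   cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} > 1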
ALERT K8SNodeDown
  IF up{job="kubelets"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
  }

ALERT K8SNodeNotReady
  IF kube_node_status_ready{condition="true"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour.",
  }

ALERT K8SManyNodesNotReady
  IF
    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
    AND
      (
        count by (cluster) (kube_node_status_ready{condition="true"} == 0)
      /
        count by (cluster) (kube_node_status_ready{condition="true"})
      ) > 0.2
  FOR 1m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many K8s nodes are Not Ready",
    description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.",
  }

ALERT K8SKubeletNodeExporterDown
  IF up{job="node-exporter"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet node_exporter cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.",
  }

ALERT K8SKubeletDown
  IF
    absent(up{job="kubelets"})
    or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

ALERT K8SApiserverDown
  IF up{job="kubernetes"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "An API server could not be scraped.",
  }

# Disable for non-HA Kubernetes setups.
ALERT K8SApiserverDown
  IF
    absent({job="kubernetes"})
    or count by (cluster) (up{job="kubernetes"} == 1) < 2
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
  }

ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"}) or (count by (cluster) (up{job="kube-scheduler"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
  }

ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"}) or (count by (cluster) (up{job="kube-controller-manager"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
  }
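# Example (illustrative, not part of the contributed rules): an ad-hoc check of
# how many scheduler and controller-manager instances are currently being
# scraped successfully per cluster, which is the quantity the alerts above and
# below reason about:
#
#   count by (cluster, job) (up{job=~"kube-scheduler|kube-controller-manager"} == 1)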
ALERT K8SMoreThanOneController
  IF count by (job,cluster) (up{job=~"kube-scheduler|kube-controller-manager"}) > 1
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "More than one controller node is active",
    description = "There is more than one {{ $labels.job }} managing the cluster. Cluster behaviour is undefined.",
  }

ALERT K8SConntrackTableFull
  IF 100 * node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

ALERT K8SConntrackTableFull
  IF 100 * node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

# To catch the conntrack sysctl de-tuning when it happens.
ALERT K8SConntrackTuningMissing
  IF node_nf_conntrack_udp_timeout > 10
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node does not have the correct conntrack tunings",
    description = "Nodes keep unsetting the correct tunings; investigate when this happens.",
  }

ALERT K8STooManyOpenFiles
  IF 100 * process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

ALERT K8STooManyOpenFiles
  IF 100 * process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }

ALERT K8SApiServerEtcdAccessLatency
  IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Access to etcd is slow",
    description = "99th percentile latency for the apiserver to access etcd is higher than 1s.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{ $labels.instance }} is running {{ $value }} pods, close to the limit of 110.",
  }
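# Example (illustrative, not part of the contributed rules): to see which
# Kubelets are closest to the 110-pod limit that the alert above warns about:
#
#   topk(10, kubelet_running_pod_count)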