### Up Alerting ###

# Fires when more than 10% of a job's scrape targets report up == 0,
# sustained for 10 minutes. The expression yields the percentage of down
# targets per job, which is what {{ $value }} renders in the description.
ALERT TargetDown
  IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
  }

### Dead man's switch ###

# vector(1) always returns a sample, so this alert is permanently firing.
# Per its own description it exists to prove the alerting pipeline works
# end to end; the absence of this alert is the signal that something broke.
ALERT DeadMansSwitch
  IF vector(1)
  LABELS {
    severity = "none",
  }
  ANNOTATIONS {
    summary = "Alerting DeadMansSwitch",
    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
  }

### File descriptor alerts ###

# Fires when a process has had more than 95% of its file descriptor limit
# open for 10 minutes.
ALERT TooManyOpenFileDescriptors
  IF 100 * (process_open_fds / process_max_fds) > 95
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
  }

# Recording rule: fraction (0..1) of the file descriptor limit currently in
# use. The FdExhaustionClose alerts below extrapolate this series.
instance:fd_utilization = process_open_fds / process_max_fds

# Alert if file descriptors are likely to be exhausted within the next
# 4 hours: linear extrapolation of the last hour of utilization predicts a
# value above 1 (i.e. above the limit) 4 hours from now.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust its file/socket descriptors within the next 4 hours",
  }

# Alert if file descriptors are likely to be exhausted within the next
# hour: same extrapolation over the last 10 minutes, looking 1 hour ahead.
# Shares the alert name with the rule above; the two are told apart by the
# severity label and by the horizon stated in the description.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust its file/socket descriptors within the next hour",
  }