feat: add histogram metric for notification_latency_seconds (#16637)

This metric can be used to alert on how many notifications do or do not
finish within a given amount of time.

Change-Id: afbf3d8ceb3994c7d6220389353cff92
Signed-off-by: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com>
Co-authored-by: Björn Rabenstein <github@rabenste.in>

Kevin Hellemun 2025-11-11 13:47:37 +01:00 committed by GitHub
parent 9d508a4888
commit 33082be0e2
3 changed files with 33 additions and 7 deletions
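The commit description above talks about alerting on how many notifications finish in time. As a hedged sketch of such a rule (assuming the metric's full name comes out as prometheus_notifications_latency_histogram_seconds, following the namespace and subsystem constants used in the diff below; the 10s threshold and 95% target are purely illustrative), the classic buckets could be queried like this:

    # Fire if fewer than 95% of notifications over the last 5 minutes finished within 10 seconds.
    sum(rate(prometheus_notifications_latency_histogram_seconds_bucket{le="10"}[5m]))
      /
    sum(rate(prometheus_notifications_latency_histogram_seconds_count[5m]))
      < 0.95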


@@ -111,7 +111,8 @@ func (s *alertmanagerSet) sync(tgs []*targetgroup.Group) {
 		if _, ok := seen[us]; ok {
 			continue
 		}
-		s.metrics.latency.DeleteLabelValues(us)
+		s.metrics.latencySummary.DeleteLabelValues(us)
+		s.metrics.latencyHistogram.DeleteLabelValues(us)
 		s.metrics.sent.DeleteLabelValues(us)
 		s.metrics.errors.DeleteLabelValues(us)
 		seen[us] = struct{}{}
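In the hunk above, both the renamed summary and the new histogram have their per-Alertmanager label series deleted in the same cleanup path, so neither metric keeps stale series for URLs the set no longer tracks. For context, a minimal standalone sketch of DeleteLabelValues on a histogram vector (all names and values here are illustrative, not the PR's code):

    package main

    import (
    	"fmt"

    	"github.com/prometheus/client_golang/prometheus"
    )

    func main() {
    	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
    		Name:    "example_latency_seconds",
    		Help:    "Example latency histogram.",
    		Buckets: []float64{.01, .1, 1, 10},
    	}, []string{"alertmanager"})

    	// Record one observation for a specific Alertmanager URL, then drop that series again.
    	hist.WithLabelValues("http://am-1:9093").Observe(0.25)
    	fmt.Println(hist.DeleteLabelValues("http://am-1:9093")) // true: the child series existed
    }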


@@ -498,7 +498,9 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 				amSetCovered.CompareAndSwap(k, false, true)
 			}
-			n.metrics.latency.WithLabelValues(url).Observe(time.Since(begin).Seconds())
+			durationSeconds := time.Since(begin).Seconds()
+			n.metrics.latencySummary.WithLabelValues(url).Observe(durationSeconds)
+			n.metrics.latencyHistogram.WithLabelValues(url).Observe(durationSeconds)
 			n.metrics.sent.WithLabelValues(url).Add(float64(count))
 			wg.Done()
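In sendAll, the elapsed time is now computed once and observed into both the summary and the histogram, so both metrics record the identical value for a given notification attempt. A rough standalone sketch of that pattern (metric names, label, and objectives are illustrative, not taken from the PR):

    package main

    import (
    	"time"

    	"github.com/prometheus/client_golang/prometheus"
    )

    var (
    	latencySummary = prometheus.NewSummaryVec(prometheus.SummaryOpts{
    		Name:       "example_latency_seconds",
    		Help:       "Example latency summary.",
    		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
    	}, []string{"alertmanager"})

    	latencyHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
    		Name:    "example_latency_histogram_seconds",
    		Help:    "Example latency histogram.",
    		Buckets: []float64{.01, .1, 1, 10},
    	}, []string{"alertmanager"})
    )

    // observeSend records one send's latency in both vectors, mirroring the pattern above.
    func observeSend(url string, begin time.Time) {
    	durationSeconds := time.Since(begin).Seconds()
    	latencySummary.WithLabelValues(url).Observe(durationSeconds)
    	latencyHistogram.WithLabelValues(url).Observe(durationSeconds)
    }

    func main() {
    	observeSend("http://am-1:9093", time.Now())
    }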


@@ -13,10 +13,15 @@
 package notifier

-import "github.com/prometheus/client_golang/prometheus"
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+)

 type alertMetrics struct {
-	latency *prometheus.SummaryVec
+	latencySummary *prometheus.SummaryVec
+	latencyHistogram *prometheus.HistogramVec
 	errors *prometheus.CounterVec
 	sent *prometheus.CounterVec
 	dropped prometheus.Counter
@@ -25,9 +30,13 @@ type alertMetrics struct {
 	alertmanagersDiscovered prometheus.GaugeFunc
 }

-func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanagersDiscovered func() float64) *alertMetrics {
+func newAlertMetrics(
+	r prometheus.Registerer,
+	queueCap int,
+	queueLen, alertmanagersDiscovered func() float64,
+) *alertMetrics {
 	m := &alertMetrics{
-		latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+		latencySummary: prometheus.NewSummaryVec(prometheus.SummaryOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
 			Name: "latency_seconds",
@@ -36,6 +45,19 @@ func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanag
 		},
 			[]string{alertmanagerLabel},
 		),
+		latencyHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name: "latency_histogram_seconds",
+			Help: "Latency histogram for sending alert notifications.",
+			Buckets: []float64{.01, .1, 1, 10},
+			NativeHistogramBucketFactor: 1.1,
+			NativeHistogramMaxBucketNumber: 100,
+			NativeHistogramMinResetDuration: 1 * time.Hour,
+		},
+			[]string{alertmanagerLabel},
+		),
 		errors: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -80,7 +102,8 @@ func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanag
 	if r != nil {
 		r.MustRegister(
-			m.latency,
+			m.latencySummary,
+			m.latencyHistogram,
 			m.errors,
 			m.sent,
 			m.dropped,
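The new HistogramOpts combine four classic buckets (0.01, 0.1, 1, and 10 seconds) with native-histogram settings, so scrapers without native-histogram support still get usable buckets, while capable scrapers get a sparse histogram whose bucket widths grow by roughly a factor of 1.1, capped at 100 buckets, with resets no more often than once per hour. The existing summary keeps the latency_seconds name while the histogram is exposed under latency_histogram_seconds, and both vectors are registered in the MustRegister call above. As a self-contained sketch of an equivalent vector outside this codebase (the registry, the URL, and the assumption that the package constants resolve to "prometheus" and "notifications" are mine, not the PR's):

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/prometheus/client_golang/prometheus"
    )

    func main() {
    	reg := prometheus.NewRegistry()

    	latencyHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
    		Namespace: "prometheus",
    		Subsystem: "notifications",
    		Name:      "latency_histogram_seconds",
    		Help:      "Latency histogram for sending alert notifications.",
    		// Classic buckets, exposed as ..._bucket{le="..."} series.
    		Buckets: []float64{.01, .1, 1, 10},
    		// Native-histogram settings, used when the scraper negotiates native histograms.
    		NativeHistogramBucketFactor:     1.1,
    		NativeHistogramMaxBucketNumber:  100,
    		NativeHistogramMinResetDuration: 1 * time.Hour,
    	}, []string{"alertmanager"})

    	reg.MustRegister(latencyHistogram)
    	latencyHistogram.WithLabelValues("http://am-1:9093").Observe(0.42)

    	// Gathering shows the fully qualified family name.
    	mfs, err := reg.Gather()
    	if err != nil {
    		panic(err)
    	}
    	for _, mf := range mfs {
    		fmt.Println(mf.GetName()) // prometheus_notifications_latency_histogram_seconds
    	}
    }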