prometheus/notifier/metric.go
Kevin Hellemun 33082be0e2
feat: add histogram metric for notification_latency_seconds (#16637)
This metric can be used to create alerting based on how many
notifications finish or do not finish within a certain amount of time.

Change-Id: afbf3d8ceb3994c7d6220389353cff92
Signed-Off-By: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com>
Co-authored-by: Björn Rabenstein <github@rabenste.in>

---------

Signed-off-by: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com>
Co-authored-by: Björn Rabenstein <github@rabenste.in>
2025-11-11 13:47:37 +01:00

118 lines
3.6 KiB
Go

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package notifier
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
// alertMetrics bundles the Prometheus collectors that the notifier exposes
// about sending alert notifications. Instances are built by newAlertMetrics.
type alertMetrics struct {
// latencySummary tracks latency quantiles for sending alert notifications,
// partitioned by alertmanagerLabel.
latencySummary *prometheus.SummaryVec
// latencyHistogram tracks the same send latency as a histogram,
// partitioned by alertmanagerLabel.
latencyHistogram *prometheus.HistogramVec
// errors counts sent alerts affected by errors, partitioned by
// alertmanagerLabel.
errors *prometheus.CounterVec
// sent counts alerts sent, partitioned by alertmanagerLabel.
sent *prometheus.CounterVec
// dropped counts alerts dropped due to errors when sending to Alertmanager.
dropped prometheus.Counter
// queueLength reports the number of alert notifications in the queue via a
// callback supplied to newAlertMetrics.
queueLength prometheus.GaugeFunc
// queueCapacity holds the capacity of the alert notifications queue.
queueCapacity prometheus.Gauge
// alertmanagersDiscovered reports the number of alertmanagers discovered
// and active via a callback supplied to newAlertMetrics.
alertmanagersDiscovered prometheus.GaugeFunc
}
// newAlertMetrics builds every collector of alertMetrics and, if r is non-nil,
// registers all of them with r.
//
// queueCap is written into the queue_capacity gauge. queueLen and
// alertmanagersDiscovered are the callbacks that back the queue_length and
// alertmanagers_discovered gauge functions respectively.
//
// Registration goes through MustRegister, so a failing registration (for
// example a duplicate collector) panics rather than returning an error.
func newAlertMetrics(
	r prometheus.Registerer,
	queueCap int,
	queueLen, alertmanagersDiscovered func() float64,
) *alertMetrics {
	// Per-Alertmanager latency, exposed both as a summary and as a histogram.
	summary := prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Namespace:  namespace,
			Subsystem:  subsystem,
			Name:       "latency_seconds",
			Help:       "Latency quantiles for sending alert notifications.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		},
		[]string{alertmanagerLabel},
	)
	histogram := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace:                       namespace,
			Subsystem:                       subsystem,
			Name:                            "latency_histogram_seconds",
			Help:                            "Latency histogram for sending alert notifications.",
			Buckets:                         []float64{.01, .1, 1, 10},
			NativeHistogramBucketFactor:     1.1,
			NativeHistogramMaxBucketNumber:  100,
			NativeHistogramMinResetDuration: time.Hour,
		},
		[]string{alertmanagerLabel},
	)

	// Per-Alertmanager outcome counters.
	errorsTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "errors_total",
			Help:      "Total number of sent alerts affected by errors.",
		},
		[]string{alertmanagerLabel},
	)
	sentTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_total",
			Help:      "Total number of alerts sent.",
		},
		[]string{alertmanagerLabel},
	)
	droppedTotal := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: namespace,
		Subsystem: subsystem,
		Name:      "dropped_total",
		Help:      "Total number of alerts dropped due to errors when sending to Alertmanager.",
	})

	// Queue state: the length comes from a callback, the capacity is fixed here.
	length := prometheus.NewGaugeFunc(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "queue_length",
			Help:      "The number of alert notifications in the queue.",
		},
		queueLen,
	)
	capacity := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: namespace,
		Subsystem: subsystem,
		Name:      "queue_capacity",
		Help:      "The capacity of the alert notifications queue.",
	})
	capacity.Set(float64(queueCap))

	// This metric spells out its full name instead of using Namespace/Subsystem.
	discovered := prometheus.NewGaugeFunc(
		prometheus.GaugeOpts{
			Name: "prometheus_notifications_alertmanagers_discovered",
			Help: "The number of alertmanagers discovered and active.",
		},
		alertmanagersDiscovered,
	)

	m := &alertMetrics{
		latencySummary:          summary,
		latencyHistogram:        histogram,
		errors:                  errorsTotal,
		sent:                    sentTotal,
		dropped:                 droppedTotal,
		queueLength:             length,
		queueCapacity:           capacity,
		alertmanagersDiscovered: discovered,
	}

	// Without a registerer the collectors are still usable, just not exported.
	if r == nil {
		return m
	}
	r.MustRegister(
		m.latencySummary,
		m.latencyHistogram,
		m.errors,
		m.sent,
		m.dropped,
		m.queueLength,
		m.queueCapacity,
		m.alertmanagersDiscovered,
	)
	return m
}