metrics: add rule type to evaluation failures and rule groups metrics

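Adds a rule_type label to the prometheus_rule_evaluation_failures_total and
prometheus_rule_group_rules metrics so that each series indicates whether it
relates to alerting or recording rules. Both label values are initialised for
every rule group, so each group now exposes two series per metric instead of
one.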

Signed-off-by: Kyle Stang <kylestng@amazon.com>
This commit is contained in:
Kyle Stang 2025-02-12 22:53:00 +00:00
parent 93e991ef7e
commit 512b335ac7
3 changed files with 38 additions and 11 deletions

View File

@ -107,15 +107,28 @@ func NewGroup(o GroupOptions) *Group {
metrics = NewGroupMetrics(opts.Registerer) metrics = NewGroupMetrics(opts.Registerer)
} }
alertingCount := 0
recordingCount := 0
for _, rule := range o.Rules {
switch rule.(type) {
case *AlertingRule:
alertingCount++
case *RecordingRule:
recordingCount++
}
}
key := GroupKey(o.File, o.Name) key := GroupKey(o.File, o.Name)
metrics.IterationsMissed.WithLabelValues(key) metrics.IterationsMissed.WithLabelValues(key)
metrics.IterationsScheduled.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key)
metrics.EvalTotal.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key)
metrics.EvalFailures.WithLabelValues(key) metrics.EvalFailures.WithLabelValues(key, KindAlerting)
metrics.EvalFailures.WithLabelValues(key, KindRecording)
metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key)
metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key)
metrics.GroupLastRuleDurationSum.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key)
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) metrics.GroupRules.WithLabelValues(key, KindAlerting).Set(float64(alertingCount))
metrics.GroupRules.WithLabelValues(key, KindRecording).Set(float64(recordingCount))
metrics.GroupSamples.WithLabelValues(key) metrics.GroupSamples.WithLabelValues(key)
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
@ -542,7 +555,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
rule.SetHealth(HealthBad) rule.SetHealth(HealthBad)
rule.SetLastError(err) rule.SetLastError(err)
sp.SetStatus(codes.Error, err.Error()) sp.SetStatus(codes.Error, err.Error())
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
switch rule.(type) {
case *AlertingRule:
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
case *RecordingRule:
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
}
// Canceled queries are intentional termination of queries. This normally // Canceled queries are intentional termination of queries. This normally
// happens on shutdown and thus we skip logging of any errors here. // happens on shutdown and thus we skip logging of any errors here.
@ -572,7 +591,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
rule.SetHealth(HealthBad) rule.SetHealth(HealthBad)
rule.SetLastError(err) rule.SetLastError(err)
sp.SetStatus(codes.Error, err.Error()) sp.SetStatus(codes.Error, err.Error())
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
switch rule.(type) {
case *AlertingRule:
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
case *RecordingRule:
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
}
logger.Warn("Rule sample appending failed", "err", err) logger.Warn("Rule sample appending failed", "err", err)
return return
@ -974,7 +999,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Name: "rule_evaluation_failures_total", Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.", Help: "The total number of rule evaluation failures.",
}, },
[]string{"rule_group"}, []string{"rule_group", "rule_type"},
), ),
GroupInterval: prometheus.NewGaugeVec( GroupInterval: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
@ -1022,7 +1047,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Name: "rule_group_rules", Name: "rule_group_rules",
Help: "The number of rules.", Help: "The number of rules.",
}, },
[]string{"rule_group"}, []string{"rule_group", "rule_type"},
), ),
GroupSamples: prometheus.NewGaugeVec( GroupSamples: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{

View File

@ -269,11 +269,13 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
m.IterationsMissed.DeleteLabelValues(n) m.IterationsMissed.DeleteLabelValues(n)
m.IterationsScheduled.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n)
m.EvalTotal.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n)
m.EvalFailures.DeleteLabelValues(n) m.EvalFailures.DeleteLabelValues(n, KindAlerting)
m.EvalFailures.DeleteLabelValues(n, KindRecording)
m.GroupInterval.DeleteLabelValues(n) m.GroupInterval.DeleteLabelValues(n)
m.GroupLastEvalTime.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n)
m.GroupLastDuration.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n)
m.GroupRules.DeleteLabelValues(n) m.GroupRules.DeleteLabelValues(n, KindAlerting)
m.GroupRules.DeleteLabelValues(n, KindRecording)
m.GroupSamples.DeleteLabelValues((n)) m.GroupSamples.DeleteLabelValues((n))
} }
wg.Done() wg.Done()

View File

@ -1024,11 +1024,11 @@ func TestMetricsUpdate(t *testing.T) {
}{ }{
{ {
files: files, files: files,
metrics: 12, metrics: 16,
}, },
{ {
files: files[:1], files: files[:1],
metrics: 6, metrics: 8,
}, },
{ {
files: files[:0], files: files[:0],
@ -1036,7 +1036,7 @@ func TestMetricsUpdate(t *testing.T) {
}, },
{ {
files: files[1:], files: files[1:],
metrics: 6, metrics: 8,
}, },
} }