From 512b335ac76d9ccd63743088e8c591d65a51e58e Mon Sep 17 00:00:00 2001 From: Kyle Stang Date: Wed, 12 Feb 2025 22:53:00 +0000 Subject: [PATCH] metrics: add rule type to evaluation failures and rule groups metrics Adds a rule_type label to prometheus_rule_evaluation_failures_total and prometheus_rule_group_rules metrics to describe whether the metrics relate to alerting or recording rules. Signed-off-by: Kyle Stang --- rules/group.go | 37 +++++++++++++++++++++++++++++++------ rules/manager.go | 6 ++++-- rules/manager_test.go | 6 +++--- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/rules/group.go b/rules/group.go index c9f5162cda..52779f99eb 100644 --- a/rules/group.go +++ b/rules/group.go @@ -107,15 +107,28 @@ func NewGroup(o GroupOptions) *Group { metrics = NewGroupMetrics(opts.Registerer) } + alertingCount := 0 + recordingCount := 0 + for _, rule := range o.Rules { + switch rule.(type) { + case *AlertingRule: + alertingCount++ + case *RecordingRule: + recordingCount++ + } + } + key := GroupKey(o.File, o.Name) metrics.IterationsMissed.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) - metrics.EvalFailures.WithLabelValues(key) + metrics.EvalFailures.WithLabelValues(key, KindAlerting) + metrics.EvalFailures.WithLabelValues(key, KindRecording) metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) - metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) + metrics.GroupRules.WithLabelValues(key, KindAlerting).Set(float64(alertingCount)) + metrics.GroupRules.WithLabelValues(key, KindRecording).Set(float64(recordingCount)) metrics.GroupSamples.WithLabelValues(key) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) @@ -542,7 +555,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + switch rule.(type) { + case *AlertingRule: + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc() + case *RecordingRule: + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc() + } // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. @@ -572,7 +591,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + switch rule.(type) { + case *AlertingRule: + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc() + case *RecordingRule: + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc() + } logger.Warn("Rule sample appending failed", "err", err) return @@ -974,7 +999,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_evaluation_failures_total", Help: "The total number of rule evaluation failures.", }, - []string{"rule_group"}, + []string{"rule_group", "rule_type"}, ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -1022,7 +1047,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_group_rules", Help: "The number of rules.", }, - []string{"rule_group"}, + []string{"rule_group", "rule_type"}, ), GroupSamples: prometheus.NewGaugeVec( prometheus.GaugeOpts{ diff --git a/rules/manager.go b/rules/manager.go index a38be82ebe..f673483cc1 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -269,11 +269,13 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels m.IterationsMissed.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n) - m.EvalFailures.DeleteLabelValues(n) + m.EvalFailures.DeleteLabelValues(n, KindAlerting) + m.EvalFailures.DeleteLabelValues(n, KindRecording) m.GroupInterval.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n) - m.GroupRules.DeleteLabelValues(n) + m.GroupRules.DeleteLabelValues(n, KindAlerting) + m.GroupRules.DeleteLabelValues(n, KindRecording) m.GroupSamples.DeleteLabelValues((n)) } wg.Done() diff --git a/rules/manager_test.go b/rules/manager_test.go index 46a87787ce..e2d7da9825 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -1024,11 +1024,11 @@ func TestMetricsUpdate(t *testing.T) { }{ { files: files, - metrics: 12, + metrics: 16, }, { files: files[:1], - metrics: 6, + metrics: 8, }, { files: files[:0], @@ -1036,7 +1036,7 @@ func TestMetricsUpdate(t *testing.T) { }, { files: files[1:], - metrics: 6, + metrics: 8, }, }