mirror of
https://github.com/prometheus/prometheus.git
synced 2025-08-06 14:17:12 +02:00
metrics: add rule type to evaluation failures and rule groups metrics
Adds a `rule_type` label to the `prometheus_rule_evaluation_failures_total` and `prometheus_rule_group_rules` metrics to indicate whether each series relates to alerting or recording rules.

Signed-off-by: Kyle Stang <kylestng@amazon.com>
This commit is contained in:
parent
93e991ef7e
commit
512b335ac7
@ -107,15 +107,28 @@ func NewGroup(o GroupOptions) *Group {
|
||||
metrics = NewGroupMetrics(opts.Registerer)
|
||||
}
|
||||
|
||||
alertingCount := 0
|
||||
recordingCount := 0
|
||||
for _, rule := range o.Rules {
|
||||
switch rule.(type) {
|
||||
case *AlertingRule:
|
||||
alertingCount++
|
||||
case *RecordingRule:
|
||||
recordingCount++
|
||||
}
|
||||
}
|
||||
|
||||
key := GroupKey(o.File, o.Name)
|
||||
metrics.IterationsMissed.WithLabelValues(key)
|
||||
metrics.IterationsScheduled.WithLabelValues(key)
|
||||
metrics.EvalTotal.WithLabelValues(key)
|
||||
metrics.EvalFailures.WithLabelValues(key)
|
||||
metrics.EvalFailures.WithLabelValues(key, KindAlerting)
|
||||
metrics.EvalFailures.WithLabelValues(key, KindRecording)
|
||||
metrics.GroupLastEvalTime.WithLabelValues(key)
|
||||
metrics.GroupLastDuration.WithLabelValues(key)
|
||||
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
|
||||
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
|
||||
metrics.GroupRules.WithLabelValues(key, KindAlerting).Set(float64(alertingCount))
|
||||
metrics.GroupRules.WithLabelValues(key, KindRecording).Set(float64(recordingCount))
|
||||
metrics.GroupSamples.WithLabelValues(key)
|
||||
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
|
||||
|
||||
@ -542,7 +555,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
||||
rule.SetHealth(HealthBad)
|
||||
rule.SetLastError(err)
|
||||
sp.SetStatus(codes.Error, err.Error())
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
|
||||
|
||||
switch rule.(type) {
|
||||
case *AlertingRule:
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
|
||||
case *RecordingRule:
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
|
||||
}
|
||||
|
||||
// Canceled queries are intentional termination of queries. This normally
|
||||
// happens on shutdown and thus we skip logging of any errors here.
|
||||
@ -572,7 +591,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
||||
rule.SetHealth(HealthBad)
|
||||
rule.SetLastError(err)
|
||||
sp.SetStatus(codes.Error, err.Error())
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
|
||||
|
||||
switch rule.(type) {
|
||||
case *AlertingRule:
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
|
||||
case *RecordingRule:
|
||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
|
||||
}
|
||||
|
||||
logger.Warn("Rule sample appending failed", "err", err)
|
||||
return
|
||||
@ -974,7 +999,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||
Name: "rule_evaluation_failures_total",
|
||||
Help: "The total number of rule evaluation failures.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
[]string{"rule_group", "rule_type"},
|
||||
),
|
||||
GroupInterval: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
@ -1022,7 +1047,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||
Name: "rule_group_rules",
|
||||
Help: "The number of rules.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
[]string{"rule_group", "rule_type"},
|
||||
),
|
||||
GroupSamples: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
|
@ -269,11 +269,13 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
|
||||
m.IterationsMissed.DeleteLabelValues(n)
|
||||
m.IterationsScheduled.DeleteLabelValues(n)
|
||||
m.EvalTotal.DeleteLabelValues(n)
|
||||
m.EvalFailures.DeleteLabelValues(n)
|
||||
m.EvalFailures.DeleteLabelValues(n, KindAlerting)
|
||||
m.EvalFailures.DeleteLabelValues(n, KindRecording)
|
||||
m.GroupInterval.DeleteLabelValues(n)
|
||||
m.GroupLastEvalTime.DeleteLabelValues(n)
|
||||
m.GroupLastDuration.DeleteLabelValues(n)
|
||||
m.GroupRules.DeleteLabelValues(n)
|
||||
m.GroupRules.DeleteLabelValues(n, KindAlerting)
|
||||
m.GroupRules.DeleteLabelValues(n, KindRecording)
|
||||
m.GroupSamples.DeleteLabelValues((n))
|
||||
}
|
||||
wg.Done()
|
||||
|
@ -1024,11 +1024,11 @@ func TestMetricsUpdate(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
files: files,
|
||||
metrics: 12,
|
||||
metrics: 16,
|
||||
},
|
||||
{
|
||||
files: files[:1],
|
||||
metrics: 6,
|
||||
metrics: 8,
|
||||
},
|
||||
{
|
||||
files: files[:0],
|
||||
@ -1036,7 +1036,7 @@ func TestMetricsUpdate(t *testing.T) {
|
||||
},
|
||||
{
|
||||
files: files[1:],
|
||||
metrics: 6,
|
||||
metrics: 8,
|
||||
},
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user