From e2f037e554a8a336ca4487324dd4ceee893fda0a Mon Sep 17 00:00:00 2001
From: Julien Duchesne <julien.duchesne@grafana.com>
Date: Fri, 13 Dec 2024 16:42:46 -0500
Subject: [PATCH] rules: Add new `RuleEvaluationTimeSum` field to groups
 (#15672)

* feat(ruler): Add new `RuleEvaluationTimeSum` field to groups
Coupled with a metric: `rule_group_last_rule_duration_sum_seconds`

This will give us more observability into how fast a group runs with or without concurrency

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Update rules/group.go

Co-authored-by: gotjosh <josue.abreu@gmail.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Apply suggestions from code review

Co-authored-by: gotjosh <josue.abreu@gmail.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Remove `in seconds`. A duration is a duration

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

---------

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Co-authored-by: gotjosh <josue.abreu@gmail.com>
---
 rules/group.go        | 60 +++++++++++++++++++++++++++++++++----------
 rules/manager.go      |  1 +
 rules/manager_test.go | 12 ++++++---
 3 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/rules/group.go b/rules/group.go
index 1f7953188b..1ce15be9f2 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -44,19 +44,20 @@ import (
 
 // Group is a set of rules that have a logical relation.
 type Group struct {
-	name                 string
-	file                 string
-	interval             time.Duration
-	queryOffset          *time.Duration
-	limit                int
-	rules                []Rule
-	seriesInPreviousEval []map[string]labels.Labels // One per Rule.
-	staleSeries          []labels.Labels
-	opts                 *ManagerOptions
-	mtx                  sync.Mutex
-	evaluationTime       time.Duration
-	lastEvaluation       time.Time // Wall-clock time of most recent evaluation.
-	lastEvalTimestamp    time.Time // Time slot used for most recent evaluation.
+	name                  string
+	file                  string
+	interval              time.Duration
+	queryOffset           *time.Duration
+	limit                 int
+	rules                 []Rule
+	seriesInPreviousEval  []map[string]labels.Labels // One per Rule.
+	staleSeries           []labels.Labels
+	opts                  *ManagerOptions
+	mtx                   sync.Mutex
+	evaluationTime        time.Duration // Time it took to evaluate the group.
+	evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
+	lastEvaluation        time.Time     // Wall-clock time of most recent evaluation.
+	lastEvalTimestamp     time.Time     // Time slot used for most recent evaluation.
 
 	shouldRestore bool
 
@@ -115,6 +116,7 @@ func NewGroup(o GroupOptions) *Group {
 	metrics.EvalFailures.WithLabelValues(key)
 	metrics.GroupLastEvalTime.WithLabelValues(key)
 	metrics.GroupLastDuration.WithLabelValues(key)
+	metrics.GroupLastRuleDurationSum.WithLabelValues(key)
 	metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
 	metrics.GroupSamples.WithLabelValues(key)
 	metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
@@ -370,6 +372,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
 	g.evaluationTime = dur
 }
 
+// GetRuleEvaluationTimeSum returns the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
+func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
+	g.mtx.Lock()
+	defer g.mtx.Unlock()
+	return g.evaluationRuleTimeSum
+}
+
+// updateRuleEvaluationTimeSum updates evaluationRuleTimeSum which is the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
+// It collects the times from the rules themselves.
+func (g *Group) updateRuleEvaluationTimeSum() {
+	var sum time.Duration
+	for _, rule := range g.rules {
+		sum += rule.GetEvaluationDuration()
+	}
+
+	g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds())
+
+	g.mtx.Lock()
+	defer g.mtx.Unlock()
+	g.evaluationRuleTimeSum = sum
+}
+
 // GetLastEvaluation returns the time the last evaluation of the rule group took place.
 func (g *Group) GetLastEvaluation() time.Time {
 	g.mtx.Lock()
@@ -874,6 +898,7 @@ type Metrics struct {
 	GroupInterval            *prometheus.GaugeVec
 	GroupLastEvalTime        *prometheus.GaugeVec
 	GroupLastDuration        *prometheus.GaugeVec
+	GroupLastRuleDurationSum *prometheus.GaugeVec
 	GroupLastRestoreDuration *prometheus.GaugeVec
 	GroupRules               *prometheus.GaugeVec
 	GroupSamples             *prometheus.GaugeVec
@@ -952,6 +977,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 			},
 			[]string{"rule_group"},
 		),
+		GroupLastRuleDurationSum: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "rule_group_last_rule_duration_sum_seconds",
+				Help:      "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
+			},
+			[]string{"rule_group"},
+		),
 		GroupLastRestoreDuration: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
 				Namespace: namespace,
@@ -989,6 +1022,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 			m.GroupInterval,
 			m.GroupLastEvalTime,
 			m.GroupLastDuration,
+			m.GroupLastRuleDurationSum,
 			m.GroupLastRestoreDuration,
 			m.GroupRules,
 			m.GroupSamples,
diff --git a/rules/manager.go b/rules/manager.go
index df3d48700a..edc67a832b 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
 	timeSinceStart := time.Since(start)
 
 	g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
+	g.updateRuleEvaluationTimeSum()
 	g.setEvaluationTime(timeSinceStart)
 	g.setLastEvaluation(start)
 	g.setLastEvalTimestamp(evalTimestamp)
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 327addcbc1..94ee1e8b8b 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -1985,7 +1985,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 			require.Len(t, group.rules, ruleCount)
 
 			start := time.Now()
-			group.Eval(ctx, start)
+			DefaultEvalIterationFunc(ctx, group, start)
 
 			// Never expect more than 1 inflight query at a time.
 			require.EqualValues(t, 1, maxInflight.Load())
@@ -1993,6 +1993,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 			require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
 			// Each rule produces one vector.
 			require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
+			// Group duration is higher than the sum of rule durations (group overhead).
+			require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
 		}
 	})
 
@@ -2023,7 +2025,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 			require.Len(t, group.rules, ruleCount)
 
 			start := time.Now()
-			group.Eval(ctx, start)
+			DefaultEvalIterationFunc(ctx, group, start)
 
 			// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
 			require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@@ -2061,7 +2063,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 			require.Len(t, group.rules, ruleCount)
 
 			start := time.Now()
-			group.Eval(ctx, start)
+			DefaultEvalIterationFunc(ctx, group, start)
 
 			// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
 			require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@@ -2100,7 +2102,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 
 			start := time.Now()
 
-			group.Eval(ctx, start)
+			DefaultEvalIterationFunc(ctx, group, start)
 
 			// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
 			require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
@@ -2108,6 +2110,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
 			require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
 			// Each rule produces one vector.
 			require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
+			// Group duration is less than the sum of rule durations
+			require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
 		}
 	})