From ff67596a82ef683c0097d75fefd1b5938ad0320f Mon Sep 17 00:00:00 2001
From: beorn7
Date: Wed, 7 May 2025 15:59:48 +0200
Subject: [PATCH 1/2] promql: Simplify avg aggregation and avg_over_time

As it turns out, if we combine Kahan summation and incremental mean
calculation properly, it works quite well and we do not need to switch
between simple mean calculation and incremental calculation based on
overflow. This simplifies the code quite a bit.

Signed-off-by: beorn7
---
 promql/engine.go    | 62 ++++++++++++++----------------------------
 promql/functions.go | 65 ++++++++++++++-------------------------------
 2 files changed, 40 insertions(+), 87 deletions(-)

diff --git a/promql/engine.go b/promql/engine.go
index fec2bbf761..888df311d0 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2981,7 +2981,6 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
-	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }

 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3080,6 +3079,21 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}

 	case parser.AVG:
+		// For the average calculation, we use incremental mean
+		// calculation. In particular in combination with Kahan
+		// summation (which we do for floats, but not yet for
+		// histograms, see issue #14105), this is quite accurate
+		// and only breaks in extreme cases (see testdata for
+		// avg_over_time). One might assume that simple direct
+		// mean calculation works better in some cases, but so
+		// far, our conclusion is that we fare best with the
+		// incremental approach plus Kahan summation (for
+		// floats). For a relevant discussion, see
+		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+		// Additional note: For even better numerical accuracy,
+		// we would need to process the values in a particular
+		// order, but that would be very hard to implement given
+		// how the PromQL engine works.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
@@ -3104,45 +3118,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
-			if !group.incrementalMean {
-				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
-				if !math.IsInf(newV, 0) {
-					// The sum doesn't overflow, so we propagate it to the
-					// group struct and continue with the regular
-					// calculation of the mean value.
-					group.floatValue, group.floatKahanC = newV, newC
-					break
-				}
-				// If we are here, we know that the sum _would_ overflow. So
-				// instead of continue to sum up, we revert to incremental
-				// calculation of the mean value from here on.
-				group.incrementalMean = true
-				group.floatMean = group.floatValue / (group.groupCount - 1)
-				group.floatKahanC /= group.groupCount - 1
-			}
-			if math.IsInf(group.floatMean, 0) {
-				if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
-					// The `floatMean` and `s.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `floatMean` is correct
-					// already.
-					break
-				}
-				if !math.IsInf(f, 0) && !math.IsNaN(f) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					break
-				}
-			}
-			currentMean := group.floatMean + group.floatKahanC
+			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
-				// Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
-				f/group.groupCount-currentMean/group.groupCount,
-				group.floatMean,
-				group.floatKahanC,
+				f/group.groupCount,
+				q*group.floatMean,
+				q*group.floatKahanC,
 			)
 		}

@@ -3215,10 +3195,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			continue
 		case aggr.hasHistogram:
 			aggr.histogramValue = aggr.histogramValue.Compact(0)
-		case aggr.incrementalMean:
-			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		default:
-			aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount
+			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		}

 	case parser.COUNT:
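To make the arithmetic of the new parser.AVG branch concrete, here is a small self-contained sketch. It is not part of the patch: kahanSumInc is a simplified local copy of the promql helper (without its Inf handling), and stepMean is a hypothetical name for the per-sample update. After n samples, it maintains mean_n = (n-1)/n * mean_(n-1) + f/n, scaling the Kahan compensation term c by the same factor q so that mean+c remains a good estimate of the true mean.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc adds inc to sum while maintaining a Neumaier-style
// compensation term c (simplified local copy of the promql helper).
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	if math.Abs(sum) >= math.Abs(inc) {
		c += (sum - t) + inc // low-order bits of inc were lost in t
	} else {
		c += (inc - t) + sum // low-order bits of sum were lost in t
	}
	return t, c
}

// stepMean folds the n-th sample f into the running mean, mirroring the
// new parser.AVG branch: mean_n = (n-1)/n * mean_(n-1) + f/n, with the
// compensation term c scaled by the same factor q.
func stepMean(mean, c, f, n float64) (newMean, newC float64) {
	q := (n - 1) / n
	return kahanSumInc(f/n, q*mean, q*c)
}

func main() {
	var mean, c float64
	for i, f := range []float64{1, 2, 3, 4} {
		mean, c = stepMean(mean, c, f, float64(i+1))
	}
	fmt.Println(mean + c) // 2.5
}

Note that the first sample needs no special-casing: with n = 1 the factor q is 0, which is why the old division-by-zero precautions could go away.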
diff --git a/promql/functions.go b/promql/functions.go
index e4699f3914..7e5205844a 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,6 +671,20 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
+	// For the average calculation, we use incremental mean calculation. In
+	// particular in combination with Kahan summation (which we do for
+	// floats, but not yet for histograms, see issue #14105), this is quite
+	// accurate and only breaks in extreme cases (see testdata). One might
+	// assume that simple direct mean calculation works better in some
+	// cases, but so far, our conclusion is that we fare best with the
+	// incremental approach plus Kahan summation (for floats). For a
+	// relevant discussion, see
+	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+	// Additional note: For even better numerical accuracy, we would need to
+	// process the values in a particular order. For avg_over_time, that
+	// would be more or less feasible, but it would be more expensive, and
+	// it would also be much harder for the avg aggregator, given how the
+	// PromQL engine works.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
@@ -702,52 +716,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var (
-			sum, mean, count, kahanC float64
-			incrementalMean          bool
-		)
-		for _, f := range s.Floats {
-			count++
-			if !incrementalMean {
-				newSum, newC := kahanSumInc(f.F, sum, kahanC)
-				// Perform regular mean calculation as long as
-				// the sum doesn't overflow and (in any case)
-				// for the first iteration (even if we start
-				// with ±Inf) to not run into division-by-zero
-				// problems below.
-				if count == 1 || !math.IsInf(newSum, 0) {
-					sum, kahanC = newSum, newC
-					continue
-				}
-				// Handle overflow by reverting to incremental calculation of the mean value.
-				incrementalMean = true
-				mean = sum / (count - 1)
-				kahanC /= count - 1
-			}
-			if math.IsInf(mean, 0) {
-				if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
-					// The `mean` and `f.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `mean` is correct
-					// already.
-					continue
-				}
-				if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					continue
-				}
-			}
-			correctedMean := mean + kahanC
-			mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC)
+		var mean, kahanC float64
+		for i, f := range s.Floats {
+			count := float64(i + 1)
+			q := float64(i) / count
+			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		if incrementalMean {
-			return mean + kahanC
-		}
-		return (sum + kahanC) / count
+		return mean + kahanC
 	}), nil
 }
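The float path of the rewritten funcAvgOverTime boils down to the same update in a single loop. Here is a runnable sketch under the same assumptions as above (simplified kahanSumInc without the promql helper's Inf handling; avgOverTime is a hypothetical wrapper standing in for the aggrOverTime callback):

package main

import (
	"fmt"
	"math"
)

// Simplified local copy of the promql kahanSumInc helper (Neumaier variant).
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	if math.Abs(sum) >= math.Abs(inc) {
		c += (sum - t) + inc
	} else {
		c += (inc - t) + sum
	}
	return t, c
}

// avgOverTime mirrors the float path of the rewritten funcAvgOverTime:
// one pass, no overflow special cases, Kahan-compensated incremental mean.
func avgOverTime(vals []float64) float64 {
	var mean, c float64
	for i, f := range vals {
		count := float64(i + 1)
		q := float64(i) / count
		mean, c = kahanSumInc(f/count, q*mean, q*c)
	}
	return mean + c
}

func main() {
	// The metric2 samples from the tests added in the next patch.
	fmt.Println(avgOverTime([]float64{
		5.30921651659898, 0.961118537914768, 1.62091361305318,
		0.865089463758091, 0.323055185914577, 0.951811357687154,
	})) // ≈ 1.67186744582113, the value expected by the new tests
}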
From 64431830b8daa58849ffe97a3334512bab43d215 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Thu, 5 Jun 2025 21:52:50 +0200
Subject: [PATCH 2/2] Add more avg_over_time test cases with extreme values

These tests were initially created by @crush-on-anechka.
I modified them slightly.

Signed-off-by: beorn7
---
 promql/promqltest/testdata/functions.test | 73 +++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index 83c49320ed..beb216b93c 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1013,6 +1013,79 @@ eval instant at 1m sum_over_time(metric[2m])
 eval instant at 1m avg_over_time(metric[2m])
   {} 0.5

+# More tests for extreme values.
+clear
+# All positive values with varying magnitudes.
+load 5s
+  metric1 1e10 1e-6 1e-6 1e-6 1e-6 1e-6
+  metric2 5.30921651659898 0.961118537914768 1.62091361305318 0.865089463758091 0.323055185914577 0.951811357687154
+  metric3 1.78264e50 0.76342771 1.9592258 7.69805412 458.90154
+  metric4 0.76342771 1.9592258 7.69805412 1.78264e50 458.90154
+  metric5 1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 458.90154
+
+eval instant at 55s avg_over_time(metric1[1m])
+  {} 1.6666666666666675e+09
+
+eval instant at 55s avg_over_time(metric2[1m])
+  {} 1.67186744582113
+
+eval instant at 55s avg_over_time(metric3[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric4[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric5[1m])
+  {} 3.76333333333333E+219
+
+# Contains negative values; result is dominated by a very large value.
+load 5s
+  metric6 -1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric7 -1.78264E+50 0.76342771 1.9592258 -2.258E+220 7.69805412 -458.90154
+  metric8 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric9 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 1.78264E+215 -458.90154
+  metric10 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+eval instant at 55s avg_over_time(metric6[1m])
+  {} 3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric7[1m])
+  {} -3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric8[1m])
+  {} 3.76330362266667E+219
+
+eval instant at 55s avg_over_time(metric9[1m])
+  {} 3.225714285714286e+219
+
+# Interestingly, before PR #16569, this test failed with a result of
+# 3.225714285714286e+219, so the incremental calculation combined with
+# Kahan summation (in the improved way as introduced by PR #16569)
+# seems to be more accurate than the simple mean calculation with
+# Kahan summation.
+eval instant at 55s avg_over_time(metric10[1m])
+  {} 2.5088888888888888e+219
+
+# Large values of different magnitude, combined with small values. The
+# large values, however, all cancel each other exactly, so that the actual
+# average here is determined by only the small values. Therefore, the correct
+# outcome is -44.848083237000004.
+load 5s
+  metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+# Thus, the result here is very far off! With the different orders of
+# magnitudes involved, even Kahan summation cannot cope.
+# Interestingly, with the simple mean calculation (still including
+# Kahan summation) prior to PR #16569, the result is 0. That's
+# arguably more accurate, but it still shows that Kahan summation
+# doesn't work well in this situation. To solve this properly, we
+# would need to do something like sorting the values (which is hard
+# given how the PromQL engine works). The question is how practically
+# relevant this scenario is.
+eval instant at 55s avg_over_time(metric11[1m])
+  {} -1.881783551706252e+203
+# {} -44.848083237000004 <- This is the correct value.
+
 # Test per-series aggregation on dense samples.
 clear
 load 1ms
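The failure mode of the metric11 test can be reproduced outside the engine. A minimal sketch, repeating the simplified kahanSumInc and avgOverTime from the earlier sketches so that it runs on its own; the exact digits printed may differ from the engine's expected -1.881783551706252e+203, since they depend on the precise evaluation order:

package main

import (
	"fmt"
	"math"
)

// Repeated from the earlier sketches for self-containment.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	if math.Abs(sum) >= math.Abs(inc) {
		c += (sum - t) + inc
	} else {
		c += (inc - t) + sum
	}
	return t, c
}

func avgOverTime(vals []float64) float64 {
	var mean, c float64
	for i, f := range vals {
		count := float64(i + 1)
		q := float64(i) / count
		mean, c = kahanSumInc(f/count, q*mean, q*c)
	}
	return mean + c
}

func main() {
	// The metric11 samples: the huge values cancel exactly, so the true
	// average, -44.848083237, is determined by the small values alone.
	metric11 := []float64{
		-2.258e220, -1.78264e219, 0.76342771, 1.9592258,
		2.3757689e217, -2.3757689e217, 2.258e220, 7.69805412,
		1.78264e219, -458.90154,
	}
	// Prints a value many orders of magnitude away from -44.848083237.
	fmt.Println(avgOverTime(metric11))
}

The compensation term only tracks error from the additions inside kahanSumInc; the rounding introduced by scaling mean and c by q and by dividing f by count is invisible to it, and at magnitude 2.258e220 a single ulp is already on the order of 1e204, which is consistent with a residual around 1e203 instead of the true small-valued mean.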