From 2b3fc1f115e7c09eb182ced42f8ec6d826b676ee Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 24 Jun 2025 12:41:30 +0200 Subject: [PATCH 1/3] promql: Add test cases for direct mean calculation These demonstrate that direct mean calculation has some merits after all. Signed-off-by: beorn7 --- promql/promqltest/testdata/aggregators.test | 57 +++++++++++++++++++++ promql/promqltest/testdata/functions.test | 39 ++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test index 7062d37f05..0eae8ba61e 100644 --- a/promql/promqltest/testdata/aggregators.test +++ b/promql/promqltest/testdata/aggregators.test @@ -663,6 +663,63 @@ eval instant at 1m avg by (group) (data{test="nan"}) clear +# Demonstrate robustness of direct mean calculation vs. incremental mean calculation. +# The tests below are prone to small inaccuracies with incremental mean calculation. +# The exact number of aggregated values that trigger an inaccuracy depends on the +# hardware. +# See also discussion in https://github.com/prometheus/prometheus/issues/16714 +load 5m + foo{idx="0"} 52 + foo{idx="1"} 52 + foo{idx="2"} 52 + foo{idx="3"} 52 + foo{idx="4"} 52 + foo{idx="5"} 52 + foo{idx="6"} 52 + foo{idx="7"} 52 + foo{idx="8"} 52 + foo{idx="9"} 52 + foo{idx="10"} 52 + foo{idx="11"} 52 + +eval instant at 0 avg(foo) - 52 + {} 0 + +eval instant at 0 avg(topk(11, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(10, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(9, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(8, foo)) - 52 + {} 0 + +# The following tests do not utilize the tolerance built into the +# testing framework but rely on the exact equality implemented in +# PromQL. They currently pass, but we should keep in mind that this is +# not a hard requirement, and generally it is a bad idea in practice +# to rely on exact equality like this in alerting rules etc. + +eval instant at 0 avg(foo) == 52 + {} 52 + +eval instant at 0 avg(topk(11, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(10, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(9, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(8, foo)) == 52 + {} 52 + +clear + # Test that aggregations are deterministic. # Commented because it is flaky in range mode. #load 10s diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test index b99dcd7e0d..a2f04f9dbf 100644 --- a/promql/promqltest/testdata/functions.test +++ b/promql/promqltest/testdata/functions.test @@ -1089,6 +1089,45 @@ load 5s # {} -1.881783551706252e+203 <- This is the result on linux/amd64. # {} 2.303079268822384e+202 <- This is the result on darwin/arm64. +# Demonstrate robustness of direct mean calculation vs. incremental mean calculation. +# The tests below are prone to small inaccuracies with incremental mean calculation. +# The exact number of aggregated values that trigger an inaccuracy depends on the +# hardware. +# See also discussion in https://github.com/prometheus/prometheus/issues/16714 +clear +load 10s + foo 52+0x100 + +eval instant at 10m avg_over_time(foo[100s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[110s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[120s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[130s]) - 52 + {} 0 + +# The following tests do not utilize the tolerance built into the +# testing framework but rely on the exact equality implemented in +# PromQL. 
+# They currently pass, but we should keep in mind that this is
+# not a hard requirement, and generally it is a bad idea in practice
+# to rely on exact equality like this in alerting rules etc.
+
+eval instant at 10m avg_over_time(foo[100s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[110s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[120s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[130s]) == 52
+  {} 52
+
 # Test per-series aggregation on dense samples.
 clear
 load 1ms

From f71daa797786b8a98a82c98f86f71bb709e5c609 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Tue, 24 Jun 2025 14:35:58 +0200
Subject: [PATCH 2/3] promql: Remove falsified comment from test

The test in question actually worked fine even before #16569. The
finding reported in the comment has turned out to be caused by
something else.

Signed-off-by: beorn7
---
 promql/promqltest/testdata/functions.test | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index a2f04f9dbf..23d7f53af6 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1060,11 +1060,6 @@ eval instant at 55s avg_over_time(metric8[1m])
 eval instant at 55s avg_over_time(metric9[1m])
   {} 3.225714285714286e+219
 
-# Interestingly, before PR #16569, this test failed with a result of
-# 3.225714285714286e+219, so the incremental calculation combined with
-# Kahan summation (in the improved way as introduced by PR #16569)
-# seems to be more accurate than the simple mean calculation with
-# Kahan summation.
 eval instant at 55s avg_over_time(metric10[1m])
   {} 2.5088888888888888e+219
 

From ce809e625f8ba5768fa0928cb82cd5d5c3eb0627 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Tue, 24 Jun 2025 15:52:35 +0200
Subject: [PATCH 3/3] promql: Re-introduce direct mean calculation for better
 accuracy

This commit brings back direct mean calculation (for `avg` and
`avg_over_time`) but isn't an outright revert of #16569. It keeps the
improved incremental mean calculation and generally features somewhat
cleaner code than before.

Also, this commit...

- ...updates the lengthy comment explaining the whole situation and
  trade-offs.

- ...divides the running sum and the Kahan compensation term separately
  (in direct mean calculation) to avoid the (unlikely) possibility that
  sum and Kahan compensation together overflow float64.

- ...uncomments the tests that should now work again on darwin/arm64.

- ...uncomments the test that should now reliably yield the
  (inaccurate) value 0 on all hardware platforms. Also, the test
  description has been updated accordingly.

- ...adds avg_over_time tests for zero and one sample in the range.

Signed-off-by: beorn7
---
 promql/engine.go                            | 66 +++++++++++++-----
 promql/functions.go                         | 75 ++++++++++++++-------
 promql/promqltest/testdata/aggregators.test |  6 +-
 promql/promqltest/testdata/functions.test   | 34 +++++-----
 4 files changed, 122 insertions(+), 59 deletions(-)
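Before the diffs themselves, here is a minimal, self-contained Go sketch of the approach the commit message describes: sum the values directly with Kahan (Neumaier) compensation, and fall back to incremental mean calculation once the running sum would overflow float64. This is an editorial illustration, not part of the patch: the name avgOverFloats and the demo inputs are made up, while kahanSumInc is modeled after the helper of the same name in promql/functions.go.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc is modeled after the Prometheus helper of the same name:
// Kahan summation with the Neumaier improvement.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0
	// Neumaier improvement: swap the roles of sum and increment if the
	// increment is larger in magnitude than the running sum.
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// avgOverFloats is an illustrative stand-in for the float path of
// funcAvgOverTime below: direct mean calculation as long as the running
// sum stays finite, incremental mean calculation afterwards. It assumes
// at least one finite sample.
func avgOverFloats(vals []float64) float64 {
	var (
		sum, kahanC, mean, count float64
		incrementalMean          bool
	)
	for _, f := range vals {
		count++
		if !incrementalMean {
			newSum, newC := kahanSumInc(f, sum, kahanC)
			if !math.IsInf(newSum, 0) {
				// Direct path: just keep summing.
				sum, kahanC = newSum, newC
				continue
			}
			// The sum would overflow, so switch to incremental
			// mean calculation, seeded with the mean of the
			// first count-1 values.
			incrementalMean = true
			mean = sum / (count - 1)
			kahanC /= count - 1
		}
		q := (count - 1) / count
		mean, kahanC = kahanSumInc(f/count, q*mean, q*kahanC)
	}
	if incrementalMean {
		return mean + kahanC
	}
	// Divide sum and compensation separately so that their combination
	// cannot overflow (see the commit message above).
	return sum/count + kahanC/count
}

func main() {
	// The direct path is exact here: 4*52 = 208 and 208/4 = 52 are
	// exactly representable in float64.
	fmt.Println(avgOverFloats([]float64{52, 52, 52, 52})) // 52

	// 9e307 + 9e307 overflows float64, so the fallback kicks in;
	// the mean itself is representable.
	fmt.Println(avgOverFloats([]float64{9e307, 9e307, 0}))
}

Under these assumptions, the second call overflows the running sum at the second sample, recovers via the incremental path, and prints a value close to 6e307.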
diff --git a/promql/engine.go b/promql/engine.go
index 0622bcb388..adf49ed96e 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2998,6 +2998,7 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
+	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }
 
 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3096,21 +3097,38 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}
 
 	case parser.AVG:
-		// For the average calculation, we use incremental mean
-		// calculation. In particular in combination with Kahan
-		// summation (which we do for floats, but not yet for
-		// histograms, see issue #14105), this is quite accurate
-		// and only breaks in extreme cases (see testdata for
-		// avg_over_time). One might assume that simple direct
-		// mean calculation works better in some cases, but so
-		// far, our conclusion is that we fare best with the
-		// incremental approach plus Kahan summation (for
-		// floats). For a relevant discussion, see
+		// For the average calculation of histograms, we use
+		// incremental mean calculation without the help of
+		// Kahan summation (but this should change, see
+		// https://github.com/prometheus/prometheus/issues/14105
+		// ). For floats, we improve the accuracy with the help
+		// of Kahan summation. For a while, we assumed that
+		// incremental mean calculation combined with Kahan
+		// summation (see
 		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-		// Additional note: For even better numerical accuracy,
-		// we would need to process the values in a particular
-		// order, but that would be very hard to implement given
-		// how the PromQL engine works.
+		// for inspiration) is generally the preferred solution.
+		// However, it then turned out that direct mean
+		// calculation (still in combination with Kahan
+		// summation) is often more accurate. See discussion in
+		// https://github.com/prometheus/prometheus/issues/16714
+		// . The problem with the direct mean calculation is
+		// that it can overflow float64 for inputs on which the
+		// incremental mean calculation works just fine. Our
+		// current approach is therefore to use direct mean
+		// calculation as long as we do not overflow (or
+		// underflow) the running sum. As soon as that would
+		// happen, we switch to incremental mean calculation.
+		// This seems to work reasonably well, but note that a
+		// deeper understanding would be needed to find out
+		// whether an earlier switch to incremental mean
+		// calculation would be better in terms of accuracy.
+		// Also, we could apply a number of additional measures
+		// to improve the accuracy, like processing the values
+		// in a particular order. For now, we decided that the
+		// current implementation is accurate enough for
+		// practical purposes, especially since changing the
+		// order of summation would be hard, given how the
+		// PromQL engine implements aggregations.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
@@ -3135,6 +3153,22 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
+			if !group.incrementalMean {
+				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
+				if !math.IsInf(newV, 0) {
+					// The sum doesn't overflow, so we propagate it to the
+					// group struct and continue with the regular
+					// calculation of the mean value.
+					group.floatValue, group.floatKahanC = newV, newC
+					break
+				}
+				// If we are here, we know that the sum _would_ overflow. So
+				// instead of continuing to sum up, we revert to incremental
+				// calculation of the mean value from here on.
+				group.incrementalMean = true
+				group.floatMean = group.floatValue / (group.groupCount - 1)
+				group.floatKahanC /= group.groupCount - 1
+			}
 			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
 				f/group.groupCount,
@@ -3212,8 +3246,10 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 				continue
 			case aggr.hasHistogram:
 				aggr.histogramValue = aggr.histogramValue.Compact(0)
-			default:
+			case aggr.incrementalMean:
 				aggr.floatValue = aggr.floatMean + aggr.floatKahanC
+			default:
+				aggr.floatValue = aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount
 			}
 
 		case parser.COUNT:
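A side note on the finalization case added above (aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount): dividing the running sum and the Kahan compensation term separately matters in the (unlikely) corner case where the two terms together would overflow float64 even though each of them is finite, as the commit message points out. A contrived, self-contained sketch (the concrete numbers are invented for the illustration):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical end state of a direct mean calculation over two
	// samples: the running sum sits at the very edge of the float64
	// range and the Kahan compensation term is positive.
	sum, kahanC, count := math.MaxFloat64, 1e300, 2.0

	// Adding the compensation term first overflows to +Inf...
	fmt.Println((sum + kahanC) / count) // +Inf

	// ...while dividing both terms separately stays finite.
	fmt.Println(sum/count + kahanC/count) // finite, ≈ 8.99e+307
}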
diff --git a/promql/functions.go b/promql/functions.go
index 7e3099ec60..fdf042eb8b 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,29 +671,36 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
-	// For the average calculation, we use incremental mean calculation. In
-	// particular in combination with Kahan summation (which we do for
-	// floats, but not yet for histograms, see issue #14105), this is quite
-	// accurate and only breaks in extreme cases (see testdata). One might
-	// assume that simple direct mean calculation works better in some
-	// cases, but so far, our conclusion is that we fare best with the
-	// incremental approach plus Kahan summation (for floats). For a
-	// relevant discussion, see
+	// For the average calculation of histograms, we use incremental mean
+	// calculation without the help of Kahan summation (but this should
+	// change, see https://github.com/prometheus/prometheus/issues/14105 ).
+	// For floats, we improve the accuracy with the help of Kahan summation.
+	// For a while, we assumed that incremental mean calculation combined
+	// with Kahan summation (see
 	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-	// Additional note: For even better numerical accuracy, we would need to
-	// process the values in a particular order. For avg_over_time, that
-	// would be more or less feasible, but it would be more expensive, and
-	// it would also be much harder for the avg aggregator, given how the
-	// PromQL engine works.
+	// for inspiration) is generally the preferred solution. However, it
+	// then turned out that direct mean calculation (still in combination
+	// with Kahan summation) is often more accurate. See discussion in
+	// https://github.com/prometheus/prometheus/issues/16714 . The problem
+	// with the direct mean calculation is that it can overflow float64 for
+	// inputs on which the incremental mean calculation works just fine. Our
+	// current approach is therefore to use direct mean calculation as long
+	// as we do not overflow (or underflow) the running sum. As soon as that
+	// would happen, we switch to incremental mean calculation. This seems
+	// to work reasonably well, but note that a deeper understanding would
+	// be needed to find out whether an earlier switch to incremental mean
+	// calculation would be better in terms of accuracy. Also, we could
+	// apply a number of additional measures to improve the accuracy, like
+	// processing the values in a particular order. For now, we decided that
+	// the current implementation is accurate enough for practical purposes.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
-			count := 1
 			mean := s.Histograms[0].H.Copy()
-			for _, h := range s.Histograms[1:] {
-				count++
-				left := h.H.Copy().Div(float64(count))
-				right := mean.Copy().Div(float64(count))
+			for i, h := range s.Histograms[1:] {
+				count := float64(i + 2)
+				left := h.H.Copy().Div(count)
+				right := mean.Copy().Div(count)
 				toAdd, err := left.Sub(right)
 				if err != nil {
 					return mean, err
@@ -716,13 +723,35 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var mean, kahanC float64
-		for i, f := range s.Floats {
-			count := float64(i + 1)
-			q := float64(i) / count
+		var (
+			// Pre-set the 1st sample to start the loop with the 2nd.
+			sum, count      = s.Floats[0].F, 1.
+			mean, kahanC    float64
+			incrementalMean bool
+		)
+		for i, f := range s.Floats[1:] {
+			count = float64(i + 2)
+			if !incrementalMean {
+				newSum, newC := kahanSumInc(f.F, sum, kahanC)
+				// Perform regular mean calculation as long as
+				// the sum doesn't overflow.
+				if !math.IsInf(newSum, 0) {
+					sum, kahanC = newSum, newC
+					continue
+				}
+				// Handle overflow by reverting to incremental
+				// calculation of the mean value.
+				incrementalMean = true
+				mean = sum / (count - 1)
+				kahanC /= (count - 1)
+			}
+			q := (count - 1) / count
 			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		return mean + kahanC
+		if incrementalMean {
+			return mean + kahanC
+		}
+		return sum/count + kahanC/count
 	}), nil
 }
diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test
index 0eae8ba61e..f77b418edf 100644
--- a/promql/promqltest/testdata/aggregators.test
+++ b/promql/promqltest/testdata/aggregators.test
@@ -593,10 +593,8 @@ eval instant at 1m avg(data{test="big"})
 eval instant at 1m avg(data{test="-big"})
   {} -9.988465674311579e+307
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg(data{test="bigzero"})
-#     {} 0
+eval instant at 1m avg(data{test="bigzero"})
+  {} 0
 
 # If NaN is in the mix, the result is NaN.
 eval instant at 1m avg(data)
diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index 23d7f53af6..037e40923a 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -886,6 +886,14 @@ load 10s
 	metric12 1 2 3 {{schema:0 sum:1 count:1}} {{schema:0 sum:3 count:3}}
 	metric13 {{schema:0 sum:1 count:1}}x5
 
+# No samples in the range.
+eval instant at 55s avg_over_time(metric[10s])
+
+# One sample in the range.
+eval instant at 55s avg_over_time(metric[20s])
+  {} 5
+
+# All samples in the range.
 eval instant at 55s avg_over_time(metric[1m])
   {} 3
 
@@ -1010,10 +1018,8 @@ load 10s
 eval instant at 1m sum_over_time(metric[2m])
   {} 2
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg_over_time(metric[2m])
-#     {} 0.5
+eval instant at 1m avg_over_time(metric[2m])
+  {} 0.5
 
 # More tests for extreme values.
 clear
@@ -1070,19 +1076,13 @@ eval instant at 55s avg_over_time(metric10[1m])
 load 5s
 	metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
 
-# Thus, the result here is very far off! With the different orders of
-# magnitudes involved, even Kahan summation cannot cope.
-# Interestingly, with the simple mean calculation (still including
-# Kahan summation) prior to PR #16569, the result is 0. That's
-# arguably more accurate, but it still shows that Kahan summation
-# doesn't work well in this situation. To solve this properly, we
-# needed to do something like sorting the values (which is hard given
-# how the PromQL engine works). The question is how practically
-# relevant this scenario is.
-# eval instant at 55s avg_over_time(metric11[1m])
-#     {} -44.848083237000004 <- This is the correct value.
-#     {} -1.881783551706252e+203 <- This is the result on linux/amd64.
-#     {} 2.303079268822384e+202 <- This is the result on darwin/arm64.
+# Even Kahan summation cannot cope with values spanning several different
+# orders of magnitude and comes up with a result of zero here.
+# (Note that incremental mean calculation comes up with different but still
+# wrong values that also depend on the hardware architecture used.)
+eval instant at 55s avg_over_time(metric11[1m])
+  {} 0
+# {} -44.848083237000004 <- This would be the correct value.
 
 # Demonstrate robustness of direct mean calculation vs. incremental mean calculation.
 # The tests below are prone to small inaccuracies with incremental mean calculation.
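To observe locally what the repeated-52 tests probed by this file are about, the following self-contained Go sketch compares direct mean calculation against the purely incremental loop that patch 3 removes from funcAvgOverTime above. The helper names directMean and incrementalMean are made up for the illustration; kahanSumInc again follows the Prometheus helper. The direct result is exactly 52 by construction (52*12 = 624 and 624/12 = 52 are exact in float64); whether the incremental result is exact depends, as the test comments state, on the number of values and on the hardware.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc is modeled after the helper of the same name in
// promql/functions.go.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// directMean mirrors the direct calculation: Kahan-sum everything, then
// divide sum and compensation separately. Assumes len(vals) > 0.
func directMean(vals []float64) float64 {
	var sum, c float64
	for _, f := range vals {
		sum, c = kahanSumInc(f, sum, c)
	}
	n := float64(len(vals))
	return sum/n + c/n
}

// incrementalMean mirrors the loop removed in patch 3 above.
func incrementalMean(vals []float64) float64 {
	var mean, c float64
	for i, f := range vals {
		count := float64(i + 1)
		q := float64(i) / count
		mean, c = kahanSumInc(f/count, q*mean, q*c)
	}
	return mean + c
}

func main() {
	vals := make([]float64, 12)
	for i := range vals {
		vals[i] = 52
	}
	fmt.Println(directMean(vals) == 52)      // always true
	fmt.Println(incrementalMean(vals) == 52) // may be false, depending on platform
}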