From 2b3fc1f115e7c09eb182ced42f8ec6d826b676ee Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 24 Jun 2025 12:41:30 +0200 Subject: [PATCH 1/3] promql: Add test cases for direct mean calculation These demonstrate that direct mean calculation has some merits after all. Signed-off-by: beorn7 --- promql/promqltest/testdata/aggregators.test | 57 +++++++++++++++++++++ promql/promqltest/testdata/functions.test | 39 ++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test index 7062d37f05..0eae8ba61e 100644 --- a/promql/promqltest/testdata/aggregators.test +++ b/promql/promqltest/testdata/aggregators.test @@ -663,6 +663,63 @@ eval instant at 1m avg by (group) (data{test="nan"}) clear +# Demonstrate robustness of direct mean calculation vs. incremental mean calculation. +# The tests below are prone to small inaccuracies with incremental mean calculation. +# The exact number of aggregated values that trigger an inaccuracy depends on the +# hardware. +# See also discussion in https://github.com/prometheus/prometheus/issues/16714 +load 5m + foo{idx="0"} 52 + foo{idx="1"} 52 + foo{idx="2"} 52 + foo{idx="3"} 52 + foo{idx="4"} 52 + foo{idx="5"} 52 + foo{idx="6"} 52 + foo{idx="7"} 52 + foo{idx="8"} 52 + foo{idx="9"} 52 + foo{idx="10"} 52 + foo{idx="11"} 52 + +eval instant at 0 avg(foo) - 52 + {} 0 + +eval instant at 0 avg(topk(11, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(10, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(9, foo)) - 52 + {} 0 + +eval instant at 0 avg(topk(8, foo)) - 52 + {} 0 + +# The following tests do not utilize the tolerance built into the +# testing framework but rely on the exact equality implemented in +# PromQL. They currently pass, but we should keep in mind that this is +# not a hard requirement, and generally it is a bad idea in practice +# to rely on exact equality like this in alerting rules etc. + +eval instant at 0 avg(foo) == 52 + {} 52 + +eval instant at 0 avg(topk(11, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(10, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(9, foo)) == 52 + {} 52 + +eval instant at 0 avg(topk(8, foo)) == 52 + {} 52 + +clear + # Test that aggregations are deterministic. # Commented because it is flaky in range mode. #load 10s diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test index b99dcd7e0d..a2f04f9dbf 100644 --- a/promql/promqltest/testdata/functions.test +++ b/promql/promqltest/testdata/functions.test @@ -1089,6 +1089,45 @@ load 5s # {} -1.881783551706252e+203 <- This is the result on linux/amd64. # {} 2.303079268822384e+202 <- This is the result on darwin/arm64. +# Demonstrate robustness of direct mean calculation vs. incremental mean calculation. +# The tests below are prone to small inaccuracies with incremental mean calculation. +# The exact number of aggregated values that trigger an inaccuracy depends on the +# hardware. +# See also discussion in https://github.com/prometheus/prometheus/issues/16714 +clear +load 10s + foo 52+0x100 + +eval instant at 10m avg_over_time(foo[100s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[110s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[120s]) - 52 + {} 0 + +eval instant at 10m avg_over_time(foo[130s]) - 52 + {} 0 + +# The following tests do not utilize the tolerance built into the +# testing framework but rely on the exact equality implemented in +# PromQL. 
+# They currently pass, but we should keep in mind that this is
+# not a hard requirement, and generally it is a bad idea in practice
+# to rely on exact equality like this in alerting rules etc.
+
+eval instant at 10m avg_over_time(foo[100s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[110s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[120s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[130s]) == 52
+  {} 52
+
 # Test per-series aggregation on dense samples.
 clear
 load 1ms

From f71daa797786b8a98a82c98f86f71bb709e5c609 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Tue, 24 Jun 2025 14:35:58 +0200
Subject: [PATCH 2/3] promql: Remove falsified comment from test

The test in question actually worked fine even before #16569. The
finding reported in the comment has turned out to be caused by
something else.

Signed-off-by: beorn7
---
 promql/promqltest/testdata/functions.test | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index a2f04f9dbf..23d7f53af6 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1060,11 +1060,6 @@ eval instant at 55s avg_over_time(metric8[1m])
 eval instant at 55s avg_over_time(metric9[1m])
   {} 3.225714285714286e+219
 
-# Interestingly, before PR #16569, this test failed with a result of
-# 3.225714285714286e+219, so the incremental calculation combined with
-# Kahan summation (in the improved way as introduced by PR #16569)
-# seems to be more accurate than the simple mean calculation with
-# Kahan summation.
 eval instant at 55s avg_over_time(metric10[1m])
   {} 2.5088888888888888e+219
 

From ce809e625f8ba5768fa0928cb82cd5d5c3eb0627 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Tue, 24 Jun 2025 15:52:35 +0200
Subject: [PATCH 3/3] promql: Re-introduce direct mean calculation for better
 accuracy

This commit brings back direct mean calculation (for `avg` and
`avg_over_time`) but isn't an outright revert of #16569. It keeps the
improved incremental mean calculation and generally features somewhat
cleaner code than before.

Also, this commit...

- ...updates the lengthy comment explaining the whole situation and
  trade-offs.

- ...divides the running sum and the Kahan compensation term separately
  (in direct mean calculation) to avoid the (unlikely) possibility that
  sum and Kahan compensation together overflow float64.

- ...uncomments the tests that should now work again on darwin/arm64.

- ...uncomments the test that should now reliably yield the
  (inaccurate) value 0 on all hardware platforms. Also, the test
  description has been updated accordingly.

- ...adds avg_over_time tests for zero and one sample in the range.

Signed-off-by: beorn7
---
 promql/engine.go                            | 66 +++++++++++++-----
 promql/functions.go                         | 75 ++++++++++++++-------
 promql/promqltest/testdata/aggregators.test |  6 +-
 promql/promqltest/testdata/functions.test   | 34 +++++-----
 4 files changed, 122 insertions(+), 59 deletions(-)
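Before the diffs themselves, here is a minimal, self-contained Go sketch of the approach the commit message describes: sum the values directly with Kahan (Neumaier) compensation, and fall back to incremental mean calculation once the running sum would overflow float64. This is an editorial illustration, not part of the patch: the name avgOverFloats and the demo inputs are made up, while kahanSumInc is modeled after the helper of the same name in promql/functions.go.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc is modeled after the Prometheus helper of the same name:
// Kahan summation with the Neumaier improvement.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0
	// Neumaier improvement: swap the roles of sum and increment if the
	// increment is larger in magnitude than the running sum.
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// avgOverFloats is an illustrative stand-in for the float path of
// funcAvgOverTime below: direct mean calculation as long as the running
// sum stays finite, incremental mean calculation afterwards. It assumes
// at least one finite sample.
func avgOverFloats(vals []float64) float64 {
	var (
		sum, kahanC, mean, count float64
		incrementalMean          bool
	)
	for _, f := range vals {
		count++
		if !incrementalMean {
			newSum, newC := kahanSumInc(f, sum, kahanC)
			if !math.IsInf(newSum, 0) {
				// Direct path: just keep summing.
				sum, kahanC = newSum, newC
				continue
			}
			// The sum would overflow, so switch to incremental
			// mean calculation, seeded with the mean of the
			// first count-1 values.
			incrementalMean = true
			mean = sum / (count - 1)
			kahanC /= count - 1
		}
		q := (count - 1) / count
		mean, kahanC = kahanSumInc(f/count, q*mean, q*kahanC)
	}
	if incrementalMean {
		return mean + kahanC
	}
	// Divide sum and compensation separately so that their combination
	// cannot overflow (see the commit message above).
	return sum/count + kahanC/count
}

func main() {
	// The direct path is exact here: 4*52 = 208 and 208/4 = 52 are
	// exactly representable in float64.
	fmt.Println(avgOverFloats([]float64{52, 52, 52, 52})) // 52

	// 9e307 + 9e307 overflows float64, so the fallback kicks in;
	// the mean itself is representable.
	fmt.Println(avgOverFloats([]float64{9e307, 9e307, 0}))
}

Under these assumptions, the second call overflows the running sum at the second sample, recovers via the incremental path, and prints a value close to 6e307.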
diff --git a/promql/engine.go b/promql/engine.go
index 0622bcb388..adf49ed96e 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2998,6 +2998,7 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
+	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }
 
 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3096,21 +3097,38 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}
 
 	case parser.AVG:
-		// For the average calculation, we use incremental mean
-		// calculation. In particular in combination with Kahan
-		// summation (which we do for floats, but not yet for
-		// histograms, see issue #14105), this is quite accurate
-		// and only breaks in extreme cases (see testdata for
-		// avg_over_time). One might assume that simple direct
-		// mean calculation works better in some cases, but so
-		// far, our conclusion is that we fare best with the
-		// incremental approach plus Kahan summation (for
-		// floats). For a relevant discussion, see
+		// For the average calculation of histograms, we use
+		// incremental mean calculation without the help of
+		// Kahan summation (but this should change, see
+		// https://github.com/prometheus/prometheus/issues/14105
+		// ). For floats, we improve the accuracy with the help
+		// of Kahan summation. For a while, we assumed that
+		// incremental mean calculation combined with Kahan
+		// summation (see
 		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-		// Additional note: For even better numerical accuracy,
-		// we would need to process the values in a particular
-		// order, but that would be very hard to implement given
-		// how the PromQL engine works.
+		// for inspiration) is generally the preferred solution.
+		// However, it then turned out that direct mean
+		// calculation (still in combination with Kahan
+		// summation) is often more accurate. See discussion in
+		// https://github.com/prometheus/prometheus/issues/16714
+		// . The problem with the direct mean calculation is
+		// that it can overflow float64 for inputs on which the
+		// incremental mean calculation works just fine. Our
+		// current approach is therefore to use direct mean
+		// calculation as long as we do not overflow (or
+		// underflow) the running sum. As soon as that would
+		// happen, we switch to incremental mean calculation.
+		// This seems to work reasonably well, but note that a
+		// deeper understanding would be needed to find out
+		// whether an earlier switch to incremental mean
+		// calculation would be better in terms of accuracy.
+		// Also, we could apply a number of additional measures
+		// to improve the accuracy, like processing the values
+		// in a particular order. For now, we decided that the
+		// current implementation is accurate enough for
+		// practical purposes, especially since changing the
+		// order of summation would be hard, given how the
+		// PromQL engine implements aggregations.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
@@ -3135,6 +3153,22 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
+			if !group.incrementalMean {
+				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
+				if !math.IsInf(newV, 0) {
+					// The sum doesn't overflow, so we propagate it to the
+					// group struct and continue with the regular
+					// calculation of the mean value.
+					group.floatValue, group.floatKahanC = newV, newC
+					break
+				}
+				// If we are here, we know that the sum _would_ overflow. So
+				// instead of continuing to sum up, we revert to incremental
+				// calculation of the mean value from here on.
+				group.incrementalMean = true
+				group.floatMean = group.floatValue / (group.groupCount - 1)
+				group.floatKahanC /= group.groupCount - 1
+			}
 			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
 				f/group.groupCount,
@@ -3212,8 +3246,10 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 				continue
 			case aggr.hasHistogram:
 				aggr.histogramValue = aggr.histogramValue.Compact(0)
-			default:
+			case aggr.incrementalMean:
 				aggr.floatValue = aggr.floatMean + aggr.floatKahanC
+			default:
+				aggr.floatValue = aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount
 			}
 
 		case parser.COUNT:
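A side note on the finalization case added above (aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount): dividing the running sum and the Kahan compensation term separately matters in the (unlikely) corner case where the two terms together would overflow float64 even though each of them is finite, as the commit message points out. A contrived, self-contained sketch (the concrete numbers are invented for the illustration):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical end state of a direct mean calculation over two
	// samples: the running sum sits at the very edge of the float64
	// range and the Kahan compensation term is positive.
	sum, kahanC, count := math.MaxFloat64, 1e300, 2.0

	// Adding the compensation term first overflows to +Inf...
	fmt.Println((sum + kahanC) / count) // +Inf

	// ...while dividing both terms separately stays finite.
	fmt.Println(sum/count + kahanC/count) // finite, ≈ 8.99e+307
}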
diff --git a/promql/functions.go b/promql/functions.go
index 7e3099ec60..fdf042eb8b 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,29 +671,36 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
-	// For the average calculation, we use incremental mean calculation. In
-	// particular in combination with Kahan summation (which we do for
-	// floats, but not yet for histograms, see issue #14105), this is quite
-	// accurate and only breaks in extreme cases (see testdata). One might
-	// assume that simple direct mean calculation works better in some
-	// cases, but so far, our conclusion is that we fare best with the
-	// incremental approach plus Kahan summation (for floats). For a
-	// relevant discussion, see
+	// For the average calculation of histograms, we use incremental mean
+	// calculation without the help of Kahan summation (but this should
+	// change, see https://github.com/prometheus/prometheus/issues/14105 ).
+	// For floats, we improve the accuracy with the help of Kahan summation.
+	// For a while, we assumed that incremental mean calculation combined
+	// with Kahan summation (see
 	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-	// Additional note: For even better numerical accuracy, we would need to
-	// process the values in a particular order. For avg_over_time, that
-	// would be more or less feasible, but it would be more expensive, and
-	// it would also be much harder for the avg aggregator, given how the
-	// PromQL engine works.
+	// for inspiration) is generally the preferred solution. However, it
+	// then turned out that direct mean calculation (still in combination
+	// with Kahan summation) is often more accurate. See discussion in
+	// https://github.com/prometheus/prometheus/issues/16714 . The problem
+	// with the direct mean calculation is that it can overflow float64 for
+	// inputs on which the incremental mean calculation works just fine. Our
+	// current approach is therefore to use direct mean calculation as long
+	// as we do not overflow (or underflow) the running sum. As soon as that
+	// would happen, we switch to incremental mean calculation. This seems
+	// to work reasonably well, but note that a deeper understanding would
+	// be needed to find out whether an earlier switch to incremental mean
+	// calculation would be better in terms of accuracy. Also, we could
+	// apply a number of additional measures to improve the accuracy, like
+	// processing the values in a particular order. For now, we decided that
+	// the current implementation is accurate enough for practical purposes.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
-			count := 1
 			mean := s.Histograms[0].H.Copy()
-			for _, h := range s.Histograms[1:] {
-				count++
-				left := h.H.Copy().Div(float64(count))
-				right := mean.Copy().Div(float64(count))
+			for i, h := range s.Histograms[1:] {
+				count := float64(i + 2)
+				left := h.H.Copy().Div(count)
+				right := mean.Copy().Div(count)
 				toAdd, err := left.Sub(right)
 				if err != nil {
 					return mean, err
@@ -716,13 +723,35 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var mean, kahanC float64
-		for i, f := range s.Floats {
-			count := float64(i + 1)
-			q := float64(i) / count
+		var (
+			// Pre-set the 1st sample to start the loop with the 2nd.
+			sum, count      = s.Floats[0].F, 1.
+			mean, kahanC    float64
+			incrementalMean bool
+		)
+		for i, f := range s.Floats[1:] {
+			count = float64(i + 2)
+			if !incrementalMean {
+				newSum, newC := kahanSumInc(f.F, sum, kahanC)
+				// Perform regular mean calculation as long as
+				// the sum doesn't overflow.
+				if !math.IsInf(newSum, 0) {
+					sum, kahanC = newSum, newC
+					continue
+				}
+				// Handle overflow by reverting to incremental
+				// calculation of the mean value.
+				incrementalMean = true
+				mean = sum / (count - 1)
+				kahanC /= (count - 1)
+			}
+			q := (count - 1) / count
 			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		return mean + kahanC
+		if incrementalMean {
+			return mean + kahanC
+		}
+		return sum/count + kahanC/count
 	}), nil
 }
diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test
index 0eae8ba61e..f77b418edf 100644
--- a/promql/promqltest/testdata/aggregators.test
+++ b/promql/promqltest/testdata/aggregators.test
@@ -593,10 +593,8 @@ eval instant at 1m avg(data{test="big"})
 eval instant at 1m avg(data{test="-big"})
   {} -9.988465674311579e+307
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg(data{test="bigzero"})
-#     {} 0
+eval instant at 1m avg(data{test="bigzero"})
+  {} 0
 
 # If NaN is in the mix, the result is NaN.
 eval instant at 1m avg(data)
diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index 23d7f53af6..037e40923a 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -886,6 +886,14 @@ load 10s
 	metric12 1 2 3 {{schema:0 sum:1 count:1}} {{schema:0 sum:3 count:3}}
 	metric13 {{schema:0 sum:1 count:1}}x5
 
+# No samples in the range.
+eval instant at 55s avg_over_time(metric[10s])
+
+# One sample in the range.
+eval instant at 55s avg_over_time(metric[20s])
+  {} 5
+
+# All samples in the range.
 eval instant at 55s avg_over_time(metric[1m])
   {} 3
 
@@ -1010,10 +1018,8 @@ load 10s
 eval instant at 1m sum_over_time(metric[2m])
   {} 2
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg_over_time(metric[2m])
-#     {} 0.5
+eval instant at 1m avg_over_time(metric[2m])
+  {} 0.5
 
 # More tests for extreme values.
 clear
@@ -1070,19 +1076,13 @@ eval instant at 55s avg_over_time(metric10[1m])
 load 5s
 	metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
 
-# Thus, the result here is very far off! With the different orders of
-# magnitudes involved, even Kahan summation cannot cope.
-# Interestingly, with the simple mean calculation (still including
-# Kahan summation) prior to PR #16569, the result is 0. That's
-# arguably more accurate, but it still shows that Kahan summation
-# doesn't work well in this situation. To solve this properly, we
-# needed to do something like sorting the values (which is hard given
-# how the PromQL engine works). The question is how practically
-# relevant this scenario is.
-# eval instant at 55s avg_over_time(metric11[1m])
-#     {} -44.848083237000004 <- This is the correct value.
-#     {} -1.881783551706252e+203 <- This is the result on linux/amd64.
-#     {} 2.303079268822384e+202 <- This is the result on darwin/arm64.
+# Even Kahan summation cannot cope with values spanning several different
+# orders of magnitude and comes up with a result of zero here.
+# (Note that incremental mean calculation comes up with different but still
+# wrong values that also depend on the hardware architecture used.)
+eval instant at 55s avg_over_time(metric11[1m])
+  {} 0
+# {} -44.848083237000004 <- This would be the correct value.
 
 # Demonstrate robustness of direct mean calculation vs. incremental mean calculation.
 # The tests below are prone to small inaccuracies with incremental mean calculation.
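To observe locally what the repeated-52 tests probed by this file are about, the following self-contained Go sketch compares direct mean calculation against the purely incremental loop that patch 3 removes from funcAvgOverTime above. The helper names directMean and incrementalMean are made up for the illustration; kahanSumInc again follows the Prometheus helper. The direct result is exactly 52 by construction (52*12 = 624 and 624/12 = 52 are exact in float64); whether the incremental result is exact depends, as the test comments state, on the number of values and on the hardware.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc is modeled after the helper of the same name in
// promql/functions.go.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// directMean mirrors the direct calculation: Kahan-sum everything, then
// divide sum and compensation separately. Assumes len(vals) > 0.
func directMean(vals []float64) float64 {
	var sum, c float64
	for _, f := range vals {
		sum, c = kahanSumInc(f, sum, c)
	}
	n := float64(len(vals))
	return sum/n + c/n
}

// incrementalMean mirrors the loop removed in patch 3 above.
func incrementalMean(vals []float64) float64 {
	var mean, c float64
	for i, f := range vals {
		count := float64(i + 1)
		q := float64(i) / count
		mean, c = kahanSumInc(f/count, q*mean, q*c)
	}
	return mean + c
}

func main() {
	vals := make([]float64, 12)
	for i := range vals {
		vals[i] = 52
	}
	fmt.Println(directMean(vals) == 52)      // always true
	fmt.Println(incrementalMean(vals) == 52) // may be false, depending on platform
}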