diff --git a/promql/engine.go b/promql/engine.go
index 1e01f21db6..3ea663e2f9 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2998,7 +2998,6 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
-	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }

 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3097,6 +3096,21 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}

 	case parser.AVG:
+		// For the average calculation, we use incremental mean
+		// calculation. In particular in combination with Kahan
+		// summation (which we do for floats, but not yet for
+		// histograms, see issue #14105), this is quite accurate
+		// and only breaks in extreme cases (see testdata for
+		// avg_over_time). One might assume that simple direct
+		// mean calculation works better in some cases, but so
+		// far, our conclusion is that we fare best with the
+		// incremental approach plus Kahan summation (for
+		// floats). For a relevant discussion, see
+		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+		// Additional note: For even better numerical accuracy,
+		// we would need to process the values in a particular
+		// order, but that would be very hard to implement given
+		// how the PromQL engine works.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
@@ -3121,45 +3135,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
-			if !group.incrementalMean {
-				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
-				if !math.IsInf(newV, 0) {
-					// The sum doesn't overflow, so we propagate it to the
-					// group struct and continue with the regular
-					// calculation of the mean value.
-					group.floatValue, group.floatKahanC = newV, newC
-					break
-				}
-				// If we are here, we know that the sum _would_ overflow. So
-				// instead of continue to sum up, we revert to incremental
-				// calculation of the mean value from here on.
-				group.incrementalMean = true
-				group.floatMean = group.floatValue / (group.groupCount - 1)
-				group.floatKahanC /= group.groupCount - 1
-			}
-			if math.IsInf(group.floatMean, 0) {
-				if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
-					// The `floatMean` and `s.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `floatMean` is correct
-					// already.
-					break
-				}
-				if !math.IsInf(f, 0) && !math.IsNaN(f) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					break
-				}
-			}
-			currentMean := group.floatMean + group.floatKahanC
+			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
-				// Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
-				f/group.groupCount-currentMean/group.groupCount,
-				group.floatMean,
-				group.floatKahanC,
+				f/group.groupCount,
+				q*group.floatMean,
+				q*group.floatKahanC,
 			)
 		}

@@ -3232,10 +3212,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			continue
 		case aggr.hasHistogram:
 			aggr.histogramValue = aggr.histogramValue.Compact(0)
-		case aggr.incrementalMean:
-			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		default:
-			aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount
+			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		}

 	case parser.COUNT:
diff --git a/promql/functions.go b/promql/functions.go
index e4699f3914..7e5205844a 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,6 +671,20 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
+	// For the average calculation, we use incremental mean calculation. In
+	// particular in combination with Kahan summation (which we do for
+	// floats, but not yet for histograms, see issue #14105), this is quite
+	// accurate and only breaks in extreme cases (see testdata). One might
+	// assume that simple direct mean calculation works better in some
+	// cases, but so far, our conclusion is that we fare best with the
+	// incremental approach plus Kahan summation (for floats). For a
+	// relevant discussion, see
+	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+	// Additional note: For even better numerical accuracy, we would need to
+	// process the values in a particular order. For avg_over_time, that
+	// would be more or less feasible, but it would be more expensive, and
+	// it would also be much harder for the avg aggregator, given how the
+	// PromQL engine works.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
@@ -702,52 +716,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var (
-			sum, mean, count, kahanC float64
-			incrementalMean          bool
-		)
-		for _, f := range s.Floats {
-			count++
-			if !incrementalMean {
-				newSum, newC := kahanSumInc(f.F, sum, kahanC)
-				// Perform regular mean calculation as long as
-				// the sum doesn't overflow and (in any case)
-				// for the first iteration (even if we start
-				// with ±Inf) to not run into division-by-zero
-				// problems below.
-				if count == 1 || !math.IsInf(newSum, 0) {
-					sum, kahanC = newSum, newC
-					continue
-				}
-				// Handle overflow by reverting to incremental calculation of the mean value.
-				incrementalMean = true
-				mean = sum / (count - 1)
-				kahanC /= count - 1
-			}
-			if math.IsInf(mean, 0) {
-				if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
-					// The `mean` and `f.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `mean` is correct
-					// already.
-					continue
-				}
-				if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					continue
-				}
-			}
-			correctedMean := mean + kahanC
-			mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC)
+		var mean, kahanC float64
+		for i, f := range s.Floats {
+			count := float64(i + 1)
+			q := float64(i) / count
+			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		if incrementalMean {
-			return mean + kahanC
-		}
-		return (sum + kahanC) / count
+		return mean + kahanC
 	}), nil
 }

diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index 83c49320ed..beb216b93c 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1013,6 +1013,79 @@ eval instant at 1m sum_over_time(metric[2m])
 eval instant at 1m avg_over_time(metric[2m])
   {} 0.5

+# More tests for extreme values.
+clear
+# All positive values with varying magnitudes.
+load 5s
+  metric1 1e10 1e-6 1e-6 1e-6 1e-6 1e-6
+  metric2 5.30921651659898 0.961118537914768 1.62091361305318 0.865089463758091 0.323055185914577 0.951811357687154
+  metric3 1.78264e50 0.76342771 1.9592258 7.69805412 458.90154
+  metric4 0.76342771 1.9592258 7.69805412 1.78264e50 458.90154
+  metric5 1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 458.90154
+
+eval instant at 55s avg_over_time(metric1[1m])
+  {} 1.6666666666666675e+09
+
+eval instant at 55s avg_over_time(metric2[1m])
+  {} 1.67186744582113
+
+eval instant at 55s avg_over_time(metric3[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric4[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric5[1m])
+  {} 3.76333333333333E+219
+
+# Contains negative values; result is dominated by a very large value.
+load 5s
+  metric6 -1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric7 -1.78264E+50 0.76342771 1.9592258 -2.258E+220 7.69805412 -458.90154
+  metric8 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric9 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 1.78264E+215 -458.90154
+  metric10 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+eval instant at 55s avg_over_time(metric6[1m])
+  {} 3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric7[1m])
+  {} -3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric8[1m])
+  {} 3.76330362266667E+219
+
+eval instant at 55s avg_over_time(metric9[1m])
+  {} 3.225714285714286e+219
+
+# Interestingly, before PR #16569, this test failed with a result of
+# 3.225714285714286e+219, so the incremental calculation combined with
+# Kahan summation (in the improved way as introduced by PR #16569)
+# seems to be more accurate than the simple mean calculation with
+# Kahan summation.
+eval instant at 55s avg_over_time(metric10[1m])
+  {} 2.5088888888888888e+219
+
+# Large values of different magnitude, combined with small values. The
+# large values, however, all cancel each other exactly, so that the actual
+# average here is determined by only the small values. Therefore, the correct
+# outcome is -44.848083237000004.
+load 5s
+  metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+# However, the result here is very far off! With the different orders of
+# magnitude involved, even Kahan summation cannot cope.
+# Interestingly, with the simple mean calculation (still including
+# Kahan summation) prior to PR #16569, the result is 0. That's
+# arguably more accurate, but it still shows that Kahan summation
+# doesn't work well in this situation. To solve this properly, we
+# would need to do something like sorting the values (which is hard
+# given how the PromQL engine works). The question is how practically
+# relevant this scenario is.
+eval instant at 55s avg_over_time(metric11[1m])
+  {} -1.881783551706252e+203
+# {} -44.848083237000004 <- This is the correct value.
+
 # Test per-series aggregation on dense samples.
 clear
 load 1ms
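
The core of the change, in both engine.go and functions.go, is the same identity: after i+1 samples, mean_new = mean_old*(i/(i+1)) + f/(i+1), with the Kahan compensation term rescaled alongside the mean. Below is a minimal, self-contained Go sketch of that technique. It is illustrative only: avg is a stand-in name, and kahanSumInc is written here as a generic Neumaier-style compensated add rather than a copy of the engine's helper, so the low-order bits of the output may differ from the testdata values.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc adds inc to sum while carrying a compensation term c
// (Neumaier-style compensated summation). It plays the role of the
// engine's kahanSumInc helper; the upstream implementation may differ
// in details.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0 // Once the sum overflows, a finite compensation is meaningless.
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// avg mirrors the rewritten loop in funcAvgOverTime: the running value
// always stays near the magnitude of the mean, so it cannot overflow
// for finite inputs even where the plain sum would.
func avg(values []float64) float64 {
	var mean, kahanC float64
	for i, f := range values {
		count := float64(i + 1)
		q := float64(i) / count
		mean, kahanC = kahanSumInc(f/count, q*mean, q*kahanC)
	}
	return mean + kahanC
}

func main() {
	// metric4 from the new testdata: one huge value among small ones.
	// Expected per the testdata: 3.56528E+49.
	fmt.Println(avg([]float64{0.76342771, 1.9592258, 7.69805412, 1.78264e50, 458.90154}))

	// metric11: huge values that cancel exactly, so the true mean is
	// -44.848083237000004. With these spreads of magnitude the result
	// is far off (the testdata records -1.881783551706252e+203).
	fmt.Println(avg([]float64{
		-2.258e220, -1.78264e219, 0.76342771, 1.9592258,
		2.3757689e217, -2.3757689e217, 2.258e220, 7.69805412,
		1.78264e219, -458.90154,
	}))
}

Note how q*mean and q*kahanC rescale the compensation term together with the mean at every step; dividing f by count before adding keeps each operand near the mean's magnitude, which avoids the float64 overflow that the deleted incrementalMean fallback used to guard against.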