Mirror of https://github.com/prometheus/prometheus.git (synced 2025-08-06 22:27:17 +02:00)

Merge pull request #16773 from prometheus/beorn7/promql
promql: Re-introduce direct mean calculation

Commit 9e73fb43b3
promql/engine.go

@@ -2998,6 +2998,7 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
+	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }
 
 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3096,21 +3097,38 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}
 
 	case parser.AVG:
-		// For the average calculation, we use incremental mean
-		// calculation. In particular in combination with Kahan
-		// summation (which we do for floats, but not yet for
-		// histograms, see issue #14105), this is quite accurate
-		// and only breaks in extreme cases (see testdata for
-		// avg_over_time). One might assume that simple direct
-		// mean calculation works better in some cases, but so
-		// far, our conclusion is that we fare best with the
-		// incremental approach plus Kahan summation (for
-		// floats). For a relevant discussion, see
-		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-		// Additional note: For even better numerical accuracy,
-		// we would need to process the values in a particular
-		// order, but that would be very hard to implement given
-		// how the PromQL engine works.
+		// For the average calculation of histograms, we use
+		// incremental mean calculation without the help of
+		// Kahan summation (but this should change, see
+		// https://github.com/prometheus/prometheus/issues/14105
+		// ). For floats, we improve the accuracy with the help
+		// of Kahan summation. For a while, we assumed that
+		// incremental mean calculation combined with Kahan
+		// summation (see
+		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+		// for inspiration) is generally the preferred solution.
+		// However, it then turned out that direct mean
+		// calculation (still in combination with Kahan
+		// summation) is often more accurate. See discussion in
+		// https://github.com/prometheus/prometheus/issues/16714
+		// . The problem with the direct mean calculation is
+		// that it can overflow float64 for inputs on which the
+		// incremental mean calculation works just fine. Our
+		// current approach is therefore to use direct mean
+		// calculation as long as we do not overflow (or
+		// underflow) the running sum. Once the latter would
+		// happen, we switch to incremental mean calculation.
+		// This seems to work reasonably well, but note that a
+		// deeper understanding would be needed to find out if
+		// maybe an earlier switch to incremental mean
+		// calculation would be better in terms of accuracy.
+		// Also, we could apply a number of additional means to
+		// improve the accuracy, like processing the values in a
+		// particular order. For now, we decided that the
+		// current implementation is accurate enough for
+		// practical purposes, in particular given that changing
+		// the order of summation would be hard, given how the
+		// PromQL engine implements aggregations.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
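The failure mode described in this comment is easy to reproduce outside of Prometheus. The following minimal Go sketch (an illustration, not code from this commit) contrasts the direct running sum, which overflows float64, with the incremental mean, which keeps every intermediate value near the magnitude of the mean:

package main

import (
	"fmt"
	"math"
)

func main() {
	vals := []float64{0.9 * math.MaxFloat64, 0.9 * math.MaxFloat64}

	// Direct mean: sum first, divide once at the end. The running sum
	// overflows to +Inf, and the final division cannot undo that.
	sum := 0.0
	for _, v := range vals {
		sum += v
	}
	fmt.Println(sum / float64(len(vals))) // +Inf

	// Incremental mean: mean_k = mean_{k-1} + (v_k - mean_{k-1})/k.
	// Every intermediate value stays near the magnitude of the mean.
	mean := 0.0
	for i, v := range vals {
		mean += (v - mean) / float64(i+1)
	}
	fmt.Println(mean) // ≈ 1.6179e+308, i.e. 0.9 * MaxFloat64
}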
@@ -3135,6 +3153,22 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
+			if !group.incrementalMean {
+				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
+				if !math.IsInf(newV, 0) {
+					// The sum doesn't overflow, so we propagate it to the
+					// group struct and continue with the regular
+					// calculation of the mean value.
+					group.floatValue, group.floatKahanC = newV, newC
+					break
+				}
+				// If we are here, we know that the sum _would_ overflow. So
+				// instead of continuing to sum up, we revert to incremental
+				// calculation of the mean value from here on.
+				group.incrementalMean = true
+				group.floatMean = group.floatValue / (group.groupCount - 1)
+				group.floatKahanC /= group.groupCount - 1
+			}
 			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
 				f/group.groupCount,
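kahanSumInc is used throughout this diff but is defined elsewhere in the package. For readers unfamiliar with it, one step of Neumaier-style compensated summation looks roughly like the sketch below (an illustration of the technique, not a verbatim copy of the Prometheus helper): the running compensation c collects the low-order bits that plain float64 addition rounds away, and adding c back at the end recovers them.

package main

import (
	"fmt"
	"math"
)

// kahanSumIncSketch performs one step of Neumaier-style compensated
// summation: it adds inc to sum while accumulating, in c, the low-order
// bits that plain float64 addition rounds away.
func kahanSumIncSketch(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	if math.Abs(sum) >= math.Abs(inc) {
		c += (sum - t) + inc // low-order digits of inc were lost
	} else {
		c += (inc - t) + sum // low-order digits of sum were lost
	}
	return t, c
}

func main() {
	sum, c := 1e16, 0.0
	for i := 0; i < 10; i++ {
		sum, c = kahanSumIncSketch(1.0, sum, c)
	}
	fmt.Println(sum)     // 1e+16: each +1 was rounded away
	fmt.Println(sum + c) // 1.000000000000001e+16: compensation recovers them
}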
@@ -3212,8 +3246,10 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			continue
 		case aggr.hasHistogram:
 			aggr.histogramValue = aggr.histogramValue.Compact(0)
-		default:
+		case aggr.incrementalMean:
 			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
+		default:
+			aggr.floatValue = aggr.floatValue/aggr.groupCount + aggr.floatKahanC/aggr.groupCount
 		}
 
 	case parser.COUNT:
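In the new default branch the result is computed as floatValue/groupCount + floatKahanC/groupCount rather than the algebraically equal (floatValue + floatKahanC)/groupCount. A plausible reason (my reading, not stated in the commit) is that the compensated total sum + c is never materialized as a single float64, which could overflow even though the running sum itself was kept finite. A small sketch of that corner case:

package main

import (
	"fmt"
	"math"
)

func main() {
	// A compensated pair whose parts are individually finite but whose
	// naive total would overflow float64.
	sum := math.MaxFloat64
	c := math.MaxFloat64 / 4
	count := 2.0

	fmt.Println((sum + c) / count)   // +Inf: sum+c overflows before dividing
	fmt.Println(sum/count + c/count) // ≈ 1.1236e+308: finite
}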
promql/functions.go

@@ -671,29 +671,36 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
-	// For the average calculation, we use incremental mean calculation. In
-	// particular in combination with Kahan summation (which we do for
-	// floats, but not yet for histograms, see issue #14105), this is quite
-	// accurate and only breaks in extreme cases (see testdata). One might
-	// assume that simple direct mean calculation works better in some
-	// cases, but so far, our conclusion is that we fare best with the
-	// incremental approach plus Kahan summation (for floats). For a
-	// relevant discussion, see
-	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
-	// Additional note: For even better numerical accuracy, we would need to
-	// process the values in a particular order. For avg_over_time, that
-	// would be more or less feasible, but it would be more expensive, and
-	// it would also be much harder for the avg aggregator, given how the
-	// PromQL engine works.
+	// For the average calculation of histograms, we use incremental mean
+	// calculation without the help of Kahan summation (but this should
+	// change, see https://github.com/prometheus/prometheus/issues/14105 ).
+	// For floats, we improve the accuracy with the help of Kahan summation.
+	// For a while, we assumed that incremental mean calculation combined
+	// with Kahan summation (see
+	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+	// for inspiration) is generally the preferred solution. However, it
+	// then turned out that direct mean calculation (still in combination
+	// with Kahan summation) is often more accurate. See discussion in
+	// https://github.com/prometheus/prometheus/issues/16714 . The problem
+	// with the direct mean calculation is that it can overflow float64 for
+	// inputs on which the incremental mean calculation works just fine. Our
+	// current approach is therefore to use direct mean calculation as long
+	// as we do not overflow (or underflow) the running sum. Once the latter
+	// would happen, we switch to incremental mean calculation. This seems
+	// to work reasonably well, but note that a deeper understanding would
+	// be needed to find out if maybe an earlier switch to incremental mean
+	// calculation would be better in terms of accuracy. Also, we could
+	// apply a number of additional means to improve the accuracy, like
+	// processing the values in a particular order. For now, we decided that
+	// the current implementation is accurate enough for practical purposes.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
-			count := 1
 			mean := s.Histograms[0].H.Copy()
-			for _, h := range s.Histograms[1:] {
-				count++
-				left := h.H.Copy().Div(float64(count))
-				right := mean.Copy().Div(float64(count))
+			for i, h := range s.Histograms[1:] {
+				count := float64(i + 2)
+				left := h.H.Copy().Div(count)
+				right := mean.Copy().Div(count)
 				toAdd, err := left.Sub(right)
 				if err != nil {
 					return mean, err
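The rewritten histogram loop applies the update mean_k = mean_{k-1} + (x_k/k - mean_{k-1}/k), which is the overflow-safe rearrangement of the textbook incremental mean mean_{k-1} + (x_k - mean_{k-1})/k: both operands are scaled down by the count before the subtraction. A float-valued sketch of the same recurrence (my own illustration mirroring the Div/Sub/Add calls, not code from this commit):

package main

import "fmt"

func main() {
	xs := []float64{10, 20, 30, 40}

	mean := xs[0]
	for i, x := range xs[1:] {
		count := float64(i + 2) // the loop starts at the 2nd sample
		// Divide both operands by count *before* subtracting, mirroring
		// left.Sub(right) above: x - mean could overflow where
		// x/count - mean/count cannot.
		mean += x/count - mean/count
	}
	fmt.Println(mean) // 25
}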
@@ -716,13 +723,35 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var mean, kahanC float64
-		for i, f := range s.Floats {
-			count := float64(i + 1)
-			q := float64(i) / count
+		var (
+			// Pre-set the 1st sample to start the loop with the 2nd.
+			sum, count      = s.Floats[0].F, 1.
+			mean, kahanC    float64
+			incrementalMean bool
+		)
+		for i, f := range s.Floats[1:] {
+			count = float64(i + 2)
+			if !incrementalMean {
+				newSum, newC := kahanSumInc(f.F, sum, kahanC)
+				// Perform regular mean calculation as long as
+				// the sum doesn't overflow.
+				if !math.IsInf(newSum, 0) {
+					sum, kahanC = newSum, newC
+					continue
+				}
+				// Handle overflow by reverting to incremental
+				// calculation of the mean value.
+				incrementalMean = true
+				mean = sum / (count - 1)
+				kahanC /= (count - 1)
+			}
+			q := (count - 1) / count
 			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		return mean + kahanC
+		if incrementalMean {
+			return mean + kahanC
+		}
+		return sum/count + kahanC/count
 	}), nil
 }
 
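The incremental branch updates the mean as kahanSumInc(f.F/count, q*mean, q*kahanC) with q = (count-1)/count, which implements the identity mean_k = ((k-1)·mean_{k-1} + x_k)/k = q·mean_{k-1} + x_k/k; scaling both the old mean and its compensation term by q keeps the state consistent without ever forming a large running sum. A bare-bones numeric check of the identity (illustration only, without the Kahan compensation):

package main

import "fmt"

func main() {
	xs := []float64{3, 5, 7, 9}

	var mean float64
	for i, x := range xs {
		count := float64(i + 1)
		q := (count - 1) / count
		// mean_k = q*mean_{k-1} + x_k/count: the running average is
		// rescaled in place, so no large running sum is ever formed.
		mean = q*mean + x/count
	}
	fmt.Println(mean) // 6 (up to float64 rounding), the average of 3, 5, 7, 9
}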
promql/promqltest/testdata/aggregators.test (vendored, 63 changes)

@@ -593,10 +593,8 @@ eval instant at 1m avg(data{test="big"})
 eval instant at 1m avg(data{test="-big"})
   {} -9.988465674311579e+307
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg(data{test="bigzero"})
-#   {} 0
+eval instant at 1m avg(data{test="bigzero"})
+  {} 0
 
 # If NaN is in the mix, the result is NaN.
 eval instant at 1m avg(data)

@@ -663,6 +661,63 @@ eval instant at 1m avg by (group) (data{test="nan"})
 
 clear
 
+# Demonstrate robustness of direct mean calculation vs. incremental mean calculation.
+# The tests below are prone to small inaccuracies with incremental mean calculation.
+# The exact number of aggregated values that trigger an inaccuracy depends on the
+# hardware.
+# See also discussion in https://github.com/prometheus/prometheus/issues/16714
+load 5m
+  foo{idx="0"} 52
+  foo{idx="1"} 52
+  foo{idx="2"} 52
+  foo{idx="3"} 52
+  foo{idx="4"} 52
+  foo{idx="5"} 52
+  foo{idx="6"} 52
+  foo{idx="7"} 52
+  foo{idx="8"} 52
+  foo{idx="9"} 52
+  foo{idx="10"} 52
+  foo{idx="11"} 52
+
+eval instant at 0 avg(foo) - 52
+  {} 0
+
+eval instant at 0 avg(topk(11, foo)) - 52
+  {} 0
+
+eval instant at 0 avg(topk(10, foo)) - 52
+  {} 0
+
+eval instant at 0 avg(topk(9, foo)) - 52
+  {} 0
+
+eval instant at 0 avg(topk(8, foo)) - 52
+  {} 0
+
+# The following tests do not utilize the tolerance built into the
+# testing framework but rely on the exact equality implemented in
+# PromQL. They currently pass, but we should keep in mind that this is
+# not a hard requirement, and generally it is a bad idea in practice
+# to rely on exact equality like this in alerting rules etc.
+
+eval instant at 0 avg(foo) == 52
+  {} 52
+
+eval instant at 0 avg(topk(11, foo)) == 52
+  {} 52
+
+eval instant at 0 avg(topk(10, foo)) == 52
+  {} 52
+
+eval instant at 0 avg(topk(9, foo)) == 52
+  {} 52
+
+eval instant at 0 avg(topk(8, foo)) == 52
+  {} 52
+
+clear
+
 # Test that aggregations are deterministic.
 # Commented because it is flaky in range mode.
 #load 10s
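These tests pin down why direct calculation was reintroduced: averaging twelve copies of 52 involves quotients like 52/12 that are not exactly representable in binary floating point, so the incremental form can end up a few ULPs away from 52, whereas the direct form evaluates 624/12, which is exact. A standalone sketch of the effect (as the test comment notes, the exact series count at which the drift appears is hardware-dependent):

package main

import "fmt"

func main() {
	const n = 12
	const v = 52.0

	// Direct mean: 12 * 52 = 624 is exact in float64, and so is 624/12.
	sum := 0.0
	for i := 0; i < n; i++ {
		sum += v
	}
	direct := sum / n

	// Incremental mean: every step divides by the running count, and
	// quotients like 52/12 pick up rounding error that need not cancel.
	mean := 0.0
	for i := 1; i <= n; i++ {
		count := float64(i)
		q := (count - 1) / count
		mean = q*mean + v/count
	}

	fmt.Println(direct == v) // true
	fmt.Println(mean == v)   // may be false, depending on rounding
}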
promql/promqltest/testdata/functions.test (vendored, 78 changes)

@@ -886,6 +886,14 @@ load 10s
   metric12 1 2 3 {{schema:0 sum:1 count:1}} {{schema:0 sum:3 count:3}}
   metric13 {{schema:0 sum:1 count:1}}x5
 
+# No samples in the range.
+eval instant at 55s avg_over_time(metric[10s])
+
+# One sample in the range.
+eval instant at 55s avg_over_time(metric[20s])
+  {} 5
+
+# All samples in the range.
 eval instant at 55s avg_over_time(metric[1m])
   {} 3
 

@@ -1010,10 +1018,8 @@ load 10s
 eval instant at 1m sum_over_time(metric[2m])
   {} 2
 
-# This test fails on darwin/arm64.
-# Deactivated until issue #16714 is fixed.
-# eval instant at 1m avg_over_time(metric[2m])
-#   {} 0.5
+eval instant at 1m avg_over_time(metric[2m])
+  {} 0.5
 
 # More tests for extreme values.
 clear

@@ -1060,11 +1066,6 @@ eval instant at 55s avg_over_time(metric8[1m])
 eval instant at 55s avg_over_time(metric9[1m])
   {} 3.225714285714286e+219
 
-# Interestingly, before PR #16569, this test failed with a result of
-# 3.225714285714286e+219, so the incremental calculation combined with
-# Kahan summation (in the improved way as introduced by PR #16569)
-# seems to be more accurate than the simple mean calculation with
-# Kahan summation.
 eval instant at 55s avg_over_time(metric10[1m])
   {} 2.5088888888888888e+219
 

@@ -1075,19 +1076,52 @@ eval instant at 55s avg_over_time(metric10[1m])
 load 5s
   metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
 
-# Thus, the result here is very far off! With the different orders of
-# magnitudes involved, even Kahan summation cannot cope.
-# Interestingly, with the simple mean calculation (still including
-# Kahan summation) prior to PR #16569, the result is 0. That's
-# arguably more accurate, but it still shows that Kahan summation
-# doesn't work well in this situation. To solve this properly, we
-# needed to do something like sorting the values (which is hard given
-# how the PromQL engine works). The question is how practically
-# relevant this scenario is.
-# eval instant at 55s avg_over_time(metric11[1m])
-#   {} -44.848083237000004 <- This is the correct value.
-#   {} -1.881783551706252e+203 <- This is the result on linux/amd64.
-#   {} 2.303079268822384e+202 <- This is the result on darwin/arm64.
+# Even Kahan summation cannot cope with values from a number of different
+# orders of magnitude and comes up with a result of zero here.
+# (Note that incremental mean calculation comes up with different but still
+# wrong values that also depend on the used hardware architecture.)
+eval instant at 55s avg_over_time(metric11[1m])
+  {} 0
+# {} -44.848083237000004 <- This would be the correct value.
+
+# Demonstrate robustness of direct mean calculation vs. incremental mean calculation.
+# The tests below are prone to small inaccuracies with incremental mean calculation.
+# The exact number of aggregated values that trigger an inaccuracy depends on the
+# hardware.
+# See also discussion in https://github.com/prometheus/prometheus/issues/16714
+clear
+load 10s
+  foo 52+0x100
+
+eval instant at 10m avg_over_time(foo[100s]) - 52
+  {} 0
+
+eval instant at 10m avg_over_time(foo[110s]) - 52
+  {} 0
+
+eval instant at 10m avg_over_time(foo[120s]) - 52
+  {} 0
+
+eval instant at 10m avg_over_time(foo[130s]) - 52
+  {} 0
+
+# The following tests do not utilize the tolerance built into the
+# testing framework but rely on the exact equality implemented in
+# PromQL. They currently pass, but we should keep in mind that this is
+# not a hard requirement, and generally it is a bad idea in practice
+# to rely on exact equality like this in alerting rules etc.
+
+eval instant at 10m avg_over_time(foo[100s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[110s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[120s]) == 52
+  {} 52
+
+eval instant at 10m avg_over_time(foo[130s]) == 52
+  {} 52
+
 # Test per-series aggregation on dense samples.
 clear
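metric11 marks the limit of compensated summation: its samples span magnitudes from about 1e0 up to 2.258e220, and when the huge values cancel, the rounding remainders they leave behind (on the order of 1e204, the ULP of 2e220) swamp the small samples inside the compensation term as well. The sketch below pushes the metric11 values through the Neumaier-style step from the earlier example; the computed mean lands far from the true value of about -44.848 (the test above records 0):

package main

import (
	"fmt"
	"math"
)

// Neumaier-style compensated summation step (same sketch as above).
func kahanSumIncSketch(inc, sum, c float64) (float64, float64) {
	t := sum + inc
	if math.Abs(sum) >= math.Abs(inc) {
		c += (sum - t) + inc
	} else {
		c += (inc - t) + sum
	}
	return t, c
}

func main() {
	// The metric11 samples from the test above, in load order.
	vals := []float64{
		-2.258e+220, -1.78264e+219, 0.76342771, 1.9592258,
		2.3757689e+217, -2.3757689e+217, 2.258e+220, 7.69805412,
		1.78264e+219, -458.90154,
	}

	var sum, c float64
	for _, v := range vals {
		sum, c = kahanSumIncSketch(v, sum, c)
	}
	n := float64(len(vals))

	// True mean: about -44.848083237. The cancelling 1e220-scale values
	// leave rounding remainders of up to ~1e204 in the compensation
	// term, which in turn wipes out the small samples stored there.
	fmt.Println(sum/n + c/n)
}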