promql: Prevent extrapolation below zero for histogram count

This deals with the count field of native histograms in the same way as with simple float counters. It then scales the whole histogram with the same factor as it has scaled the count. This will still allow individual buckets to get extrapolated below zero, but maybe that is fine. This implements approach (2) as described in https://github.com/prometheus/prometheus/issues/15976#issuecomment-3032095158 Signed-off-by: beorn7 <beorn@grafana.com>
2025-08-06 06:07:11 +02:00 · 2025-07-03 16:47:37 +02:00 · 2025-07-03 16:47:37 +02:00 · bcf7a822a0
commit bcf7a822a0
parent c565e95808
2 changed files with 27 additions and 14 deletions
--- a/promql/functions.go
+++ b/promql/functions.go
@ -144,32 +144,37 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
 	// (which is our guess for where the series actually starts or ends).

 	extrapolationThreshold := averageDurationBetweenSamples * 1.1
-	extrapolateToInterval := sampledInterval
-
 	if durationToStart >= extrapolationThreshold {
 		durationToStart = averageDurationBetweenSamples / 2
 	}
-	if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
+	if isCounter {
 		// Counters cannot be negative. If we have any slope at all
 		// (i.e. resultFloat went up), we can extrapolate the zero point
 		// of the counter. If the duration to the zero point is shorter
 		// than the durationToStart, we take the zero point as the start
 		// of the series, thereby avoiding extrapolation to negative
 		// counter values.
-		// TODO(beorn7): Do this for histograms, too.
-		durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
+		durationToZero := durationToStart
+		if resultFloat > 0 &&
+			len(samples.Floats) > 0 &&
+			samples.Floats[0].F >= 0 {
+			durationToZero = sampledInterval * (samples.Floats[0].F / resultFloat)
+		} else if resultHistogram != nil &&
+			resultHistogram.Count > 0 &&
+			len(samples.Histograms) > 0 &&
+			samples.Histograms[0].H.Count >= 0 {
+			durationToZero = sampledInterval * (samples.Histograms[0].H.Count / resultHistogram.Count)
+		}
 		if durationToZero < durationToStart {
 			durationToStart = durationToZero
 		}
 	}
-	extrapolateToInterval += durationToStart

 	if durationToEnd >= extrapolationThreshold {
 		durationToEnd = averageDurationBetweenSamples / 2
 	}
-	extrapolateToInterval += durationToEnd

-	factor := extrapolateToInterval / sampledInterval
+	factor := (sampledInterval + durationToStart + durationToEnd) / sampledInterval
 	if isRate {
 		factor /= ms.Range.Seconds()
 	}
--- a/promql/promqltest/testdata/native_histograms.test
+++ b/promql/promqltest/testdata/native_histograms.test
@ -1041,11 +1041,15 @@ eval_warn instant at 1m rate(some_metric[1m30s])
 eval_warn instant at 1m30s rate(some_metric[1m30s])
    # Should produce no results.

-# Start with custom, end with exponential. Return the exponential histogram divided by 30.
+# Start with custom, end with exponential. Return the exponential histogram divided by 48.
+# (The 1st sample is the NHCB with count:1. It is mostly ignored with the exception of the
+# count, which means the rate calculation extrapolates until the count hits 0.)
 eval instant at 1m rate(some_metric[1m])
-    {} {{schema:0 sum:0.16666666666666666 count:0.13333333333333333 buckets:[0.03333333333333333 0.06666666666666667 0.03333333333333333]}}
+    {} {{count:0.08333333333333333 sum:0.10416666666666666 counter_reset_hint:gauge buckets:[0.020833333333333332 0.041666666666666664 0.020833333333333332]}}

 # Start with exponential, end with custom. Return the custom buckets histogram divided by 30.
+# (With the 2nd sample having a count of 1, the extrapolation to zero lands exactly at the
+# left boundary of the range, so no extrapolation limitation needed.)
 eval instant at 30s rate(some_metric[1m])
    {} {{schema:-53 sum:0.03333333333333333 count:0.03333333333333333 custom_values:[5 10] buckets:[0.03333333333333333]}}

@ -1376,21 +1380,25 @@ eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan)

 clear

-# Tests to demonstrate how an extrapolation below zero is prevented for a float counter, but not for native histograms.
-# I.e. the float counter that behaves the same as the histogram count might yield a different result after `increase`.
+# Tests to demonstrate how an extrapolation below zero is prevented for both float counters and native counter histograms.
+# Note that the float counter behaves the same as the histogram count after `increase`.

 load 1m
  metric{type="histogram"} {{schema:0 count:15 sum:25 buckets:[5 10]}} {{schema:0 count:2490 sum:75 buckets:[15 2475]}}x55
  metric{type="counter"} 15 2490x55

 # End of range coincides with sample. Zero point of count is reached within the range.
+# Note that the 2nd bucket has an exaggerated increase of 2479.939393939394 (although
+# it has a value of only 2475 at the end of the range).
 eval instant at 55m increase(metric[90m])
-    {type="histogram"} {{count:2497.5 sum:50.45454545454545 counter_reset_hint:gauge buckets:[10.09090909090909 2487.409090909091]}}
+    {type="histogram"} {{count:2490 sum:50.303030303030305 counter_reset_hint:gauge buckets:[10.06060606060606 2479.939393939394]}}
    {type="counter"} 2490

 # End of range does not coincide with sample. Zero point of count is reached within the range.
+# The 2nd bucket again has an exaggerated increase, but it is less obvious because of the
+# right-side extrapolation.
 eval instant at 54m30s increase(metric[90m])
-    {type="histogram"} {{count:2520.8333333333335 sum:50.92592592592593 counter_reset_hint:gauge buckets:[10.185185185185187 2510.6481481481483]}}
+    {type="histogram"} {{count:2512.9166666666665 sum:50.76599326599326 counter_reset_hint:gauge buckets:[10.153198653198652 2502.7634680134674]}}
    {type="counter"} 2512.9166666666665
    
 # End of range coincides with sample. Zero point of count is reached outside of (i.e. before) the range.