diff --git a/promql/quantile.go b/promql/quantile.go
index a4a6f136b4..4250ec3881 100644
--- a/promql/quantile.go
+++ b/promql/quantile.go
@@ -85,6 +85,8 @@ func bucketQuantile(q model.SampleValue, buckets buckets) float64 {
 		return math.NaN()
 	}
 
+	ensureMonotonic(buckets)
+
 	rank := q * buckets[len(buckets)-1].count
 	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
 
@@ -107,6 +109,51 @@ func bucketQuantile(q model.SampleValue, buckets buckets) float64 {
 	return bucketStart + (bucketEnd-bucketStart)*float64(rank/count)
 }
 
+// The assumption that bucket counts increase monotonically with increasing
+// upperBound may be violated during:
+//
+//   * Recording rule evaluation of histogram_quantile, especially when rate()
+//     has been applied to the underlying bucket timeseries.
+//   * Evaluation of histogram_quantile computed over federated bucket
+//     timeseries, especially when rate() has been applied.
+//
+// This is because scraped data is not made available to rule evaluation or
+// federation atomically, so some buckets are computed with data from the
+// most recent scrapes, but the other buckets are missing data from the most
+// recent scrape.
+//
+// Monotonicity is usually guaranteed because if a bucket with upper bound
+// u1 has count c1, then any bucket with a higher upper bound u > u1 must
+// have counted all c1 observations and perhaps more, so that c >= c1.
+//
+// Randomly interspersed partial sampling breaks that guarantee, and rate()
+// exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
+// 4 samples but the bucket with le=2000 has a count of 7 from 3 samples. The
+// monotonicity is broken. It is exacerbated by rate() because under normal
+// operation, cumulative counting of buckets will cause the bucket counts to
+// diverge such that small differences from missing samples are not a problem.
+// rate() removes this divergence.
+//
+// bucketQuantile depends on that monotonicity to do a binary search for the
+// bucket with the φ-quantile count, so breaking the monotonicity
+// guarantee causes bucketQuantile() to return undefined (nonsense) results.
+//
+// As a somewhat hacky solution until ingestion is atomic per scrape, we
+// calculate the "envelope" of the histogram buckets, essentially removing
+// any decreases in the count between successive buckets.
+
+func ensureMonotonic(buckets buckets) {
+	max := buckets[0].count
+	for i := range buckets[1:] {
+		switch {
+		case buckets[i+1].count > max:
+			max = buckets[i+1].count
+		case buckets[i+1].count < max:
+			buckets[i+1].count = max
+		}
+	}
+}
+
 // qauntile calculates the given quantile of a vector of samples.
 //
 // The vector will be sorted.
diff --git a/promql/testdata/histograms.test b/promql/testdata/histograms.test
index 2478c34846..b1e76ab63e 100644
--- a/promql/testdata/histograms.test
+++ b/promql/testdata/histograms.test
@@ -139,3 +139,21 @@ eval instant at 50m histogram_quantile(0.5, rate(request_duration_seconds_bucket
 	{instance="ins2", job="job1"} 0.13333333333333333
 	{instance="ins1", job="job2"} 0.1
 	{instance="ins2", job="job2"} 0.11666666666666667
+
+# A histogram with nonmonotonic bucket counts. This may happen when recording
+# rule evaluation or federation races scrape ingestion, causing some bucket
+# counts to be derived from fewer samples. The wrong answer we want to avoid
+# is for histogram_quantile(0.99, nonmonotonic_bucket) to return ~1000 instead
+# of 1.
+
+load 5m
+	nonmonotonic_bucket{le="0.1"}   0+1x10
+	nonmonotonic_bucket{le="1"}     0+9x10
+	nonmonotonic_bucket{le="10"}    0+8x10
+	nonmonotonic_bucket{le="100"}   0+8x10
+	nonmonotonic_bucket{le="1000"}  0+9x10
+	nonmonotonic_bucket{le="+Inf"}  0+9x10
+
+# Nonmonotonic buckets
+eval instant at 50m histogram_quantile(0.99, nonmonotonic_bucket)
+	{} 0.989875
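For reference, the expected test value can be reproduced outside Prometheus. The standalone sketch below uses a simplified, hypothetical bucket struct and a stripped-down version of the interpolation (not the exact types or edge-case handling from promql/quantile.go). It applies the same envelope repair to the nonmonotonic_bucket counts at t=50m (10, 90, 80, 80, 90, 90) and computes the 0.99 quantile; without the repair, the binary search lands in the (100, 1000] bucket and the result comes out near 1000 instead of ~0.99.

// Standalone illustration (simplified types, not the patch itself): apply the
// envelope repair to the nonmonotonic_bucket counts at t=50m and compute the
// 0.99 quantile the same way bucketQuantile does.
package main

import (
	"fmt"
	"math"
	"sort"
)

type bucket struct {
	upperBound float64
	count      float64
}

// ensureMonotonic clamps each count to the running maximum seen so far,
// i.e. it computes the non-decreasing "envelope" of the bucket counts.
func ensureMonotonic(buckets []bucket) {
	max := buckets[0].count
	for i := range buckets[1:] {
		switch {
		case buckets[i+1].count > max:
			max = buckets[i+1].count
		case buckets[i+1].count < max:
			buckets[i+1].count = max
		}
	}
}

// quantileFromBuckets mirrors the interpolation in bucketQuantile, minus the
// edge-case handling (NaN inputs, highest-bucket results, etc.).
func quantileFromBuckets(q float64, buckets []bucket) float64 {
	ensureMonotonic(buckets)
	rank := q * buckets[len(buckets)-1].count
	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })

	var bucketStart float64
	bucketEnd := buckets[b].upperBound
	count := buckets[b].count
	if b > 0 {
		bucketStart = buckets[b-1].upperBound
		count -= buckets[b-1].count
		rank -= buckets[b-1].count
	}
	return bucketStart + (bucketEnd-bucketStart)*(rank/count)
}

func main() {
	// Raw counts from the test data at 50m: 10, 90, 80, 80, 90, 90 - the
	// le="10" and le="100" buckets lag behind the le="1" bucket.
	bs := []bucket{
		{0.1, 10}, {1, 90}, {10, 80}, {100, 80}, {1000, 90}, {math.Inf(1), 90},
	}
	fmt.Println(quantileFromBuckets(0.99, bs)) // ~0.989875, matching the test
}

The envelope only ever raises counts, never lowers them, so after the repair the counts satisfy the non-decreasing precondition that sort.Search relies on.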