From 6518941d73ad60be8fcf812df6ba0e975dd0450e Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 6 Feb 2025 14:54:03 +0100 Subject: [PATCH 1/2] promql: Expose problem with histogram counter resets in subquery Signed-off-by: beorn7 --- promql/promqltest/testdata/subquery.test | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/promql/promqltest/testdata/subquery.test b/promql/promqltest/testdata/subquery.test index 5a5e4e0092..377ee1e5ce 100644 --- a/promql/promqltest/testdata/subquery.test +++ b/promql/promqltest/testdata/subquery.test @@ -134,3 +134,19 @@ eval instant at 20s min_over_time(metric_total[15s:10s]) eval instant at 20m min_over_time(rate(metric_total[5m])[20m:1m]) {} 0.12119047619047618 +clear + +# This native histogram series has a counter reset at 5m. +# TODO(beorn7): Write this more nicely once https://github.com/prometheus/prometheus/issues/15984 is fixed. +load 1m + native_histogram {{sum:100 count:100}} {{sum:103 count:103}} {{sum:106 count:106}} {{sum:109 count:109}} {{sum:112 count:112}} {{sum:3 count:3 counter_reset_hint:reset}} {{sum:6 count:6}}+{{sum:3 count:3}}x5 + +# This sub-query has to detect the counter reset even if it does +# not include the exact sample where the counter reset has happened. +eval instant at 10m increase(native_histogram[10m:3m]) + {} {{count:25 sum:25}} +# This sub-query must not detect the counter reset multiple times +# even though the sample where the counter reset happened is returned +# by the sub-query multiple times. +eval instant at 10m increase(native_histogram[10m:15s]) + {} {{count:30.769230769230766 sum:30.769230769230766}} From 14bb63c467963b5923aa01c565ddaffb3058d3b2 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 6 Feb 2025 14:54:37 +0100 Subject: [PATCH 2/2] promql: Fix counter reset detection in subqueries with histograms Signed-off-by: beorn7 --- promql/engine.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/promql/engine.go b/promql/engine.go index 3e14a2529d..579fcd1c06 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -1548,6 +1548,28 @@ func (ev *evaluator) evalSubquery(ctx context.Context, subq *parser.SubqueryExpr VectorSelector: vs, } for _, s := range mat { + // Set any "NotCounterReset" and "CounterReset" hints in native + // histograms to "UnknownCounterReset" because we might + // otherwise miss a counter reset happening in samples not + // returned by the subquery, or we might over-detect counter + // resets if the sample with a counter reset is returned + // multiple times by a high-res subquery. This intentionally + // does not attempt to be clever (like detecting if we are + // really missing underlying samples or returning underlying + // samples multiple times) because subqueries on counters are + // inherently problematic WRT counter reset handling, so we + // cannot really solve the problem for good. We only want to + // avoid problems that happen due to the explicitly set counter + // reset hints and go back to the behavior we already know from + // float samples. + for i, hp := range s.Histograms { + switch hp.H.CounterResetHint { + case histogram.NotCounterReset, histogram.CounterReset: + h := *hp.H // Shallow copy is sufficient, we only change CounterResetHint. + h.CounterResetHint = histogram.UnknownCounterReset + s.Histograms[i].H = &h + } + } vs.Series = append(vs.Series, NewStorageSeries(s)) } return ms, mat.TotalSamples(), ws