From 2ad39baa726958d9688ae2094bcedf7002c46c56 Mon Sep 17 00:00:00 2001 From: Jeanette Tan Date: Sat, 22 Apr 2023 03:14:19 +0800 Subject: [PATCH] Treat bucket limit like sample limit and make it fail the whole scrape and return an error Signed-off-by: Jeanette Tan --- config/config.go | 4 +-- docs/configuration/configuration.md | 4 +-- scrape/scrape.go | 40 ++++++++++++++++------------- scrape/scrape_test.go | 7 +++-- scrape/target.go | 9 ++++--- storage/interface.go | 1 - 6 files changed, 37 insertions(+), 28 deletions(-) diff --git a/config/config.go b/config/config.go index 956e1c6c65..31ebb288e3 100644 --- a/config/config.go +++ b/config/config.go @@ -489,8 +489,8 @@ type ScrapeConfig struct { // More than this label value length post metric-relabeling will cause the // scrape to fail. LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"` - // More than this many buckets in a native histogram will cause the histogram - // to be ignored, but it will not make the whole scrape fail. + // More than this many buckets in a native histogram will cause the scrape to + // fail. NativeHistogramBucketLimit uint `yaml:"bucket_limit,omitempty"` // We cannot do proper Go type embedding below as the parser will then parse diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index f51227320d..cfb11e21e4 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -378,8 +378,8 @@ metric_relabel_configs: [ target_limit: | default = 0 ] # Limit on total number of positive and negative buckets allowed in a native -# histogram. If this is exceeded, the histogram will be ignored, but this will -# not make the scrape fail. 0 means no limit. +# histogram. If this is exceeded, the entire scrape will be treated as failed. +# 0 means no limit. [ sample_limit: | default = 0 ] ``` diff --git a/scrape/scrape.go b/scrape/scrape.go index 7c24c1070b..179adbb2a2 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -194,7 +194,7 @@ var ( targetScrapeNativeHistogramBucketLimit = prometheus.NewCounter( prometheus.CounterOpts{ Name: "prometheus_target_scrapes_histogram_exceeded_bucket_limit_total", - Help: "Total number of native histograms rejected due to exceeding the bucket limit.", + Help: "Total number of scrapes that hit the native histograms bucket limit and were rejected.", }, ) ) @@ -1485,11 +1485,10 @@ func (sl *scrapeLoop) getCache() *scrapeCache { } type appendErrors struct { - numOutOfOrder int - numDuplicates int - numOutOfBounds int - numExemplarOutOfOrder int - numHistogramBucketLimit int + numOutOfOrder int + numDuplicates int + numOutOfBounds int + numExemplarOutOfOrder int } func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ts time.Time) (total, added, seriesAdded int, err error) { @@ -1506,6 +1505,7 @@ func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, defTime = timestamp.FromTime(ts) appErrs = appendErrors{} sampleLimitErr error + bucketLimitErr error e exemplar.Exemplar // escapes to heap so hoisted out of loop meta metadata.Metadata metadataChanged bool @@ -1655,7 +1655,7 @@ loop: } else { ref, err = app.Append(ref, lset, t, val) } - sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &appErrs) + sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &bucketLimitErr, &appErrs) if err != nil { if err != storage.ErrNotFound { level.Debug(sl.l).Log("msg", "Unexpected error", "series", string(met), "err", err) @@ -1669,7 +1669,7 @@ loop: sl.cache.trackStaleness(hash, lset) } sl.cache.addRef(met, ref, lset, hash) - if sampleAdded && sampleLimitErr == nil { + if sampleAdded && sampleLimitErr == nil && bucketLimitErr == nil { seriesAdded++ } } @@ -1705,6 +1705,13 @@ loop: // We only want to increment this once per scrape, so this is Inc'd outside the loop. targetScrapeSampleLimit.Inc() } + if bucketLimitErr != nil { + if err == nil { + err = bucketLimitErr // if sample limit is hit, that error takes precedence + } + // We only want to increment this once per scrape, so this is Inc'd outside the loop. + targetScrapeNativeHistogramBucketLimit.Inc() + } if appErrs.numOutOfOrder > 0 { level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order samples", "num_dropped", appErrs.numOutOfOrder) } @@ -1717,9 +1724,6 @@ loop: if appErrs.numExemplarOutOfOrder > 0 { level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order exemplars", "num_dropped", appErrs.numExemplarOutOfOrder) } - if appErrs.numHistogramBucketLimit > 0 { - level.Warn(sl.l).Log("msg", "Error on ingesting native histograms that exceeded bucket limit", "num_dropped", appErrs.numHistogramBucketLimit) - } if err == nil { sl.cache.forEachStale(func(lset labels.Labels) bool { // Series no longer exposed, mark it stale. @@ -1737,8 +1741,8 @@ loop: } // Adds samples to the appender, checking the error, and then returns the # of samples added, -// whether the caller should continue to process more samples, and any sample limit errors. -func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr *error, appErrs *appendErrors) (bool, error) { +// whether the caller should continue to process more samples, and any sample or bucket limit errors. +func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr, bucketLimitErr *error, appErrs *appendErrors) (bool, error) { switch errors.Cause(err) { case nil: if tp == nil && ce != nil { @@ -1762,16 +1766,16 @@ func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err e level.Debug(sl.l).Log("msg", "Out of bounds metric", "series", string(met)) targetScrapeSampleOutOfBounds.Inc() return false, nil - case storage.ErrHistogramBucketLimit: - appErrs.numHistogramBucketLimit++ - level.Debug(sl.l).Log("msg", "Exceeded bucket limit for native histograms", "series", string(met)) - targetScrapeNativeHistogramBucketLimit.Inc() - return false, nil case errSampleLimit: // Keep on parsing output if we hit the limit, so we report the correct // total number of samples scraped. *sampleLimitErr = err return false, nil + case errBucketLimit: + // Keep on parsing output if we hit the limit, so we report the correct + // total number of samples scraped. + *bucketLimitErr = err + return false, nil default: return false, err } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index b5cabea52e..8822ed96de 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -1788,7 +1788,10 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) { now = time.Now() total, added, seriesAdded, err = sl.append(app, msg, "application/vnd.google.protobuf", now) - require.NoError(t, err) + if err != errBucketLimit { + t.Fatalf("Did not see expected histogram bucket limit error: %s", err) + } + require.NoError(t, app.Rollback()) require.Equal(t, 1, total) require.Equal(t, 1, added) require.Equal(t, 0, seriesAdded) @@ -3010,7 +3013,7 @@ func TestReuseCacheRace(*testing.T) { func TestCheckAddError(t *testing.T) { var appErrs appendErrors sl := scrapeLoop{l: log.NewNopLogger()} - sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, &appErrs) + sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, nil, &appErrs) require.Equal(t, 1, appErrs.numOutOfOrder) } diff --git a/scrape/target.go b/scrape/target.go index f916e45490..a655e85413 100644 --- a/scrape/target.go +++ b/scrape/target.go @@ -314,7 +314,10 @@ func (ts Targets) Len() int { return len(ts) } func (ts Targets) Less(i, j int) bool { return ts[i].URL().String() < ts[j].URL().String() } func (ts Targets) Swap(i, j int) { ts[i], ts[j] = ts[j], ts[i] } -var errSampleLimit = errors.New("sample limit exceeded") +var ( + errSampleLimit = errors.New("sample limit exceeded") + errBucketLimit = errors.New("histogram bucket limit exceeded") +) // limitAppender limits the number of total appended samples in a batch. type limitAppender struct { @@ -366,12 +369,12 @@ type bucketLimitAppender struct { func (app *bucketLimitAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) { if h != nil { if len(h.PositiveBuckets)+len(h.NegativeBuckets) > app.limit { - return 0, storage.ErrHistogramBucketLimit + return 0, errBucketLimit } } if fh != nil { if len(fh.PositiveBuckets)+len(fh.NegativeBuckets) > app.limit { - return 0, storage.ErrHistogramBucketLimit + return 0, errBucketLimit } } ref, err := app.Appender.AppendHistogram(ref, lset, t, h, fh) diff --git a/storage/interface.go b/storage/interface.go index 18119f2547..b282f1fc62 100644 --- a/storage/interface.go +++ b/storage/interface.go @@ -46,7 +46,6 @@ var ( ErrHistogramNegativeBucketCount = errors.New("histogram has a bucket whose observation count is negative") ErrHistogramSpanNegativeOffset = errors.New("histogram has a span whose offset is negative") ErrHistogramSpansBucketsMismatch = errors.New("histogram spans specify different number of buckets than provided") - ErrHistogramBucketLimit = errors.New("histogram bucket limit exceeded") ) // SeriesRef is a generic series reference. In prometheus it is either a