From f9f423ec0a1881039f54474315386cce4f70b6dc Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 17 Apr 2020 20:03:26 +0200 Subject: [PATCH 1/8] Ensure queries are closed in API calls Signed-off-by: beorn7 --- web/api/v1/api.go | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 204b2ff38c..34e27ee226 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -317,7 +317,7 @@ func (api *API) options(r *http.Request) apiFuncResult { return apiFuncResult{nil, nil, nil, nil} } -func (api *API) query(r *http.Request) apiFuncResult { +func (api *API) query(r *http.Request) (result apiFuncResult) { ts, err := parseTimeParam(r, "time", api.now()) if err != nil { return apiFuncResult{nil, &apiError{errorBadData, err}, nil, nil} @@ -340,6 +340,14 @@ func (api *API) query(r *http.Request) apiFuncResult { err = errors.Wrapf(err, "invalid parameter 'query'") return apiFuncResult{nil, &apiError{errorBadData, err}, nil, nil} } + // From now on, we must only return with a finalizer in the result (to + // be called by the caller) or call qry.Close ourselves (which is + // required in the case of a panic). + defer func() { + if result.finalizer == nil { + qry.Close() + } + }() ctx = httputil.ContextFromRequest(ctx, r) @@ -361,7 +369,7 @@ func (api *API) query(r *http.Request) apiFuncResult { }, nil, res.Warnings, qry.Close} } -func (api *API) queryRange(r *http.Request) apiFuncResult { +func (api *API) queryRange(r *http.Request) (result apiFuncResult) { start, err := parseTime(r.FormValue("start")) if err != nil { err = errors.Wrapf(err, "invalid parameter 'start'") @@ -412,6 +420,14 @@ func (api *API) queryRange(r *http.Request) apiFuncResult { if err != nil { return apiFuncResult{nil, &apiError{errorBadData, err}, nil, nil} } + // From now on, we must only return with a finalizer in the result (to + // be called by the caller) or call qry.Close ourselves (which is + // required in the case of a panic). + defer func() { + if result.finalizer == nil { + qry.Close() + } + }() ctx = httputil.ContextFromRequest(ctx, r) @@ -464,7 +480,7 @@ func (api *API) labelNames(r *http.Request) apiFuncResult { return apiFuncResult{names, nil, warnings, nil} } -func (api *API) labelValues(r *http.Request) apiFuncResult { +func (api *API) labelValues(r *http.Request) (result apiFuncResult) { ctx := r.Context() name := route.Param(ctx, "name") @@ -475,7 +491,14 @@ func (api *API) labelValues(r *http.Request) apiFuncResult { if err != nil { return apiFuncResult{nil, &apiError{errorExec, err}, nil, nil} } - + // From now on, we must only return with a finalizer in the result (to + // be called by the caller) or call q.Close ourselves (which is required + // in the case of a panic). + defer func() { + if result.finalizer == nil { + q.Close() + } + }() closer := func() { q.Close() } From ed1852ab95c47c7e2bfe90c074c2e75c30458c19 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Fri, 17 Apr 2020 20:51:03 +0200 Subject: [PATCH 2/8] TSDB: Isolation: avoid creating appenderId's without appender (#7135) Prior to this commit we could have situations where we are creating an appenderId but never creating an appender to go with it, therefore blocking the low watermak. Signed-off-by: Julien Pivotto --- tsdb/head.go | 18 +++++++----------- tsdb/head_test.go | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/tsdb/head.go b/tsdb/head.go index b8494f6182..437fb469d3 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -803,8 +803,6 @@ func (h *RangeHead) Meta() BlockMeta { type initAppender struct { app storage.Appender head *Head - - appendID, cleanupAppendIDsBelow uint64 } func (a *initAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) { @@ -812,7 +810,7 @@ func (a *initAppender) Add(lset labels.Labels, t int64, v float64) (uint64, erro return a.app.Add(lset, t, v) } a.head.initTime(t) - a.app = a.head.appender(a.appendID, a.cleanupAppendIDsBelow) + a.app = a.head.appender() return a.app.Add(lset, t, v) } @@ -842,22 +840,20 @@ func (a *initAppender) Rollback() error { func (h *Head) Appender() storage.Appender { h.metrics.activeAppenders.Inc() - appendID := h.iso.newAppendID() - cleanupAppendIDsBelow := h.iso.lowWatermark() - // The head cache might not have a starting point yet. The init appender // picks up the first appended timestamp as the base. if h.MinTime() == math.MaxInt64 { return &initAppender{ - head: h, - appendID: appendID, - cleanupAppendIDsBelow: cleanupAppendIDsBelow, + head: h, } } - return h.appender(appendID, cleanupAppendIDsBelow) + return h.appender() } -func (h *Head) appender(appendID, cleanupAppendIDsBelow uint64) *headAppender { +func (h *Head) appender() *headAppender { + appendID := h.iso.newAppendID() + cleanupAppendIDsBelow := h.iso.lowWatermark() + return &headAppender{ head: h, // Set the minimum valid time to whichever is greater the head min valid time or the compaction window. diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 08c53418de..6e8b5293d1 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -1078,7 +1078,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) { h.initTime(0) - app := h.appender(0, 0) + app := h.appender() lset := labels.FromStrings("a", "1") _, err = app.Add(lset, 2100, 1) testutil.Ok(t, err) @@ -1106,7 +1106,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) { h.initTime(0) - app := h.appender(0, 0) + app := h.appender() lset := labels.FromStrings("a", "1") _, err = app.Add(lset, 2100, 1) testutil.Ok(t, err) @@ -1368,14 +1368,16 @@ func TestMemSeriesIsolation(t *testing.T) { return -1 } - i := 0 + i := 1 for ; i <= 1000; i++ { var app storage.Appender // To initialize bounds. if hb.MinTime() == math.MaxInt64 { - app = &initAppender{head: hb, appendID: uint64(i), cleanupAppendIDsBelow: 0} + app = &initAppender{head: hb} } else { - app = hb.appender(uint64(i), 0) + a := hb.appender() + a.cleanupAppendIDsBelow = 0 + app = a } _, err := app.Add(labels.FromStrings("foo", "bar"), int64(i), float64(i)) @@ -1394,7 +1396,8 @@ func TestMemSeriesIsolation(t *testing.T) { testutil.Equals(t, 999, lastValue(999)) // Cleanup appendIDs below 500. - app := hb.appender(uint64(i), 500) + app := hb.appender() + app.cleanupAppendIDsBelow = 500 _, err = app.Add(labels.FromStrings("foo", "bar"), int64(i), float64(i)) testutil.Ok(t, err) testutil.Ok(t, app.Commit()) @@ -1412,7 +1415,8 @@ func TestMemSeriesIsolation(t *testing.T) { // Cleanup appendIDs below 1000, which means the sample buffer is // the only thing with appendIDs. - app = hb.appender(uint64(i), 1000) + app = hb.appender() + app.cleanupAppendIDsBelow = 1000 _, err = app.Add(labels.FromStrings("foo", "bar"), int64(i), float64(i)) testutil.Ok(t, err) testutil.Ok(t, app.Commit()) @@ -1425,7 +1429,8 @@ func TestMemSeriesIsolation(t *testing.T) { i++ // Cleanup appendIDs below 1001, but with a rollback. - app = hb.appender(uint64(i), 1001) + app = hb.appender() + app.cleanupAppendIDsBelow = 1001 _, err = app.Add(labels.FromStrings("foo", "bar"), int64(i), float64(i)) testutil.Ok(t, err) testutil.Ok(t, app.Rollback()) @@ -1515,6 +1520,22 @@ func TestHeadSeriesChunkRace(t *testing.T) { } } +func TestIsolationWithoutAdd(t *testing.T) { + hb, err := NewHead(nil, nil, nil, 1000, DefaultStripeSize) + testutil.Ok(t, err) + defer hb.Close() + + app := hb.Appender() + testutil.Ok(t, app.Commit()) + + app = hb.Appender() + _, err = app.Add(labels.FromStrings("foo", "baz"), 1, 1) + testutil.Ok(t, err) + testutil.Ok(t, app.Commit()) + + testutil.Equals(t, hb.iso.lastAppendID, hb.iso.lowWatermark(), "High watermark should be equal to the low watermark") +} + func testHeadSeriesChunkRace(t *testing.T) { h, err := NewHead(nil, nil, nil, 30, DefaultStripeSize) testutil.Ok(t, err) From a2fcdeb1ef3dd8da8a596f81b160d32110f72f24 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 16 Apr 2020 20:16:16 +0200 Subject: [PATCH 3/8] Defer finalizer (#7129) Signed-off-by: Julien Pivotto --- web/api/v1/api.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 204b2ff38c..d9f3ea237e 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -249,6 +249,9 @@ func (api *API) Register(r *route.Router) { hf := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { httputil.SetCORS(w, api.CORSOrigin, r) result := f(r) + if result.finalizer != nil { + defer result.finalizer() + } if result.err != nil { api.respondError(w, result.err, result.data) } else if result.data != nil { @@ -256,9 +259,6 @@ func (api *API) Register(r *route.Router) { } else { w.WriteHeader(http.StatusNoContent) } - if result.finalizer != nil { - result.finalizer() - } }) return api.ready(httputil.CompressionHandler{ Handler: hf, From e2c06a88989de527696d3650f44412aeb6a06876 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Mon, 6 Apr 2020 10:05:01 +0200 Subject: [PATCH 4/8] register federation failure metrics (#7081) Closes gh-7080 Signed-off-by: Julian Taylor --- web/federate.go | 4 ++++ web/web.go | 1 + 2 files changed, 5 insertions(+) diff --git a/web/federate.go b/web/federate.go index 93c9aece36..19ad59cdb3 100644 --- a/web/federate.go +++ b/web/federate.go @@ -44,6 +44,10 @@ var ( }) ) +func registerFederationMetrics(r prometheus.Registerer) { + r.MustRegister(federationWarnings, federationErrors) +} + func (h *Handler) federation(w http.ResponseWriter, req *http.Request) { h.mtx.RLock() defer h.mtx.RUnlock() diff --git a/web/web.go b/web/web.go index 21fc02158a..48fc62c8cd 100644 --- a/web/web.go +++ b/web/web.go @@ -139,6 +139,7 @@ func newMetrics(r prometheus.Registerer) *metrics { if r != nil { r.MustRegister(m.requestCounter, m.requestDuration, m.responseSize) + registerFederationMetrics(r) } return m } From 7eedcc708ed348312748ae66cbe5c938937a4dd4 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 16 Apr 2020 01:44:43 +0200 Subject: [PATCH 5/8] promql/parser: Cleanup generatedParserResult accross reuse Reusing the same generatedParserResult ends up in strange panics: See #7131 and #7127. Signed-off-by: Julien Pivotto --- promql/parser/parse.go | 1 + 1 file changed, 1 insertion(+) diff --git a/promql/parser/parse.go b/promql/parser/parse.go index 79424dd98d..6e9bbe23a5 100644 --- a/promql/parser/parse.go +++ b/promql/parser/parse.go @@ -169,6 +169,7 @@ func newParser(input string) *parser { p.injecting = false p.parseErrors = nil + p.generatedParserResult = nil // Clear lexer struct before reusing. p.lex = Lexer{ From 69ac27e1b40858c48284808b9bdd6df3d476b1cc Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 17 Apr 2020 22:40:39 +0200 Subject: [PATCH 6/8] Make `series` method return a finalizer, too Signed-off-by: beorn7 --- web/api/v1/api.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 34e27ee226..a11686a3f9 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -519,7 +519,7 @@ var ( maxTimeFormatted = maxTime.Format(time.RFC3339Nano) ) -func (api *API) series(r *http.Request) apiFuncResult { +func (api *API) series(r *http.Request) (result apiFuncResult) { if err := r.ParseForm(); err != nil { return apiFuncResult{nil, &apiError{errorBadData, errors.Wrapf(err, "error parsing form values")}, nil, nil} } @@ -549,7 +549,17 @@ func (api *API) series(r *http.Request) apiFuncResult { if err != nil { return apiFuncResult{nil, &apiError{errorExec, err}, nil, nil} } - defer q.Close() + // From now on, we must only return with a finalizer in the result (to + // be called by the caller) or call q.Close ourselves (which is required + // in the case of a panic). + defer func() { + if result.finalizer == nil { + q.Close() + } + }() + closer := func() { + q.Close() + } var sets []storage.SeriesSet var warnings storage.Warnings @@ -557,7 +567,7 @@ func (api *API) series(r *http.Request) apiFuncResult { s, wrn, err := q.Select(false, nil, mset...) warnings = append(warnings, wrn...) if err != nil { - return apiFuncResult{nil, &apiError{errorExec, err}, warnings, nil} + return apiFuncResult{nil, &apiError{errorExec, err}, warnings, closer} } sets = append(sets, s) } @@ -568,10 +578,10 @@ func (api *API) series(r *http.Request) apiFuncResult { metrics = append(metrics, set.At().Labels()) } if set.Err() != nil { - return apiFuncResult{nil, &apiError{errorExec, set.Err()}, warnings, nil} + return apiFuncResult{nil, &apiError{errorExec, set.Err()}, warnings, closer} } - return apiFuncResult{metrics, nil, warnings, nil} + return apiFuncResult{metrics, nil, warnings, closer} } func (api *API) dropSeries(r *http.Request) apiFuncResult { From a7b449320d4ad88dd6eca38c84ce1c48d11257bd Mon Sep 17 00:00:00 2001 From: Chris Marchbanks Date: Sat, 18 Apr 2020 06:32:18 -0600 Subject: [PATCH 7/8] Fix updating rule manager never finishing (#7138) Rather than sending a value to the done channel on a group to indicate whether or not to add stale markers to a closing rule group use an explicit boolean. This allows more functions than just run() to read from the done channel and fixes an issue where Eval() could consume the channel during an update, causing run() to never return. Signed-off-by: Chris Marchbanks --- rules/manager.go | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/rules/manager.go b/rules/manager.go index 383d383271..8065b2e392 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -232,7 +232,8 @@ type Group struct { shouldRestore bool - done chan bool + markStale bool + done chan struct{} terminated chan struct{} managerDone chan struct{} @@ -270,7 +271,7 @@ func NewGroup(o GroupOptions) *Group { shouldRestore: o.ShouldRestore, opts: o.Opts, seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), - done: make(chan bool), + done: make(chan struct{}), managerDone: o.done, terminated: make(chan struct{}), logger: log.With(o.Opts.Logger, "group", o.Name), @@ -326,8 +327,8 @@ func (g *Group) run(ctx context.Context) { tick := time.NewTicker(g.interval) defer tick.Stop() - makeStale := func(s bool) { - if !s { + defer func() { + if !g.markStale { return } go func(now time.Time) { @@ -347,7 +348,7 @@ func (g *Group) run(ctx context.Context) { g.cleanupStaleSeries(now) } }(time.Now()) - } + }() iter() if g.shouldRestore { @@ -356,8 +357,7 @@ func (g *Group) run(ctx context.Context) { // we might not have enough data scraped, and recording rules would not // have updated the latest values, on which some alerts might depend. select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return case <-tick.C: missed := (time.Since(evalTimestamp) / g.interval) - 1 @@ -375,13 +375,11 @@ func (g *Group) run(ctx context.Context) { for { select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return default: select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return case <-tick.C: missed := (time.Since(evalTimestamp) / g.interval) - 1 @@ -396,11 +394,6 @@ func (g *Group) run(ctx context.Context) { } } -func (g *Group) stopAndMakeStale() { - g.done <- true - <-g.terminated -} - func (g *Group) stop() { close(g.done) <-g.terminated @@ -943,7 +936,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels wg.Add(len(m.groups)) for n, oldg := range m.groups { go func(n string, g *Group) { - g.stopAndMakeStale() + g.markStale = true + g.stop() if m := g.metrics; m != nil { m.groupInterval.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n) From 18254838fbe25dcc732c950ae05f78ed4db1292c Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Mon, 20 Apr 2020 10:17:21 +0200 Subject: [PATCH 8/8] Release 2.17.2 (#7139) Signed-off-by: Julien Pivotto --- CHANGELOG.md | 8 ++++++++ VERSION | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51fb9048eb..faf9ade792 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 2.17.2 / 2020-04-20 + +* [BUGFIX] Federation: Register federation metrics #7081 +* [BUGFIX] PromQL: Fix panic in parser error handling #7132 +* [BUGFIX] Rules: Fix reloads hanging when deleting a rule group that is being evaluated #7138 +* [BUGFIX] TSDB: Fix a memory leak when prometheus starts with an empty TSDB WAL #7135 +* [BUGFIX] TSDB: Make isolation more robust to panics in web handlers #7129 #7136 + ## 2.17.1 / 2020-03-26 * [BUGFIX] TSDB: Fix query performance regression that increased memory and CPU usage #7051 diff --git a/VERSION b/VERSION index 3f8eb714d0..94dc0ec910 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.17.1 +2.17.2