From 7083ae8267ceead89b27621c165230cb0fd239ec Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 10 Jul 2024 10:00:31 +0100 Subject: [PATCH 1/2] [release 2.53] Revert 13583 to stop dropping samples in remote-write catch-up (#14446) * Revert "fix bug that would cause us to endlessly fall behind (#13583)" This reverts commit 0c71230784368da829f1f02d412d181d7a06aee6. (leaving the new test in place) * TSDB: enhance TestRun_AvoidNotifyWhenBehind With code suggested by @cstyan in #14439. * WAL watcher: add back log line showing current segment --------- Signed-off-by: Bryan Boreham --- tsdb/wlog/watcher.go | 21 ++++---- tsdb/wlog/watcher_test.go | 110 +++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 8ebd9249aa..d836e3d868 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -262,6 +262,11 @@ func (w *Watcher) loop() { // Run the watcher, which will tail the WAL until the quit channel is closed // or an error case is hit. func (w *Watcher) Run() error { + _, lastSegment, err := w.firstAndLast() + if err != nil { + return fmt.Errorf("wal.Segments: %w", err) + } + // We want to ensure this is false across iterations since // Run will be called again if there was a failure to read the WAL. w.sendSamples = false @@ -286,20 +291,14 @@ func (w *Watcher) Run() error { return err } - level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment) + level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment, "lastSegment", lastSegment) for !isClosed(w.quit) { w.currentSegmentMetric.Set(float64(currentSegment)) - // Re-check on each iteration in case a new segment was added, - // because watch() will wait for notifications on the last segment. - _, lastSegment, err := w.firstAndLast() - if err != nil { - return fmt.Errorf("wal.Segments: %w", err) - } - tail := currentSegment >= lastSegment - - level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment, "lastSegment", lastSegment) - if err := w.watch(currentSegment, tail); err != nil && !errors.Is(err, ErrIgnorable) { + // On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment. + // On subsequent calls to this function, currentSegment will have been incremented and we should open that segment. + level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment) + if err := w.watch(currentSegment, currentSegment >= lastSegment); err != nil && !errors.Is(err, ErrIgnorable) { return err } diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index ff006cb817..b39a0fa8ab 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -17,6 +17,7 @@ import ( "math/rand" "os" "path" + "runtime" "sync" "testing" "time" @@ -698,11 +699,46 @@ func TestRun_StartupTime(t *testing.T) { } } +func generateWALRecords(w *WL, segment, seriesCount, samplesCount int) error { + enc := record.Encoder{} + for j := 0; j < seriesCount; j++ { + ref := j + (segment * 100) + series := enc.Series([]record.RefSeries{ + { + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", segment)), + }, + }, nil) + if err := w.Log(series); err != nil { + return err + } + + for k := 0; k < samplesCount; k++ { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(segment), + V: float64(segment), + }, + }, nil) + if err := w.Log(sample); err != nil { + return err + } + } + } + return nil +} + func TestRun_AvoidNotifyWhenBehind(t *testing.T) { - const pageSize = 32 * 1024 - const segments = 10 - const seriesCount = 20 - const samplesCount = 300 + if runtime.GOOS == "windows" { // Takes a really long time, perhaps because min sleep time is 15ms. + t.SkipNow() + } + const segmentSize = pageSize // Smallest allowed segment size. + const segmentsToWrite = 5 + const segmentsToRead = segmentsToWrite - 1 + const seriesCount = 10 + const samplesCount = 50 // This test can take longer than intended to finish in cloud CI. readTimeout := 10 * time.Second @@ -715,73 +751,37 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) + w, err := NewSize(nil, nil, wdir, segmentSize, compress) require.NoError(t, err) var wg sync.WaitGroup - // add one segment initially to ensure there's a value > 0 for the last segment id - for i := 0; i < 1; i++ { - for j := 0; j < seriesCount; j++ { - ref := j + (i * 100) - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for k := 0; k < samplesCount; k++ { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - } - } + // Generate one segment initially to ensure that watcher.Run() finds at least one segment on disk. + require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount)) + w.NextSegment() // Force creation of the next segment wg.Add(1) go func() { defer wg.Done() - for i := 1; i < segments; i++ { - for j := 0; j < seriesCount; j++ { - ref := j + (i * 100) - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for k := 0; k < samplesCount; k++ { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - } + for i := 1; i < segmentsToWrite; i++ { + require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount)) + w.NextSegment() } }() wt := newWriteToMock(time.Millisecond) watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false) - watcher.MaxSegment = segments + watcher.MaxSegment = segmentsToRead watcher.setMetrics() startTime := time.Now() err = watcher.Run() wg.Wait() require.Less(t, time.Since(startTime), readTimeout) + + // But samples records shouldn't get dropped + retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() > 0 + }) + require.Equal(t, segmentsToRead*seriesCount*samplesCount, wt.samplesAppended) + require.NoError(t, err) require.NoError(t, w.Close()) }) From 14cfec3f6048b735e08c1e9c64c8d4211d32bab4 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 10 Jul 2024 11:14:07 +0100 Subject: [PATCH 2/2] Prepare release 2.53.1 (#14452) Co-authored-by: George Krajcsovits Signed-off-by: Bryan Boreham --- CHANGELOG.md | 8 ++++++++ VERSION | 2 +- web/ui/module/codemirror-promql/package.json | 4 ++-- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 14 +++++++------- web/ui/package.json | 2 +- web/ui/react-app/package.json | 4 ++-- 7 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e47999934..d5a91e9009 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ ## unreleased +## 2.53.1 / 2024-07-10 + +Fix a bug which would drop samples in remote-write if the sending flow stalled +for longer than it takes to write one "WAL segment". How long this takes depends on the size +of your Prometheus; as a rough guide with 10 million series it is about 2-3 minutes. + +* [BUGFIX] Remote-write: stop dropping samples in catch-up #14446 + ## 2.53.0 / 2024-06-16 This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 75. diff --git a/VERSION b/VERSION index 261d95596f..f419e2c6f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.53.0 +2.53.1 diff --git a/web/ui/module/codemirror-promql/package.json b/web/ui/module/codemirror-promql/package.json index 519c333653..c9efe34913 100644 --- a/web/ui/module/codemirror-promql/package.json +++ b/web/ui/module/codemirror-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0", + "version": "0.53.1", "description": "a CodeMirror mode for the PromQL language", "types": "dist/esm/index.d.ts", "module": "dist/esm/index.js", @@ -29,7 +29,7 @@ }, "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0", + "@prometheus-io/lezer-promql": "0.53.1", "lru-cache": "^7.18.3" }, "devDependencies": { diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index 5a3b0055b7..b6147ff11f 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0", + "version": "0.53.1", "description": "lezer-based PromQL grammar", "main": "dist/index.cjs", "type": "module", diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index c8135d5e20..8a473e327f 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -1,12 +1,12 @@ { "name": "prometheus-io", - "version": "0.53.0", + "version": "0.53.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "prometheus-io", - "version": "0.53.0", + "version": "0.53.1", "workspaces": [ "react-app", "module/*" @@ -30,10 +30,10 @@ }, "module/codemirror-promql": { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0", + "version": "0.53.1", "license": "Apache-2.0", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0", + "@prometheus-io/lezer-promql": "0.53.1", "lru-cache": "^7.18.3" }, "devDependencies": { @@ -69,7 +69,7 @@ }, "module/lezer-promql": { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0", + "version": "0.53.1", "license": "Apache-2.0", "devDependencies": { "@lezer/generator": "^1.5.1", @@ -19233,7 +19233,7 @@ }, "react-app": { "name": "@prometheus-io/app", - "version": "0.53.0", + "version": "0.53.1", "dependencies": { "@codemirror/autocomplete": "^6.11.1", "@codemirror/commands": "^6.3.2", @@ -19251,7 +19251,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0", + "@prometheus-io/codemirror-promql": "0.53.1", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", diff --git a/web/ui/package.json b/web/ui/package.json index 8b924737d6..d59e294e2a 100644 --- a/web/ui/package.json +++ b/web/ui/package.json @@ -28,5 +28,5 @@ "ts-jest": "^29.1.1", "typescript": "^4.9.5" }, - "version": "0.53.0" + "version": "0.53.1" } diff --git a/web/ui/react-app/package.json b/web/ui/react-app/package.json index 7d9518e8d4..90381cba57 100644 --- a/web/ui/react-app/package.json +++ b/web/ui/react-app/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/app", - "version": "0.53.0", + "version": "0.53.1", "private": true, "dependencies": { "@codemirror/autocomplete": "^6.11.1", @@ -19,7 +19,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0", + "@prometheus-io/codemirror-promql": "0.53.1", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2",