From 8397c7bc48bad75a7ab34adb1989dde46fa2952a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 6 Jun 2024 09:56:36 +0200 Subject: [PATCH 01/26] Version bump to v2.53.0-rc.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 ++ VERSION | 2 +- web/ui/module/codemirror-promql/package.json | 4 ++-- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 14 +++++++------- web/ui/package.json | 2 +- web/ui/react-app/package.json | 4 ++-- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c2c9ae31c..511fa07468 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## unreleased +## 2.53.0-rc.0 / 2024-06-06 + This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 50. * [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 diff --git a/VERSION b/VERSION index e7a1fa2a8c..ae392bf33c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.52.1 +2.53.0-rc.0 diff --git a/web/ui/module/codemirror-promql/package.json b/web/ui/module/codemirror-promql/package.json index f3f48c95e5..152abc8c7b 100644 --- a/web/ui/module/codemirror-promql/package.json +++ b/web/ui/module/codemirror-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/codemirror-promql", - "version": "0.52.1", + "version": "0.53.0-rc.0", "description": "a CodeMirror mode for the PromQL language", "types": "dist/esm/index.d.ts", "module": "dist/esm/index.js", @@ -29,7 +29,7 @@ }, "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md", "dependencies": { - "@prometheus-io/lezer-promql": "0.52.1", + "@prometheus-io/lezer-promql": "0.53.0-rc.0", "lru-cache": "^7.18.3" }, "devDependencies": { diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index fa3be21d73..93486b8dec 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/lezer-promql", - "version": "0.52.1", + "version": "0.53.0-rc.0", "description": "lezer-based PromQL grammar", "main": "dist/index.cjs", "type": "module", diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index 139a24fc6e..d002109ddd 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -1,12 +1,12 @@ { "name": "prometheus-io", - "version": "0.52.1", + "version": "0.53.0-rc.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "prometheus-io", - "version": "0.52.1", + "version": "0.53.0-rc.0", "workspaces": [ "react-app", "module/*" @@ -30,10 +30,10 @@ }, "module/codemirror-promql": { "name": "@prometheus-io/codemirror-promql", - "version": "0.52.1", + "version": "0.53.0-rc.0", "license": "Apache-2.0", "dependencies": { - "@prometheus-io/lezer-promql": "0.52.1", + "@prometheus-io/lezer-promql": "0.53.0-rc.0", "lru-cache": "^7.18.3" }, "devDependencies": { @@ -69,7 +69,7 @@ }, "module/lezer-promql": { "name": "@prometheus-io/lezer-promql", - "version": "0.52.1", + "version": "0.53.0-rc.0", "license": "Apache-2.0", "devDependencies": { "@lezer/generator": "^1.5.1", @@ -19233,7 +19233,7 @@ }, "react-app": { "name": 
"@prometheus-io/app", - "version": "0.52.1", + "version": "0.53.0-rc.0", "dependencies": { "@codemirror/autocomplete": "^6.11.1", "@codemirror/commands": "^6.3.2", @@ -19251,7 +19251,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.52.1", + "@prometheus-io/codemirror-promql": "0.53.0-rc.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", diff --git a/web/ui/package.json b/web/ui/package.json index 407b97f6c4..4c9ce03e4e 100644 --- a/web/ui/package.json +++ b/web/ui/package.json @@ -28,5 +28,5 @@ "ts-jest": "^29.1.1", "typescript": "^4.9.5" }, - "version": "0.52.1" + "version": "0.53.0-rc.0" } diff --git a/web/ui/react-app/package.json b/web/ui/react-app/package.json index 4bc667b0f0..d21cf3db5b 100644 --- a/web/ui/react-app/package.json +++ b/web/ui/react-app/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/app", - "version": "0.52.1", + "version": "0.53.0-rc.0", "private": true, "dependencies": { "@codemirror/autocomplete": "^6.11.1", @@ -19,7 +19,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.52.1", + "@prometheus-io/codemirror-promql": "0.53.0-rc.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", From b59034ec3190462abc921b93d5822943cecbb386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 6 Jun 2024 12:29:34 +0200 Subject: [PATCH 02/26] Ammend changelog with missing user impact entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 511fa07468..4b3f1980b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,14 +6,21 @@ This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 50. -* [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 +* [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 #14048 * [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 -* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 -* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. #13974 +* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 +* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 * [BUGFIX] OTLP: Don't generate target_info unless at least one identifying label is defined. 
#13991 * [BUGFIX] OTLP: Don't generate target_info unless there are metrics. #13991 +* [BUGFIX] Native histograms: dDcouple native histogram ingestions and protobuf parsing that lead to errors when using created timestamp feature. #13987 +* [BUGFIX] Scaleway SD: Use the instance's public IP if no private IP is available as the `__address__` meta label. #13941 +* [BUGFIX] Query logger: Do not leak file descriptors on error. #13948 +* [BUGFIX] TSDB: Let queries with heavy regex matches be cancelled and not use up the CPU. #14096 #14103 #14118 #14199 +* [BUGFIX] UI: Allow users to opt-out of the multi-cluster setup for the main Prometheus dashboard, in environments where it isn't applicable. #14062 +* [BUGFIX] API: Do not warn if result count is equal to the limit, only when exceeding the limit for the series, label-names and label-values APIs. #14116 +* [BUGFIX] TSDB: Fix head stats and hooks when replaying a corrupted snapshot. #14079 ## 2.52.1 / 2024-05-29 From 73f74d301e5410880f8f977afbef754e33fae9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 6 Jun 2024 12:29:49 +0200 Subject: [PATCH 03/26] Clarify action to take with regards to the changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- RELEASE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index f313c4172d..f9a42be6b8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -149,6 +149,8 @@ Changes for a patch release or release candidate should be merged into the previ Bump the version in the `VERSION` file and update `CHANGELOG.md`. Do this in a proper PR pointing to the release branch as this gives others the opportunity to chime in on the release in general and on the addition to the changelog in particular. For a release candidate, append something like `-rc.0` to the version (with the corresponding changes to the tag name, the release name etc.). +When updating the `CHANGELOG.md` look at all PRs included in the release since the last release and verify if they need a changelog entry. + Note that `CHANGELOG.md` should only document changes relevant to users of Prometheus, including external API changes, performance improvements, and new features. Do not document changes of internal interfaces, code refactorings and clean-ups, changes to the build process, etc. People interested in these are asked to refer to the git history. For release candidates still update `CHANGELOG.md`, but when you cut the final release later, merge all the changes from the pre-releases into the one final update. From c8de725abaf6058787b389ab4260f34cbeb21069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 6 Jun 2024 12:31:03 +0200 Subject: [PATCH 04/26] Fix typo in changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b3f1980b0..225fb70480 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ This release changes the default for GOGC, the Go runtime control for the trade- * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 * [BUGFIX] OTLP: Don't generate target_info unless at least one identifying label is defined. #13991 * [BUGFIX] OTLP: Don't generate target_info unless there are metrics. 
#13991 -* [BUGFIX] Native histograms: dDcouple native histogram ingestions and protobuf parsing that lead to errors when using created timestamp feature. #13987 +* [BUGFIX] Native histograms: Decouple native histogram ingestions and protobuf parsing that lead to errors when using created timestamp feature. #13987 * [BUGFIX] Scaleway SD: Use the instance's public IP if no private IP is available as the `__address__` meta label. #13941 * [BUGFIX] Query logger: Do not leak file descriptors on error. #13948 * [BUGFIX] TSDB: Let queries with heavy regex matches be cancelled and not use up the CPU. #14096 #14103 #14118 #14199 From 3feefd903b2265373272d3ed443c76571ea45008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 6 Jun 2024 17:54:04 +0200 Subject: [PATCH 05/26] Update changelog from review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 225fb70480..754386bd00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,13 +12,11 @@ This release changes the default for GOGC, the Go runtime control for the trade- * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 -* [BUGFIX] OTLP: Don't generate target_info unless at least one identifying label is defined. #13991 -* [BUGFIX] OTLP: Don't generate target_info unless there are metrics. #13991 -* [BUGFIX] Native histograms: Decouple native histogram ingestions and protobuf parsing that lead to errors when using created timestamp feature. #13987 +* [BUGFIX] OTLP: Don't generate target_info unless there are metrics and at least one identifying label is defined. #13991 +* [BUGFIX] Scrape: Do not try to ingest native histograms when the native histograms feature is turned off. This happened when protobuf scrape was enabled by for example the created timestamp feature. #13987 * [BUGFIX] Scaleway SD: Use the instance's public IP if no private IP is available as the `__address__` meta label. #13941 * [BUGFIX] Query logger: Do not leak file descriptors on error. #13948 * [BUGFIX] TSDB: Let queries with heavy regex matches be cancelled and not use up the CPU. #14096 #14103 #14118 #14199 -* [BUGFIX] UI: Allow users to opt-out of the multi-cluster setup for the main Prometheus dashboard, in environments where it isn't applicable. #14062 * [BUGFIX] API: Do not warn if result count is equal to the limit, only when exceeding the limit for the series, label-names and label-values APIs. #14116 * [BUGFIX] TSDB: Fix head stats and hooks when replaying a corrupted snapshot.
#14079 From 1d2f2cb43da031f91947682a90c02062665a386f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 6 Jun 2024 19:47:36 +0200 Subject: [PATCH 06/26] Fix Group.Equals() to take in account the new queryOffset too (#14273) Signed-off-by: Marco Pracucci --- rules/group.go | 4 ++ rules/group_test.go | 98 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 rules/group_test.go diff --git a/rules/group.go b/rules/group.go index 9ae89789d0..c0ad18c187 100644 --- a/rules/group.go +++ b/rules/group.go @@ -793,6 +793,10 @@ func (g *Group) Equals(ng *Group) bool { return false } + if ((g.queryOffset == nil) != (ng.queryOffset == nil)) || (g.queryOffset != nil && ng.queryOffset != nil && *g.queryOffset != *ng.queryOffset) { + return false + } + if len(g.rules) != len(ng.rules) { return false } diff --git a/rules/group_test.go b/rules/group_test.go new file mode 100644 index 0000000000..ff1ef3d6c1 --- /dev/null +++ b/rules/group_test.go @@ -0,0 +1,98 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rules + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestGroup_Equals(t *testing.T) { + tests := map[string]struct { + first *Group + second *Group + expected bool + }{ + "no query offset set on both groups": { + first: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + }, + second: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + }, + expected: true, + }, + "query offset set only on the first group": { + first: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + queryOffset: pointerOf[time.Duration](time.Minute), + }, + second: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + }, + expected: false, + }, + "query offset set on both groups to the same value": { + first: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + queryOffset: pointerOf[time.Duration](time.Minute), + }, + second: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + queryOffset: pointerOf[time.Duration](time.Minute), + }, + expected: true, + }, + "query offset set on both groups to different value": { + first: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + queryOffset: pointerOf[time.Duration](time.Minute), + }, + second: &Group{ + name: "group-1", + file: "file-1", + interval: time.Minute, + queryOffset: pointerOf[time.Duration](2 * time.Minute), + }, + expected: false, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + require.Equal(t, testData.expected, testData.first.Equals(testData.second)) + require.Equal(t, testData.expected, testData.second.Equals(testData.first)) + }) + } +} + +func pointerOf[T any](value T) *T { + return &value +} From dd4400146521c996239da57d2a225a608e3915cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 7 Jun 2024 10:21:27 +0200 
Subject: [PATCH 07/26] Update changelog due to pr 14273 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 754386bd00..0bc3abd71e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ This release changes the default for GOGC, the Go runtime control for the trade- * [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 #14048 * [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 -* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 +* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 #14273 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 From 6ccee2c4a537f0ebcab79522710635e987c86282 Mon Sep 17 00:00:00 2001 From: SuperQ Date: Mon, 10 Jun 2024 09:34:55 +0200 Subject: [PATCH 08/26] Tune default GOGC Adjust the default GOGC value to 75. This is less of a memory savings, but has less impact on CPU use. Signed-off-by: SuperQ --- config/config.go | 2 +- docs/configuration/configuration.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/config.go b/config/config.go index d02081d2e1..9defa10d48 100644 --- a/config/config.go +++ b/config/config.go @@ -154,7 +154,7 @@ var ( DefaultRuntimeConfig = RuntimeConfig{ // Go runtime tuning. - GoGC: 50, + GoGC: 75, } // DefaultScrapeConfig is the default scrape configuration. diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 26c088e135..f0e13cf136 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -125,7 +125,7 @@ runtime: # Configure the Go garbage collector GOGC parameter # See: https://tip.golang.org/doc/gc-guide#GOGC # Lowering this number increases CPU usage. - [ gogc: | default = 50 ] + [ gogc: | default = 75 ] # Rule files specifies a list of globs. Rules and alerts are read from # all matching files. From 38bf349ff774d45708a994519b1636d47bcbf721 Mon Sep 17 00:00:00 2001 From: SuperQ Date: Tue, 11 Jun 2024 11:18:13 +0200 Subject: [PATCH 09/26] Update changelog for GOGC tuning Include #14285 in changelog. Signed-off-by: SuperQ --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bc3abd71e..e902a775f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## unreleased +This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 75. 
+ +* [CHANGE] Runtime: Change GOGC threshold from 50 to 75 #14285 + ## 2.53.0-rc.0 / 2024-06-06 This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 50. From 4cfec57606dd8c407f8524f4a8913a80a600cb9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 7 Jun 2024 10:46:13 +0200 Subject: [PATCH 10/26] Revert "Update changelog due to pr 14273" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit dd4400146521c996239da57d2a225a608e3915cb. Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e902a775f2..34820be452 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ This release changes the default for GOGC, the Go runtime control for the trade- * [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 #14048 * [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 -* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 #14273 +* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 From dd8676218b83cc07c046ce8d221c273f580d7c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 11 Jun 2024 12:55:26 +0200 Subject: [PATCH 11/26] Prepare 2.53.0-rc.1 release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 3 +++ VERSION | 2 +- web/ui/module/codemirror-promql/package.json | 4 ++-- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 14 +++++++------- web/ui/package.json | 2 +- web/ui/react-app/package.json | 4 ++-- 7 files changed, 17 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34820be452..0c6da426a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,12 @@ ## unreleased +## 2.53.0-rc.1 / 2024-06-11 + This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 75. * [CHANGE] Runtime: Change GOGC threshold from 50 to 75 #14285 +* [BUGFIX] Rules: Fix Group.Equals() to take in account the new queryOffset too. Followup to #14061. 
#14273 ## 2.53.0-rc.0 / 2024-06-06 diff --git a/VERSION b/VERSION index ae392bf33c..8d108ef311 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.53.0-rc.0 +2.53.0-rc.1 diff --git a/web/ui/module/codemirror-promql/package.json b/web/ui/module/codemirror-promql/package.json index 152abc8c7b..5039e8bd26 100644 --- a/web/ui/module/codemirror-promql/package.json +++ b/web/ui/module/codemirror-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "description": "a CodeMirror mode for the PromQL language", "types": "dist/esm/index.d.ts", "module": "dist/esm/index.js", @@ -29,7 +29,7 @@ }, "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0-rc.0", + "@prometheus-io/lezer-promql": "0.53.0-rc.1", "lru-cache": "^7.18.3" }, "devDependencies": { diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index 93486b8dec..6b155d00e0 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "description": "lezer-based PromQL grammar", "main": "dist/index.cjs", "type": "module", diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index d002109ddd..e1610c1152 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -1,12 +1,12 @@ { "name": "prometheus-io", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "prometheus-io", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "workspaces": [ "react-app", "module/*" @@ -30,10 +30,10 @@ }, "module/codemirror-promql": { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "license": "Apache-2.0", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0-rc.0", + "@prometheus-io/lezer-promql": "0.53.0-rc.1", "lru-cache": "^7.18.3" }, "devDependencies": { @@ -69,7 +69,7 @@ }, "module/lezer-promql": { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "license": "Apache-2.0", "devDependencies": { "@lezer/generator": "^1.5.1", @@ -19233,7 +19233,7 @@ }, "react-app": { "name": "@prometheus-io/app", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "dependencies": { "@codemirror/autocomplete": "^6.11.1", "@codemirror/commands": "^6.3.2", @@ -19251,7 +19251,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0-rc.0", + "@prometheus-io/codemirror-promql": "0.53.0-rc.1", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", diff --git a/web/ui/package.json b/web/ui/package.json index 4c9ce03e4e..06f44fe221 100644 --- a/web/ui/package.json +++ b/web/ui/package.json @@ -28,5 +28,5 @@ "ts-jest": "^29.1.1", "typescript": "^4.9.5" }, - "version": "0.53.0-rc.0" + "version": "0.53.0-rc.1" } diff --git a/web/ui/react-app/package.json b/web/ui/react-app/package.json index d21cf3db5b..63b3d60efc 100644 --- a/web/ui/react-app/package.json +++ b/web/ui/react-app/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/app", - "version": "0.53.0-rc.0", + "version": "0.53.0-rc.1", "private": true, "dependencies": { "@codemirror/autocomplete": "^6.11.1", @@ -19,7 +19,7 @@ "@lezer/lr": "^1.3.14", 
"@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0-rc.0", + "@prometheus-io/codemirror-promql": "0.53.0-rc.1", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", From 4c1e71fa0b3d33754f4d906caa86ba9f73ebafb3 Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Fri, 14 Jun 2024 15:02:46 +0200 Subject: [PATCH 12/26] Reduce the flakiness of TestAsyncRuleEvaluation (#14300) * Reduce the flakiness of TestAsyncRuleEvaluation This tests sleeps for 15 millisecond per rule group, and then comprares the entire execution time to be smaller than a multiple of that delay. The ruleCount is 6, so it assumes that the test will come to the assertions in less than 90ms. Meanwhile, the Github's Windows runner: - ...Huh, oh? What? How much time? milliwhat? Sorry I don't speak that. TL;DR, this increases the delay to 250 millisecond. This won't prevent the test from being flaky, but will reduce the flakiness by several orders of magnitude and hopefully won't be an issue anymore. Signed-off-by: Oleg Zaytsev * Make tests parallel Signed-off-by: Oleg Zaytsev --------- Signed-off-by: Oleg Zaytsev --- rules/manager_test.go | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/rules/manager_test.go b/rules/manager_test.go index 11d1282bd3..d31bfc07aa 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -1910,18 +1910,12 @@ func TestDependencyMapUpdatesOnGroupUpdate(t *testing.T) { } func TestAsyncRuleEvaluation(t *testing.T) { - storage := teststorage.New(t) - t.Cleanup(func() { storage.Close() }) - - var ( - inflightQueries atomic.Int32 - maxInflight atomic.Int32 - ) - t.Run("synchronous evaluation with independent rules", func(t *testing.T) { - // Reset. - inflightQueries.Store(0) - maxInflight.Store(0) + t.Parallel() + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + inflightQueries := atomic.Int32{} + maxInflight := atomic.Int32{} ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) @@ -1949,9 +1943,11 @@ func TestAsyncRuleEvaluation(t *testing.T) { }) t.Run("asynchronous evaluation with independent and dependent rules", func(t *testing.T) { - // Reset. - inflightQueries.Store(0) - maxInflight.Store(0) + t.Parallel() + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + inflightQueries := atomic.Int32{} + maxInflight := atomic.Int32{} ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) @@ -1985,9 +1981,11 @@ func TestAsyncRuleEvaluation(t *testing.T) { }) t.Run("asynchronous evaluation of all independent rules, insufficient concurrency", func(t *testing.T) { - // Reset. - inflightQueries.Store(0) - maxInflight.Store(0) + t.Parallel() + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + inflightQueries := atomic.Int32{} + maxInflight := atomic.Int32{} ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) @@ -2021,9 +2019,11 @@ func TestAsyncRuleEvaluation(t *testing.T) { }) t.Run("asynchronous evaluation of all independent rules, sufficient concurrency", func(t *testing.T) { - // Reset. 
- inflightQueries.Store(0) - maxInflight.Store(0) + t.Parallel() + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + inflightQueries := atomic.Int32{} + maxInflight := atomic.Int32{} ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) @@ -2098,7 +2098,7 @@ func TestBoundedRuleEvalConcurrency(t *testing.T) { require.EqualValues(t, maxInflight.Load(), int32(maxConcurrency)+int32(groupCount)) } -const artificialDelay = 15 * time.Millisecond +const artificialDelay = 250 * time.Millisecond func optsFactory(storage storage.Storage, maxInflight, inflightQueries *atomic.Int32, maxConcurrent int64) *ManagerOptions { var inflightMu sync.Mutex From e7db2e30a41b71e21f2477c0a756884bfba4579f Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Sat, 15 Jun 2024 11:43:26 -0700 Subject: [PATCH 13/26] fix check context cancellation not incrementing count Signed-off-by: Ben Ye --- tsdb/index/index.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tsdb/index/index.go b/tsdb/index/index.go index 8172b81ce3..09fb737bfd 100644 --- a/tsdb/index/index.go +++ b/tsdb/index/index.go @@ -1557,6 +1557,7 @@ func (r *Reader) LabelNamesFor(ctx context.Context, postings Postings) ([]string i := 0 for postings.Next() { id := postings.At() + i++ if i%checkContextEveryNIterations == 0 && ctx.Err() != nil { return nil, ctx.Err() From e121d073886be49cdf81ac1fa47e049e7d069c1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Sun, 16 Jun 2024 10:24:09 +0200 Subject: [PATCH 14/26] Prepare release 2.53.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 13 +++---------- VERSION | 2 +- web/ui/module/codemirror-promql/package.json | 4 ++-- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 14 +++++++------- web/ui/package.json | 2 +- web/ui/react-app/package.json | 4 ++-- 7 files changed, 17 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c6da426a6..82212858c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,20 +2,13 @@ ## unreleased -## 2.53.0-rc.1 / 2024-06-11 +## 2.53.0 / 2024-06-16 This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 75. -* [CHANGE] Runtime: Change GOGC threshold from 50 to 75 #14285 -* [BUGFIX] Rules: Fix Group.Equals() to take in account the new queryOffset too. Followup to #14061. #14273 - -## 2.53.0-rc.0 / 2024-06-06 - -This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 50. - * [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 #14048 -* [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 -* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. 
#14061 #14216 +* [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 #14285 +* [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 #14273 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [ENHANCEMENT] TSDB: Optimize querying with regexp matchers. #13620 diff --git a/VERSION b/VERSION index 8d108ef311..261d95596f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.53.0-rc.1 +2.53.0 diff --git a/web/ui/module/codemirror-promql/package.json b/web/ui/module/codemirror-promql/package.json index 5039e8bd26..519c333653 100644 --- a/web/ui/module/codemirror-promql/package.json +++ b/web/ui/module/codemirror-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0-rc.1", + "version": "0.53.0", "description": "a CodeMirror mode for the PromQL language", "types": "dist/esm/index.d.ts", "module": "dist/esm/index.js", @@ -29,7 +29,7 @@ }, "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0-rc.1", + "@prometheus-io/lezer-promql": "0.53.0", "lru-cache": "^7.18.3" }, "devDependencies": { diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index 6b155d00e0..5a3b0055b7 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0-rc.1", + "version": "0.53.0", "description": "lezer-based PromQL grammar", "main": "dist/index.cjs", "type": "module", diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index e1610c1152..c8135d5e20 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -1,12 +1,12 @@ { "name": "prometheus-io", - "version": "0.53.0-rc.1", + "version": "0.53.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "prometheus-io", - "version": "0.53.0-rc.1", + "version": "0.53.0", "workspaces": [ "react-app", "module/*" @@ -30,10 +30,10 @@ }, "module/codemirror-promql": { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.0-rc.1", + "version": "0.53.0", "license": "Apache-2.0", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.0-rc.1", + "@prometheus-io/lezer-promql": "0.53.0", "lru-cache": "^7.18.3" }, "devDependencies": { @@ -69,7 +69,7 @@ }, "module/lezer-promql": { "name": "@prometheus-io/lezer-promql", - "version": "0.53.0-rc.1", + "version": "0.53.0", "license": "Apache-2.0", "devDependencies": { "@lezer/generator": "^1.5.1", @@ -19233,7 +19233,7 @@ }, "react-app": { "name": "@prometheus-io/app", - "version": "0.53.0-rc.1", + "version": "0.53.0", "dependencies": { "@codemirror/autocomplete": "^6.11.1", "@codemirror/commands": "^6.3.2", @@ -19251,7 +19251,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0-rc.1", + "@prometheus-io/codemirror-promql": "0.53.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", diff --git a/web/ui/package.json b/web/ui/package.json index 
06f44fe221..8b924737d6 100644 --- a/web/ui/package.json +++ b/web/ui/package.json @@ -28,5 +28,5 @@ "ts-jest": "^29.1.1", "typescript": "^4.9.5" }, - "version": "0.53.0-rc.1" + "version": "0.53.0" } diff --git a/web/ui/react-app/package.json b/web/ui/react-app/package.json index 63b3d60efc..7d9518e8d4 100644 --- a/web/ui/react-app/package.json +++ b/web/ui/react-app/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/app", - "version": "0.53.0-rc.1", + "version": "0.53.0", "private": true, "dependencies": { "@codemirror/autocomplete": "^6.11.1", @@ -19,7 +19,7 @@ "@lezer/lr": "^1.3.14", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.0-rc.1", + "@prometheus-io/codemirror-promql": "0.53.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^7.6.2", From 0e6fca8e76baf59e4a5f67f398dbf08f654b8013 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Sun, 16 Jun 2024 12:09:42 -0700 Subject: [PATCH 15/26] add unit test Signed-off-by: Ben Ye --- tsdb/index/index.go | 6 ++++-- tsdb/index/index_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/tsdb/index/index.go b/tsdb/index/index.go index 09fb737bfd..3621054598 100644 --- a/tsdb/index/index.go +++ b/tsdb/index/index.go @@ -1559,8 +1559,10 @@ func (r *Reader) LabelNamesFor(ctx context.Context, postings Postings) ([]string id := postings.At() i++ - if i%checkContextEveryNIterations == 0 && ctx.Err() != nil { - return nil, ctx.Err() + if i%checkContextEveryNIterations == 0 { + if ctxErr := ctx.Err(); ctxErr != nil { + return nil, ctxErr + } } offset := id diff --git a/tsdb/index/index_test.go b/tsdb/index/index_test.go index 038caacf8e..d81dd8696c 100644 --- a/tsdb/index/index_test.go +++ b/tsdb/index/index_test.go @@ -634,6 +634,31 @@ func TestReader_PostingsForLabelMatchingHonorsContextCancel(t *testing.T) { require.Equal(t, failAfter, ctx.Count()) } +func TestReader_LabelNamesForHonorsContextCancel(t *testing.T) { + const seriesCount = 1000 + var input indexWriterSeriesSlice + for i := 1; i <= seriesCount; i++ { + input = append(input, &indexWriterSeries{ + labels: labels.FromStrings(labels.MetricName, fmt.Sprintf("%4d", i)), + chunks: []chunks.Meta{ + {Ref: 1, MinTime: 0, MaxTime: 10}, + }, + }) + } + ir, _, _ := createFileReader(context.Background(), t, input) + + name, value := AllPostingsKey() + p, err := ir.Postings(context.Background(), name, value) + require.NoError(t, err) + // We check context cancellation every 128 iterations so 3 will fail after + // iterating 3 * 128 series. + failAfter := uint64(3) + ctx := &testutil.MockContextErrAfter{FailAfter: failAfter} + _, err = ir.LabelNamesFor(ctx, p) + require.Error(t, err) + require.Equal(t, failAfter, ctx.Count()) +} + // createFileReader creates a temporary index file. It writes the provided input to this file. // It returns a Reader for this file, the file's name, and the symbol map. 
func createFileReader(ctx context.Context, tb testing.TB, input indexWriterSeriesSlice) (*Reader, string, map[string]struct{}) { From 987fa5c6a2782ebf2c9902b5708245741d007019 Mon Sep 17 00:00:00 2001 From: Charles Korn Date: Mon, 17 Jun 2024 16:43:01 +1000 Subject: [PATCH 16/26] Convert range query test cases to test scripting language Signed-off-by: Charles Korn --- promql/engine_test.go | 161 ------------------ promql/promqltest/testdata/range_queries.test | 73 ++++++++ 2 files changed, 73 insertions(+), 161 deletions(-) create mode 100644 promql/promqltest/testdata/range_queries.test diff --git a/promql/engine_test.go b/promql/engine_test.go index 69d9ea0361..2d13500b1d 100644 --- a/promql/engine_test.go +++ b/promql/engine_test.go @@ -3061,167 +3061,6 @@ func TestEngineOptsValidation(t *testing.T) { } } -func TestRangeQuery(t *testing.T) { - cases := []struct { - Name string - Load string - Query string - Result parser.Value - Start time.Time - End time.Time - Interval time.Duration - }{ - { - Name: "sum_over_time with all values", - Load: `load 30s - bar 0 1 10 100 1000`, - Query: "sum_over_time(bar[30s])", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 0, T: 0}, {F: 11, T: 60000}, {F: 1100, T: 120000}}, - Metric: labels.EmptyLabels(), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 60 * time.Second, - }, - { - Name: "sum_over_time with trailing values", - Load: `load 30s - bar 0 1 10 100 1000 0 0 0 0`, - Query: "sum_over_time(bar[30s])", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 0, T: 0}, {F: 11, T: 60000}, {F: 1100, T: 120000}}, - Metric: labels.EmptyLabels(), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 60 * time.Second, - }, - { - Name: "sum_over_time with all values long", - Load: `load 30s - bar 0 1 10 100 1000 10000 100000 1000000 10000000`, - Query: "sum_over_time(bar[30s])", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 0, T: 0}, {F: 11, T: 60000}, {F: 1100, T: 120000}, {F: 110000, T: 180000}, {F: 11000000, T: 240000}}, - Metric: labels.EmptyLabels(), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(240, 0), - Interval: 60 * time.Second, - }, - { - Name: "sum_over_time with all values random", - Load: `load 30s - bar 5 17 42 2 7 905 51`, - Query: "sum_over_time(bar[30s])", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 5, T: 0}, {F: 59, T: 60000}, {F: 9, T: 120000}, {F: 956, T: 180000}}, - Metric: labels.EmptyLabels(), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(180, 0), - Interval: 60 * time.Second, - }, - { - Name: "metric query", - Load: `load 30s - metric 1+1x4`, - Query: "metric", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 1, T: 0}, {F: 3, T: 60000}, {F: 5, T: 120000}}, - Metric: labels.FromStrings("__name__", "metric"), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 1 * time.Minute, - }, - { - Name: "metric query with trailing values", - Load: `load 30s - metric 1+1x8`, - Query: "metric", - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 1, T: 0}, {F: 3, T: 60000}, {F: 5, T: 120000}}, - Metric: labels.FromStrings("__name__", "metric"), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 1 * time.Minute, - }, - { - Name: "short-circuit", - Load: `load 30s - foo{job="1"} 1+1x4 - bar{job="2"} 1+1x4`, - Query: `foo > 2 or bar`, - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 1, T: 
0}, {F: 3, T: 60000}, {F: 5, T: 120000}}, - Metric: labels.FromStrings( - "__name__", "bar", - "job", "2", - ), - }, - promql.Series{ - Floats: []promql.FPoint{{F: 3, T: 60000}, {F: 5, T: 120000}}, - Metric: labels.FromStrings( - "__name__", "foo", - "job", "1", - ), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 1 * time.Minute, - }, - { - Name: "drop-metric-name", - Load: `load 30s - requests{job="1", __address__="bar"} 100`, - Query: `requests * 2`, - Result: promql.Matrix{ - promql.Series{ - Floats: []promql.FPoint{{F: 200, T: 0}, {F: 200, T: 60000}, {F: 200, T: 120000}}, - Metric: labels.FromStrings( - "__address__", "bar", - "job", "1", - ), - }, - }, - Start: time.Unix(0, 0), - End: time.Unix(120, 0), - Interval: 1 * time.Minute, - }, - } - for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - engine := newTestEngine() - storage := promqltest.LoadedStorage(t, c.Load) - t.Cleanup(func() { storage.Close() }) - - qry, err := engine.NewRangeQuery(context.Background(), storage, nil, c.Query, c.Start, c.End, c.Interval) - require.NoError(t, err) - - res := qry.Exec(context.Background()) - require.NoError(t, res.Err) - testutil.RequireEqual(t, c.Result, res.Value) - }) - } -} - func TestInstantQueryWithRangeVectorSelector(t *testing.T) { engine := newTestEngine() diff --git a/promql/promqltest/testdata/range_queries.test b/promql/promqltest/testdata/range_queries.test new file mode 100644 index 0000000000..e695109602 --- /dev/null +++ b/promql/promqltest/testdata/range_queries.test @@ -0,0 +1,73 @@ +# sum_over_time with all values +load 30s + bar 0 1 10 100 1000 + +eval range from 0 to 2m step 1m sum_over_time(bar[30s]) + {} 0 11 1100 + +clear + +# sum_over_time with trailing values +load 30s + bar 0 1 10 100 1000 0 0 0 0 + +eval range from 0 to 2m step 1m sum_over_time(bar[30s]) + {} 0 11 1100 + +clear + +# sum_over_time with all values long +load 30s + bar 0 1 10 100 1000 10000 100000 1000000 10000000 + +eval range from 0 to 4m step 1m sum_over_time(bar[30s]) + {} 0 11 1100 110000 11000000 + +clear + +# sum_over_time with all values random +load 30s + bar 5 17 42 2 7 905 51 + +eval range from 0 to 3m step 1m sum_over_time(bar[30s]) + {} 5 59 9 956 + +clear + +# metric query +load 30s + metric 1+1x4 + +eval range from 0 to 2m step 1m metric + metric 1 3 5 + +clear + +# metric query with trailing values +load 30s + metric 1+1x8 + +eval range from 0 to 2m step 1m metric + metric 1 3 5 + +clear + +# short-circuit +load 30s + foo{job="1"} 1+1x4 + bar{job="2"} 1+1x4 + +eval range from 0 to 2m step 1m foo > 2 or bar + foo{job="1"} _ 3 5 + bar{job="2"} 1 3 5 + +clear + +# Drop metric name +load 30s + requests{job="1", __address__="bar"} 100 + +eval range from 0 to 2m step 1m requests * 2 + {job="1", __address__="bar"} 200 200 200 + +clear From aeec30f082012b3f3445d51ecdc30dfb15790a45 Mon Sep 17 00:00:00 2001 From: Charles Korn Date: Mon, 17 Jun 2024 16:56:56 +1000 Subject: [PATCH 17/26] Convert `TestTimestampFunction_StepsMoreOftenThanSamples` Signed-off-by: Charles Korn --- promql/engine_test.go | 41 ----------------------- promql/promqltest/testdata/functions.test | 8 +++++ 2 files changed, 8 insertions(+), 41 deletions(-) diff --git a/promql/engine_test.go b/promql/engine_test.go index 2d13500b1d..4e321a6c33 100644 --- a/promql/engine_test.go +++ b/promql/engine_test.go @@ -2015,47 +2015,6 @@ func TestSubquerySelector(t *testing.T) { } } -func TestTimestampFunction_StepsMoreOftenThanSamples(t *testing.T) { - engine := newTestEngine() - storage := 
promqltest.LoadedStorage(t, ` -load 1m - metric 0+1x1000 -`) - t.Cleanup(func() { storage.Close() }) - - query := "timestamp(metric)" - start := time.Unix(0, 0) - end := time.Unix(61, 0) - interval := time.Second - - // We expect the value to be 0 for t=0s to t=59s (inclusive), then 60 for t=60s and t=61s. - expectedPoints := []promql.FPoint{} - - for t := 0; t <= 59; t++ { - expectedPoints = append(expectedPoints, promql.FPoint{F: 0, T: int64(t * 1000)}) - } - - expectedPoints = append( - expectedPoints, - promql.FPoint{F: 60, T: 60_000}, - promql.FPoint{F: 60, T: 61_000}, - ) - - expectedResult := promql.Matrix{ - promql.Series{ - Floats: expectedPoints, - Metric: labels.EmptyLabels(), - }, - } - - qry, err := engine.NewRangeQuery(context.Background(), storage, nil, query, start, end, interval) - require.NoError(t, err) - - res := qry.Exec(context.Background()) - require.NoError(t, res.Err) - testutil.RequireEqual(t, expectedResult, res.Value) -} - type FakeQueryLogger struct { closed bool logs []interface{} diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test index 2c198374ac..7e741e9956 100644 --- a/promql/promqltest/testdata/functions.test +++ b/promql/promqltest/testdata/functions.test @@ -1213,3 +1213,11 @@ eval instant at 5m log10(exp_root_log - 20) {l="y"} -Inf clear + +# Test that timestamp() handles the scenario where there are more steps than samples. +load 1m + metric 0+1x1000 + +# We expect the value to be 0 for t=0s to t=59s (inclusive), then 60 for t=60s and t=61s. +eval range from 0 to 61s step 1s timestamp(metric) + {} 0x59 60 60 From 0fbf4a2529070ce6f8880b23b4834eeab6159fa6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 17 Jun 2024 10:40:45 +0200 Subject: [PATCH 18/26] Export remote.ToLabelMatchers() Signed-off-by: Marco Pracucci --- storage/remote/codec.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/storage/remote/codec.go b/storage/remote/codec.go index 1228b23f5c..c3b815a4d3 100644 --- a/storage/remote/codec.go +++ b/storage/remote/codec.go @@ -95,7 +95,7 @@ func EncodeReadResponse(resp *prompb.ReadResponse, w http.ResponseWriter) error // ToQuery builds a Query proto. func ToQuery(from, to int64, matchers []*labels.Matcher, hints *storage.SelectHints) (*prompb.Query, error) { - ms, err := toLabelMatchers(matchers) + ms, err := ToLabelMatchers(matchers) if err != nil { return nil, err } @@ -566,7 +566,8 @@ func validateLabelsAndMetricName(ls []prompb.Label) error { return nil } -func toLabelMatchers(matchers []*labels.Matcher) ([]*prompb.LabelMatcher, error) { +// ToLabelMatchers converts Prometheus label matchers to protobuf label matchers. +func ToLabelMatchers(matchers []*labels.Matcher) ([]*prompb.LabelMatcher, error) { pbMatchers := make([]*prompb.LabelMatcher, 0, len(matchers)) for _, m := range matchers { var mType prompb.LabelMatcher_Type @@ -591,7 +592,7 @@ func toLabelMatchers(matchers []*labels.Matcher) ([]*prompb.LabelMatcher, error) return pbMatchers, nil } -// FromLabelMatchers parses protobuf label matchers to Prometheus label matchers. +// FromLabelMatchers converts protobuf label matchers to Prometheus label matchers. 
func FromLabelMatchers(matchers []*prompb.LabelMatcher) ([]*labels.Matcher, error) { result := make([]*labels.Matcher, 0, len(matchers)) for _, matcher := range matchers { From 4f78cc809c3f3cfd5a4d2bfce9b9bedacb7eb27a Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Tue, 18 Jun 2024 11:57:37 +0200 Subject: [PATCH 19/26] Refactor `toNormalisedLower`: shorter and slightly faster. (#14299) Refactor toNormalisedLower: shorter and slightly faster Signed-off-by: Oleg Zaytsev --- model/labels/labels_common.go | 5 +++++ model/labels/labels_dedupelabels.go | 5 ----- model/labels/labels_stringlabels.go | 5 ----- model/labels/regexp.go | 34 ++++++++--------------------- model/labels/regexp_test.go | 4 ++++ 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/model/labels/labels_common.go b/model/labels/labels_common.go index f46321c97e..4bc94f84fe 100644 --- a/model/labels/labels_common.go +++ b/model/labels/labels_common.go @@ -18,6 +18,7 @@ import ( "encoding/json" "slices" "strconv" + "unsafe" "github.com/prometheus/common/model" ) @@ -215,3 +216,7 @@ func contains(s []Label, n string) bool { } return false } + +func yoloString(b []byte) string { + return *((*string)(unsafe.Pointer(&b))) +} diff --git a/model/labels/labels_dedupelabels.go b/model/labels/labels_dedupelabels.go index dfc74aa3a3..972f5dc164 100644 --- a/model/labels/labels_dedupelabels.go +++ b/model/labels/labels_dedupelabels.go @@ -20,7 +20,6 @@ import ( "slices" "strings" "sync" - "unsafe" "github.com/cespare/xxhash/v2" ) @@ -426,10 +425,6 @@ func EmptyLabels() Labels { return Labels{} } -func yoloString(b []byte) string { - return *((*string)(unsafe.Pointer(&b))) -} - // New returns a sorted Labels from the given labels. // The caller has to guarantee that all label names are unique. // Note this function is not efficient; should not be used in performance-critical places. diff --git a/model/labels/labels_stringlabels.go b/model/labels/labels_stringlabels.go index 9ef764daec..bccceb61fe 100644 --- a/model/labels/labels_stringlabels.go +++ b/model/labels/labels_stringlabels.go @@ -299,11 +299,6 @@ func Equal(ls, o Labels) bool { func EmptyLabels() Labels { return Labels{} } - -func yoloString(b []byte) string { - return *((*string)(unsafe.Pointer(&b))) -} - func yoloBytes(s string) (b []byte) { *(*string)(unsafe.Pointer(&b)) = s (*reflect.SliceHeader)(unsafe.Pointer(&b)).Cap = len(s) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 1f3f15eb07..1e9db882bf 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -798,39 +798,23 @@ func (m *equalMultiStringMapMatcher) Matches(s string) bool { // toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert // it to lower case. func toNormalisedLower(s string) string { - // Check if the string is all ASCII chars and convert any upper case character to lower case character. - isASCII := true - var ( - b strings.Builder - pos int - ) - b.Grow(len(s)) + var buf []byte for i := 0; i < len(s); i++ { c := s[i] - if isASCII && c >= utf8.RuneSelf { - isASCII = false - break + if c >= utf8.RuneSelf { + return strings.Map(unicode.ToLower, norm.NFKD.String(s)) } if 'A' <= c && c <= 'Z' { - c += 'a' - 'A' - if pos < i { - b.WriteString(s[pos:i]) + if buf == nil { + buf = []byte(s) } - b.WriteByte(c) - pos = i + 1 + buf[i] = c + 'a' - 'A' } } - if pos < len(s) { - b.WriteString(s[pos:]) + if buf == nil { + return s } - - // Optimize for ASCII-only strings. In this case we don't have to do any normalization. 
- if isASCII { - return b.String() - } - - // Normalise and convert to lower. - return strings.Map(unicode.ToLower, norm.NFKD.String(b.String())) + return yoloString(buf) } // anyStringWithoutNewlineMatcher is a stringMatcher which matches any string diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index c86a5cae41..008eae702c 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -1209,6 +1209,10 @@ func visitStringMatcher(matcher StringMatcher, callback func(matcher StringMatch func TestToNormalisedLower(t *testing.T) { testCases := map[string]string{ "foo": "foo", + "FOO": "foo", + "Foo": "foo", + "foO": "foo", + "fOo": "foo", "AAAAAAAAAAAAAAAAAAAAAAAA": "aaaaaaaaaaaaaaaaaaaaaaaa", "cccccccccccccccccccccccC": "cccccccccccccccccccccccc", "ſſſſſſſſſſſſſſſſſſſſſſſſS": "sssssssssssssssssssssssss", From fd1a89b7c87ec6658708fc631d6af591223d2077 Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Tue, 18 Jun 2024 12:28:56 +0200 Subject: [PATCH 20/26] Pass affected labels to `MemPostings.Delete()` (#14307) * Pass affected labels to MemPostings.Delete As suggested by @bboreham, we can track the labels of the deleted series and avoid iterating through all the label/value combinations. This looks much faster on the MemPostings.Delete call. We don't have a benchmark on stripeSeries.gc() where we'll pay the price of iterating the labels of each one of the deleted series. Signed-off-by: Oleg Zaytsev --- tsdb/head.go | 10 ++-- tsdb/head_test.go | 74 ++++++++++++++++++++++++++ tsdb/index/postings.go | 101 ++++++++---------------------------- tsdb/index/postings_test.go | 67 ++++++++++++------------ 4 files changed, 138 insertions(+), 114 deletions(-) diff --git a/tsdb/head.go b/tsdb/head.go index d5f7144fdb..5972a9c5d6 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -1552,7 +1552,7 @@ func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) { // Drop old chunks and remember series IDs and hashes if they can be // deleted entirely. - deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef) + deleted, affected, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef) seriesRemoved := len(deleted) h.metrics.seriesRemoved.Add(float64(seriesRemoved)) @@ -1561,7 +1561,7 @@ func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) { h.numSeries.Sub(uint64(seriesRemoved)) // Remove deleted series IDs from the postings lists. - h.postings.Delete(deleted) + h.postings.Delete(deleted, affected) // Remove tombstones referring to the deleted series. h.tombstones.DeleteTombstones(deleted) @@ -1869,9 +1869,10 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st // but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct // and there's no easy way to cast maps. // minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series. 
-func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) { +func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ map[labels.Label]struct{}, _ int, _, _ int64, minMmapFile int) { var ( deleted = map[storage.SeriesRef]struct{}{} + affected = map[labels.Label]struct{}{} rmChunks = 0 actualMint int64 = math.MaxInt64 minOOOTime int64 = math.MaxInt64 @@ -1927,6 +1928,7 @@ func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) ( } deleted[storage.SeriesRef(series.ref)] = struct{}{} + series.lset.Range(func(l labels.Label) { affected[l] = struct{}{} }) s.hashes[hashShard].del(hash, series.ref) delete(s.series[refShard], series.ref) deletedForCallback[series.ref] = series.lset @@ -1938,7 +1940,7 @@ func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) ( actualMint = mint } - return deleted, rmChunks, actualMint, minOOOTime, minMmapFile + return deleted, affected, rmChunks, actualMint, minOOOTime, minMmapFile } // The iterForDeletion function iterates through all series, invoking the checkDeletedFunc for each. diff --git a/tsdb/head_test.go b/tsdb/head_test.go index bb437ab598..93f046e5b3 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -814,6 +814,80 @@ func TestHead_UnknownWALRecord(t *testing.T) { require.NoError(t, head.Close()) } +// BenchmarkHead_Truncate is quite heavy, so consider running it with +// -benchtime=10x or similar to get more stable and comparable results. +func BenchmarkHead_Truncate(b *testing.B) { + const total = 1e6 + + prepare := func(b *testing.B, churn int) *Head { + h, _ := newTestHead(b, 1000, wlog.CompressionNone, false) + b.Cleanup(func() { + require.NoError(b, h.Close()) + }) + + h.initTime(0) + + internedItoa := map[int]string{} + var mtx sync.RWMutex + itoa := func(i int) string { + mtx.RLock() + s, ok := internedItoa[i] + mtx.RUnlock() + if ok { + return s + } + mtx.Lock() + s = strconv.Itoa(i) + internedItoa[i] = s + mtx.Unlock() + return s + } + + allSeries := [total]labels.Labels{} + nameValues := make([]string, 0, 100) + for i := 0; i < total; i++ { + nameValues = nameValues[:0] + + // A thousand labels like lbl_x_of_1000, each with total/1000 values + thousand := "lbl_" + itoa(i%1000) + "_of_1000" + nameValues = append(nameValues, thousand, itoa(i/1000)) + // A hundred labels like lbl_x_of_100, each with total/100 values. + hundred := "lbl_" + itoa(i%100) + "_of_100" + nameValues = append(nameValues, hundred, itoa(i/100)) + + if i%13 == 0 { + ten := "lbl_" + itoa(i%10) + "_of_10" + nameValues = append(nameValues, ten, itoa(i%10)) + } + + allSeries[i] = labels.FromStrings(append(nameValues, "first", "a", "second", "a", "third", "a")...) + s, _, _ := h.getOrCreate(allSeries[i].Hash(), allSeries[i]) + s.mmappedChunks = []*mmappedChunk{ + {minTime: 1000 * int64(i/churn), maxTime: 999 + 1000*int64(i/churn)}, + } + } + + return h + } + + for _, churn := range []int{10, 100, 1000} { + b.Run(fmt.Sprintf("churn=%d", churn), func(b *testing.B) { + if b.N > total/churn { + // Just to make sure that benchmark still makes sense. + panic("benchmark not prepared") + } + h := prepare(b, churn) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + require.NoError(b, h.Truncate(1000*int64(i))) + // Make sure the benchmark is meaningful and it's actually truncating the expected amount of series. 
+ require.Equal(b, total-churn*i, int(h.NumSeries())) + } + }) + } +} + func TestHead_Truncate(t *testing.T) { h, _ := newTestHead(t, 1000, wlog.CompressionNone, false) defer func() { diff --git a/tsdb/index/postings.go b/tsdb/index/postings.go index 6b654f6b5b..d9b5b69de0 100644 --- a/tsdb/index/postings.go +++ b/tsdb/index/postings.go @@ -288,89 +288,34 @@ func (p *MemPostings) EnsureOrder(numberOfConcurrentProcesses int) { } // Delete removes all ids in the given map from the postings lists. -func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}) { - // We will take an optimistic read lock for the entire method, - // and only lock for writing when we actually find something to delete. - // - // Each SeriesRef can appear in several Postings. - // To change each one, we need to know the label name and value that it is indexed under. - // We iterate over all label names, then for each name all values, - // and look for individual series to be deleted. - p.mtx.RLock() - defer p.mtx.RUnlock() +// affectedLabels contains all the labels that are affected by the deletion, there's no need to check other labels. +func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected map[labels.Label]struct{}) { + p.mtx.Lock() + defer p.mtx.Unlock() - // Collect all keys relevant for deletion once. New keys added afterwards - // can by definition not be affected by any of the given deletes. - keys := make([]string, 0, len(p.m)) - maxVals := 0 - for n := range p.m { - keys = append(keys, n) - if len(p.m[n]) > maxVals { - maxVals = len(p.m[n]) + process := func(l labels.Label) { + orig := p.m[l.Name][l.Value] + repl := make([]storage.SeriesRef, 0, len(orig)) + for _, id := range orig { + if _, ok := deleted[id]; !ok { + repl = append(repl, id) + } + } + if len(repl) > 0 { + p.m[l.Name][l.Value] = repl + } else { + delete(p.m[l.Name], l.Value) + // Delete the key if we removed all values. + if len(p.m[l.Name]) == 0 { + delete(p.m, l.Name) + } } } - vals := make([]string, 0, maxVals) - for _, n := range keys { - // Copy the values and iterate the copy: if we unlock in the loop below, - // another goroutine might modify the map while we are part-way through it. - vals = vals[:0] - for v := range p.m[n] { - vals = append(vals, v) - } - - // For each posting we first analyse whether the postings list is affected by the deletes. - // If no, we remove the label value from the vals list. - // This way we only need to Lock once later. - for i := 0; i < len(vals); { - found := false - refs := p.m[n][vals[i]] - for _, id := range refs { - if _, ok := deleted[id]; ok { - i++ - found = true - break - } - } - - if !found { - // Didn't match, bring the last value to this position, make the slice shorter and check again. - // The order of the slice doesn't matter as it comes from a map iteration. - vals[i], vals = vals[len(vals)-1], vals[:len(vals)-1] - } - } - - // If no label values have deleted ids, just continue. - if len(vals) == 0 { - continue - } - - // The only vals left here are the ones that contain deleted ids. - // Now we take the write lock and remove the ids. - p.mtx.RUnlock() - p.mtx.Lock() - for _, l := range vals { - repl := make([]storage.SeriesRef, 0, len(p.m[n][l])) - - for _, id := range p.m[n][l] { - if _, ok := deleted[id]; !ok { - repl = append(repl, id) - } - } - if len(repl) > 0 { - p.m[n][l] = repl - } else { - delete(p.m[n], l) - } - } - - // Delete the key if we removed all values. 
- if len(p.m[n]) == 0 { - delete(p.m, n) - } - p.mtx.Unlock() - p.mtx.RLock() + for l := range affected { + process(l) } + process(allPostingsKey) } // Iter calls f for each postings list. It aborts if f returns an error and returns it. diff --git a/tsdb/index/postings_test.go b/tsdb/index/postings_test.go index 4f34cc47ea..96c9ed124b 100644 --- a/tsdb/index/postings_test.go +++ b/tsdb/index/postings_test.go @@ -979,9 +979,13 @@ func TestMemPostings_Delete(t *testing.T) { p.Add(3, labels.FromStrings("lbl2", "a")) before := p.Get(allPostingsKey.Name, allPostingsKey.Value) - p.Delete(map[storage.SeriesRef]struct{}{ + deletedRefs := map[storage.SeriesRef]struct{}{ 2: {}, - }) + } + affectedLabels := map[labels.Label]struct{}{ + {Name: "lbl1", Value: "b"}: {}, + } + p.Delete(deletedRefs, affectedLabels) after := p.Get(allPostingsKey.Name, allPostingsKey.Value) // Make sure postings gotten before the delete have the old data when @@ -1022,33 +1026,23 @@ func BenchmarkMemPostings_Delete(b *testing.B) { } const total = 1e6 - prepare := func() *MemPostings { - var ref storage.SeriesRef - next := func() storage.SeriesRef { - ref++ - return ref + allSeries := [total]labels.Labels{} + nameValues := make([]string, 0, 100) + for i := 0; i < total; i++ { + nameValues = nameValues[:0] + + // A thousand labels like lbl_x_of_1000, each with total/1000 values + thousand := "lbl_" + itoa(i%1000) + "_of_1000" + nameValues = append(nameValues, thousand, itoa(i/1000)) + // A hundred labels like lbl_x_of_100, each with total/100 values. + hundred := "lbl_" + itoa(i%100) + "_of_100" + nameValues = append(nameValues, hundred, itoa(i/100)) + + if i < 100 { + ten := "lbl_" + itoa(i%10) + "_of_10" + nameValues = append(nameValues, ten, itoa(i%10)) } - - p := NewMemPostings() - nameValues := make([]string, 0, 100) - for i := 0; i < total; i++ { - nameValues = nameValues[:0] - - // A thousand labels like lbl_x_of_1000, each with total/1000 values - thousand := "lbl_" + itoa(i%1000) + "_of_1000" - nameValues = append(nameValues, thousand, itoa(i/1000)) - // A hundred labels like lbl_x_of_100, each with total/100 values. - hundred := "lbl_" + itoa(i%100) + "_of_100" - nameValues = append(nameValues, hundred, itoa(i/100)) - - if i < 100 { - ten := "lbl_" + itoa(i%10) + "_of_10" - nameValues = append(nameValues, ten, itoa(i%10)) - } - - p.Add(next(), labels.FromStrings(append(nameValues, "first", "a", "second", "a", "third", "a")...)) - } - return p + allSeries[i] = labels.FromStrings(append(nameValues, "first", "a", "second", "a", "third", "a")...) 
} for _, refs := range []int{1, 100, 10_000} { @@ -1060,7 +1054,11 @@ func BenchmarkMemPostings_Delete(b *testing.B) { panic("benchmark not prepared") } - p := prepare() + p := NewMemPostings() + for i := range allSeries { + p.Add(storage.SeriesRef(i), allSeries[i]) + } + stop := make(chan struct{}) wg := sync.WaitGroup{} for i := 0; i < reads; i++ { @@ -1086,11 +1084,16 @@ func BenchmarkMemPostings_Delete(b *testing.B) { b.ResetTimer() for n := 0; n < b.N; n++ { - deleted := map[storage.SeriesRef]struct{}{} + deleted := make(map[storage.SeriesRef]struct{}, refs) + affected := make(map[labels.Label]struct{}, refs) for i := 0; i < refs; i++ { - deleted[storage.SeriesRef(n*refs+i)] = struct{}{} + ref := storage.SeriesRef(n*refs + i) + deleted[ref] = struct{}{} + allSeries[ref].Range(func(l labels.Label) { + affected[l] = struct{}{} + }) } - p.Delete(deleted) + p.Delete(deleted, affected) } }) } From 29d3e482676cbd1dde29d701d56fbdc98ba8ee61 Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Tue, 18 Jun 2024 13:45:53 +0200 Subject: [PATCH 21/26] Update CHANGELOG.md Co-authored-by: Julien <291750+roidelapluie@users.noreply.github.com> Signed-off-by: George Krajcsovits --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82212858c7..7e47999934 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ This release changes the default for GOGC, the Go runtime control for the trade-off between excess memory use and CPU usage. We have found that Prometheus operates with minimal additional CPU usage, but greatly reduced memory by adjusting the upstream Go default from 100 to 75. * [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 #14048 -* [CHANGE] Runtime: Change GOGC threshold from 100 to 50 #14176 #14285 +* [CHANGE] Runtime: Change GOGC threshold from 100 to 75 #14176 #14285 * [FEATURE] Rules: Add new option `query_offset` for each rule group via rule group configuration file and `rule_query_offset` as part of the global configuration to have more resilience for remote write delays. #14061 #14216 #14273 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` metric to measure the time it takes to restore a rule group. #13974 * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 From 1c3f322f7832d6ebbba9dc3a13b398e03e014bcf Mon Sep 17 00:00:00 2001 From: Rens Groothuijsen Date: Tue, 18 Jun 2024 13:51:47 +0200 Subject: [PATCH 22/26] docs: mention implicitly watched directories in documentation (#14019) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: mention implicitly watched directories in documentation Signed-off-by: Rens Groothuijsen * Add mention of atomic file renaming Co-authored-by: Ayoub Mrini Signed-off-by: Rens Groothuijsen --------- Signed-off-by: Rens Groothuijsen Co-authored-by: Ayoub Mrini Co-authored-by: Björn Rabenstein --- docs/configuration/configuration.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index b83219700a..5df7dae3c0 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -1608,7 +1608,16 @@ and serves as an interface to plug in custom service discovery mechanisms. 
 It reads a set of files containing a list of zero or more
 `<static_config>`s. Changes to all defined files are detected via disk watches
-and applied immediately. Files may be provided in YAML or JSON format. Only
+and applied immediately.
+
+While those individual files are watched for changes,
+the parent directory is also watched implicitly. This is to handle [atomic
+renaming](https://github.com/fsnotify/fsnotify/blob/c1467c02fba575afdb5f4201072ab8403bbf00f4/README.md?plain=1#L128) efficiently and to detect new files that match the configured globs.
+This may cause issues if the parent directory contains a large number of other files,
+as each of these files will be watched too, even though the events related
+to them are not relevant.
+
+Files may be provided in YAML or JSON format. Only
 changes resulting in well-formed target groups are applied. Files must
 contain a list of static configs, using these formats:
From 545d31f184cd405eeacf4b6f035d9b6ddbdee6bd Mon Sep 17 00:00:00 2001
From: anarcat
Date: Wed, 19 Jun 2024 01:46:13 -0400
Subject: [PATCH 23/26] docs: clarify backup requirements for storage (#14297)

* clarify backup requirements for storage

After reading this (again) recently, I was under the impression that our
backup strategy ("just throw Bacula at it") was simply not good enough and
that our backups were inconsistent. I filed [an issue internally][41627]
about this because of that concern.

But after reading a conversation with @SuperQ on IRC, I came away with the
impression that only the WAL files would be lost. This is an attempt at
documenting that more clearly.

[41627]: https://gitlab.torproject.org/tpo/tpa/team/-/issues/41627

---------

Signed-off-by: anarcat
Co-authored-by: Ben Kochie
---
 docs/storage.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/storage.md b/docs/storage.md
index b66f2062af..947960fe12 100644
--- a/docs/storage.md
+++ b/docs/storage.md
@@ -61,8 +61,11 @@ A Prometheus server's data directory looks something like this:
 Note that a limitation of local storage is that it is not clustered or
 replicated. Thus, it is not arbitrarily scalable or durable in the face of
 drive or node outages and should be managed like any other single node
-database. The use of RAID is suggested for storage availability, and
-[snapshots](querying/api.md#snapshot) are recommended for backups. With proper
+database.
+
+[Snapshots](querying/api.md#snapshot) are recommended for backups. Backups
+made without snapshots run the risk of losing data that was recorded since
+the last WAL sync, which typically happens every two hours. With proper
 architecture, it is possible to retain years of data in local storage.

 Alternatively, external storage may be used via the
From 94d28cd6cf96d0f56f8cd5f3f0b7b9777eb26640 Mon Sep 17 00:00:00 2001
From: machine424
Date: Fri, 31 May 2024 16:40:09 +0200
Subject: [PATCH 24/26] chore(notifier): add a reproducer for
 https://github.com/prometheus/prometheus/issues/13676 to show starvation of
 target group updates when the notifications queue is full and an
 Alertmanager is down.

The existing `TestHangingNotifier` that was added in
https://github.com/prometheus/prometheus/pull/10948 doesn't really reflect
reality, as the SD changes are manually fed into `syncCh` in a continuous
way, whereas in reality, updates are only resent every `updatert`.

The test added here sets up an SD manager and links it to the notifier.
The SD changes will be triggered by that manager, as it's done in reality.
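To make the failure mode concrete, here is a minimal, self-contained Go
sketch of the missed rendezvous (illustrative only, not code from this
patch; the queue size, tick interval, and sleep durations are invented).
The SD manager only attempts a non-blocking send on the unbuffered sync
channel once per `updatert` tick, while the receiving goroutine is almost
always busy inside a slow send cycle, so the two sides rarely meet:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        tsets := make(chan string)        // unbuffered, like the SD sync channel
        more := make(chan struct{}, 1000) // buffered, like the notifications queue
        for i := 0; i < 1000; i++ {
            more <- struct{}{}
        }

        // SD side: a non-blocking send attempt every "updatert" tick;
        // a missed rendezvous is simply retried on the next tick.
        go func() {
            for range time.Tick(5 * time.Millisecond) {
                select {
                case tsets <- "new target groups":
                    return
                default: // receiver busy, try again later
                }
            }
        }()

        // Notifier side: the receive window is tiny compared to the time
        // spent sending a batch to a hanging Alertmanager.
        for start := time.Now(); time.Since(start) < 2*time.Second; {
            select {
            case ts := <-tsets:
                fmt.Println("reloaded:", ts)
                return
            case <-more:
            }
            time.Sleep(50 * time.Millisecond) // one slow sendAll cycle
        }
        fmt.Println("target group update starved for 2s")
    }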
Signed-off-by: machine424 Co-authored-by: Ethan Hunter --- discovery/manager.go | 10 +++ notifier/notifier_test.go | 149 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/discovery/manager.go b/discovery/manager.go index f14071af30..897d7d151c 100644 --- a/discovery/manager.go +++ b/discovery/manager.go @@ -120,6 +120,16 @@ func Name(n string) func(*Manager) { } } +// Updatert sets the updatert of the manager. +// Used to speed up tests. +func Updatert(u time.Duration) func(*Manager) { + return func(m *Manager) { + m.mtx.Lock() + defer m.mtx.Unlock() + m.updatert = u + } +} + // HTTPClientOptions sets the list of HTTP client options to expose to // Discoverers. It is up to Discoverers to choose to use the options provided. func HTTPClientOptions(opts ...config.HTTPClientOption) func(*Manager) { diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go index d2e72ca33b..5c82decbe0 100644 --- a/notifier/notifier_test.go +++ b/notifier/notifier_test.go @@ -26,13 +26,17 @@ import ( "testing" "time" + "github.com/go-kit/log" "github.com/prometheus/alertmanager/api/v2/models" + "github.com/prometheus/client_golang/prometheus" config_util "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "go.uber.org/atomic" "gopkg.in/yaml.v2" + "github.com/prometheus/prometheus/discovery" + "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" @@ -811,3 +815,148 @@ func TestHangingNotifier(t *testing.T) { }) } } + +// TODO: renameit and even replace TestHangingNotifier with it. +// TestHangingNotifierXXX ensures that the notifier takes into account SD changes even when there are +// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676. +func TestHangingNotifierXXX(t *testing.T) { + const ( + batches = 100 + alertsCount = maxBatchSize * batches + ) + + var ( + sendTimeout = 10 * time.Millisecond + sdUpdatert = sendTimeout / 2 + + done = make(chan struct{}) + ) + + defer func() { + close(done) + }() + + // Set up a faulty Alertmanager. + var faultyCalled atomic.Bool + faultyServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + faultyCalled.Store(true) + select { + case <-done: + case <-time.After(time.Hour): + } + })) + faultyURL, err := url.Parse(faultyServer.URL) + require.NoError(t, err) + + // Set up a functional Alertmanager. + var functionalCalled atomic.Bool + functionalServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + functionalCalled.Store(true) + })) + functionalURL, err := url.Parse(functionalServer.URL) + require.NoError(t, err) + + // Initialize the discovery manager + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + reg := prometheus.NewRegistry() + sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg)) + require.NoError(t, err) + sdManager := discovery.NewManager( + ctx, + log.NewNopLogger(), + reg, + sdMetrics, + discovery.Name("sd-manager"), + discovery.Updatert(sdUpdatert), + ) + go sdManager.Run() + + // Set up the notifier with both faulty and functional Alertmanagers. 
+	notifier := NewManager(
+		&Options{
+			QueueCapacity: alertsCount,
+		},
+		nil,
+	)
+	notifier.alertmanagers = make(map[string]*alertmanagerSet)
+	amCfg := config.DefaultAlertmanagerConfig
+	amCfg.Timeout = model.Duration(sendTimeout)
+	notifier.alertmanagers["config-0"] = &alertmanagerSet{
+		ams: []alertmanager{
+			alertmanagerMock{
+				urlf: func() string { return faultyURL.String() },
+			},
+			alertmanagerMock{
+				urlf: func() string { return functionalURL.String() },
+			},
+		},
+		cfg:     &amCfg,
+		metrics: notifier.metrics,
+	}
+	go notifier.Run(sdManager.SyncCh())
+	defer notifier.Stop()
+
+	require.Len(t, notifier.Alertmanagers(), 2)
+
+	// Enqueue the alerts.
+	var alerts []*Alert
+	for i := range make([]struct{}, alertsCount) {
+		alerts = append(alerts, &Alert{
+			Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
+		})
+	}
+	notifier.Send(alerts...)
+
+	// Wait for the Alertmanagers to start receiving alerts.
+	// 10*sdUpdatert is used as an arbitrary timeout here.
+	timeout := time.After(10 * sdUpdatert)
+loop1:
+	for {
+		select {
+		case <-timeout:
+			t.Fatalf("Timeout waiting for the alertmanagers to be reached for the first time.")
+		default:
+			if faultyCalled.Load() && functionalCalled.Load() {
+				break loop1
+			}
+		}
+	}
+
+	// Request to remove the faulty Alertmanager.
+	c := map[string]discovery.Configs{
+		"config-0": {
+			discovery.StaticConfig{
+				&targetgroup.Group{
+					Targets: []model.LabelSet{
+						{
+							model.AddressLabel: model.LabelValue(functionalURL.Host),
+						},
+					},
+				},
+			},
+		},
+	}
+	require.NoError(t, sdManager.ApplyConfig(c))
+
+	// The notifier should not wait until the alerts queue is empty to apply the discovery changes.
+	// A faulty Alertmanager could cause each alert sending cycle to take up to AlertmanagerConfig.Timeout.
+	// The queue may never be emptied, as the arrival rate could be larger than the departure rate.
+	// It could even overflow and alerts could be dropped.
+	timeout = time.After(batches * sendTimeout)
+loop2:
+	for {
+		select {
+		case <-timeout:
+			t.Fatalf("Timeout, the faulty alertmanager was not removed in time.")
+		default:
+			// The faulty alertmanager was dropped.
+			if len(notifier.Alertmanagers()) == 1 {
+				// Guard against TOCTOU.
+				require.Positive(t, notifier.queueLen())
+				break loop2
+			}
+			require.Positive(t, notifier.queueLen(), "The faulty alertmanager wasn't dropped before the alerts queue was emptied.")
+		}
+	}
+}
From 690de487e2fb0be3e3a41121bbe2b9ac3e7a844c Mon Sep 17 00:00:00 2001
From: machine424
Date: Mon, 3 Jun 2024 18:09:51 +0200
Subject: [PATCH 25/26] chore(notifier): Split 'Run()' into two goroutines:
 one to receive target updates and trigger reloads, and the other one to
 send notifications.

This is done to prevent the latter operation from blocking/starving the
former: previously, the `tsets` channel was consumed by the same goroutine
that consumes and feeds the buffered `n.more` channel, so the unbuffered
`tsets` channel, which is only fed every `SDManager.updatert` seconds, was
less likely to be ready. See
https://github.com/prometheus/prometheus/issues/13676 and
https://github.com/prometheus/prometheus/issues/8768

The synchronization with the sendLoop goroutine is managed through the
n.mtx mutex. This uses a similar approach to the scrape manager's:
https://github.com/prometheus/prometheus/blob/efbd6e41c59ec8d6b7a0791c1fb337fdac53b4f2/scrape/manager.go#L115-L117

The old TestHangingNotifier was replaced by the new one to more closely
reflect reality.
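Schematically, the new structure looks like this (a simplified,
self-contained sketch, not the actual diff below; the manager type and its
fields are invented for illustration). A slow send cycle can no longer
delay a reload, because the two run in separate goroutines and only share
state under the mutex:

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    type manager struct {
        mtx     sync.Mutex
        targets string        // state shared between the two goroutines
        more    chan struct{} // signals queued notifications
    }

    // sendLoop drains the queue, holding the lock only long enough to
    // read the state it needs for one batch.
    func (m *manager) sendLoop() {
        for range m.more {
            m.mtx.Lock()
            t := m.targets
            m.mtx.Unlock()
            _ = t
            time.Sleep(50 * time.Millisecond) // one slow send cycle
        }
    }

    // run stays responsive to updates because sending happens elsewhere.
    func (m *manager) run(tsets <-chan string) {
        go m.sendLoop()
        for ts := range tsets {
            m.mtx.Lock()
            m.targets = ts // the reload no longer waits on a send cycle
            m.mtx.Unlock()
            fmt.Println("reloaded:", ts)
        }
    }

    func main() {
        m := &manager{more: make(chan struct{}, 16)}
        for i := 0; i < 16; i++ {
            m.more <- struct{}{}
        }
        tsets := make(chan string, 1)
        tsets <- "new target groups"
        close(tsets)
        m.run(tsets) // the reload is applied despite the full queue
    }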
Signed-off-by: machine424 --- notifier/notifier.go | 35 ++++++----- notifier/notifier_test.go | 123 ++------------------------------------ 2 files changed, 25 insertions(+), 133 deletions(-) diff --git a/notifier/notifier.go b/notifier/notifier.go index 4cf376aa05..a375a0749c 100644 --- a/notifier/notifier.go +++ b/notifier/notifier.go @@ -298,25 +298,14 @@ func (n *Manager) nextBatch() []*Alert { return alerts } -// Run dispatches notifications continuously. -func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) { +// sendLoop continuously consumes the notifications queue and sends alerts to +// the configured Alertmanagers. +func (n *Manager) sendLoop() { for { - // The select is split in two parts, such as we will first try to read - // new alertmanager targets if they are available, before sending new - // alerts. select { case <-n.ctx.Done(): return - case ts := <-tsets: - n.reload(ts) - default: - select { - case <-n.ctx.Done(): - return - case ts := <-tsets: - n.reload(ts) - case <-n.more: - } + case <-n.more: } alerts := n.nextBatch() @@ -330,6 +319,21 @@ func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) { } } +// Run receives updates of target groups and triggers a reload. +// The dispatching of notifications occurs in the background to prevent blocking the receipt of target updates. +// Refer to https://github.com/prometheus/prometheus/issues/13676 for more details. +func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) { + go n.sendLoop() + for { + select { + case <-n.ctx.Done(): + return + case ts := <-tsets: + n.reload(ts) + } + } +} + func (n *Manager) reload(tgs map[string][]*targetgroup.Group) { n.mtx.Lock() defer n.mtx.Unlock() @@ -483,6 +487,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool { ams.mtx.RLock() + if len(ams.cfg.AlertRelabelConfigs) > 0 { amAlerts = relabelAlerts(ams.cfg.AlertRelabelConfigs, labels.Labels{}, alerts) if len(amAlerts) == 0 { diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go index 5c82decbe0..03290a58ca 100644 --- a/notifier/notifier_test.go +++ b/notifier/notifier_test.go @@ -701,125 +701,10 @@ func TestLabelsToOpenAPILabelSet(t *testing.T) { require.Equal(t, models.LabelSet{"aaa": "111", "bbb": "222"}, labelsToOpenAPILabelSet(labels.FromStrings("aaa", "111", "bbb", "222"))) } -// TestHangingNotifier validates that targets updates happen even when there are -// queued alerts. -func TestHangingNotifier(t *testing.T) { - // Note: When targets are not updated in time, this test is flaky because go - // selects are not deterministic. Therefore we run 10 subtests to run into the issue. - for i := 0; i < 10; i++ { - t.Run(strconv.Itoa(i), func(t *testing.T) { - var ( - done = make(chan struct{}) - changed = make(chan struct{}) - syncCh = make(chan map[string][]*targetgroup.Group) - ) - - defer func() { - close(done) - }() - - var calledOnce bool - // Setting up a bad server. This server hangs for 2 seconds. - badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if calledOnce { - t.Fatal("hanging server called multiple times") - } - calledOnce = true - select { - case <-done: - case <-time.After(2 * time.Second): - } - })) - badURL, err := url.Parse(badServer.URL) - require.NoError(t, err) - badAddress := badURL.Host // Used for __name__ label in targets. - - // Setting up a bad server. This server returns fast, signaling requests on - // by closing the changed channel. 
-			goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				close(changed)
-			}))
-			goodURL, err := url.Parse(goodServer.URL)
-			require.NoError(t, err)
-			goodAddress := goodURL.Host // Used for __name__ label in targets.
-
-			h := NewManager(
-				&Options{
-					QueueCapacity: 20 * maxBatchSize,
-				},
-				nil,
-			)
-
-			h.alertmanagers = make(map[string]*alertmanagerSet)
-
-			am1Cfg := config.DefaultAlertmanagerConfig
-			am1Cfg.Timeout = model.Duration(200 * time.Millisecond)
-
-			h.alertmanagers["config-0"] = &alertmanagerSet{
-				ams:     []alertmanager{},
-				cfg:     &am1Cfg,
-				metrics: h.metrics,
-			}
-			go h.Run(syncCh)
-			defer h.Stop()
-
-			var alerts []*Alert
-			for i := range make([]struct{}, 20*maxBatchSize) {
-				alerts = append(alerts, &Alert{
-					Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
-				})
-			}
-
-			// Injecting the hanging server URL.
-			syncCh <- map[string][]*targetgroup.Group{
-				"config-0": {
-					{
-						Targets: []model.LabelSet{
-							{
-								model.AddressLabel: model.LabelValue(badAddress),
-							},
-						},
-					},
-				},
-			}
-
-			// Queing alerts.
-			h.Send(alerts...)
-
-			// Updating with a working alertmanager target.
-			go func() {
-				select {
-				case syncCh <- map[string][]*targetgroup.Group{
-					"config-0": {
-						{
-							Targets: []model.LabelSet{
-								{
-									model.AddressLabel: model.LabelValue(goodAddress),
-								},
-							},
-						},
-					},
-				}:
-				case <-done:
-				}
-			}()
-
-			select {
-			case <-time.After(1 * time.Second):
-				t.Fatalf("Timeout after 1 second, targets not synced in time.")
-			case <-changed:
-				// The good server has been hit in less than 3 seconds, therefore
-				// targets have been updated before a second call could be made to the
-				// bad server.
-			}
-		})
-	}
-}
-
-// TODO: renameit and even replace TestHangingNotifier with it.
-// TestHangingNotifierXXX ensures that the notifier takes into account SD changes even when there are
+// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
 // queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
+// and https://github.com/prometheus/prometheus/issues/8768.
-func TestHangingNotifierXXX(t *testing.T) {
+func TestHangingNotifier(t *testing.T) {
 	const (
 		batches     = 100
 		alertsCount = maxBatchSize * batches
@@ -857,6 +742,8 @@ func TestHangingNotifierXXX(t *testing.T) {
 	require.NoError(t, err)

 	// Initialize the discovery manager
+	// This is relevant as the updates aren't sent continually in real life, but only once every updatert.
+	// The old implementation of TestHangingNotifier didn't take that into account.
ctx, cancel := context.WithCancel(context.Background()) defer cancel() reg := prometheus.NewRegistry() From 70beda092a47c161b0483eb93c97547acbbdf63b Mon Sep 17 00:00:00 2001 From: machine424 Date: Mon, 10 Jun 2024 21:26:36 +0200 Subject: [PATCH 26/26] fix(notifier): take alertmanagerSet.mtx before checking alertmanagerSet.ams in sendAll Signed-off-by: machine424 --- notifier/notifier.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notifier/notifier.go b/notifier/notifier.go index a375a0749c..eb83c45b07 100644 --- a/notifier/notifier.go +++ b/notifier/notifier.go @@ -475,10 +475,6 @@ func (n *Manager) sendAll(alerts ...*Alert) bool { numSuccess atomic.Uint64 ) for _, ams := range amSets { - if len(ams.ams) == 0 { - continue - } - var ( payload []byte err error @@ -487,6 +483,10 @@ func (n *Manager) sendAll(alerts ...*Alert) bool { ams.mtx.RLock() + if len(ams.ams) == 0 { + ams.mtx.RUnlock() + continue + } if len(ams.cfg.AlertRelabelConfigs) > 0 { amAlerts = relabelAlerts(ams.cfg.AlertRelabelConfigs, labels.Labels{}, alerts)
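For reference, the race this last patch closes can be reduced to the
following standalone sketch (simplified types and invented names; this is
not the real alertmanagerSet). reload() swaps the slice under the write
lock, so an emptiness check performed before taking the read lock is a
data race and can act on a stale view; moving the check under RLock(), as
the diff above does, removes that window:

    package main

    import (
        "fmt"
        "sync"
    )

    type amSet struct {
        mtx sync.RWMutex
        ams []string
    }

    // reload replaces the resolved Alertmanagers, as SD updates do.
    func (s *amSet) reload(ams []string) {
        s.mtx.Lock()
        defer s.mtx.Unlock()
        s.ams = ams
    }

    // sendAll reads the set; the emptiness check happens under the read
    // lock, so it cannot race with a concurrent reload.
    func (s *amSet) sendAll() {
        s.mtx.RLock()
        if len(s.ams) == 0 {
            s.mtx.RUnlock()
            return
        }
        targets := append([]string(nil), s.ams...) // copy while locked
        s.mtx.RUnlock()
        for _, t := range targets {
            fmt.Println("sending to", t) // no lock held during the send
        }
    }

    func main() {
        s := &amSet{}
        go s.reload([]string{"am-1:9093"}) // concurrent SD update
        s.sendAll()                        // safe either way
    }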