From 7f99d2930d46047a1b6e0f6ad1a0962d20a6e380 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Thu, 26 Sep 2024 11:07:15 +0100 Subject: [PATCH 01/15] [BUGFIX] PromQL: make sort_by_label stable Go's sorting functions can re-order equal elements, so the strategy of sorting by the fallback ordering first does not always work. Pulling the fallback into the main comparison function is more reliable and more efficient. Signed-off-by: Bryan Boreham --- CHANGELOG.md | 2 +- promql/functions.go | 34 ++++++++-------------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 64618d552e..c17eb8cf2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ * [ENHANCEMENT] Remote Read client: Enable streaming remote read if the server supports it. #11379 * [ENHANCEMENT] Remote-Write: Don't reshard if we haven't successfully sent a sample since last update. #14450 * [ENHANCEMENT] PromQL: Delay deletion of `__name__` label to the end of the query evaluation. This is **experimental** and enabled under the feature-flag `promql-delayed-name-removal`. #14477 -* [ENHANCEMENT] PromQL: Experimental `sort_by_label` and `sort_by_label_desc` sort by all labels when label is equal. #14655 +* [ENHANCEMENT] PromQL: Experimental `sort_by_label` and `sort_by_label_desc` sort by all labels when label is equal. #14655, #14985 * [ENHANCEMENT] PromQL: Clarify error message logged when Go runtime panic occurs during query evaluation. #14621 * [ENHANCEMENT] PromQL: Use Kahan summation for better accuracy in `avg` and `avg_over_time`. #14413 * [ENHANCEMENT] Tracing: Improve PromQL tracing, including showing the operation performed for aggregates, operators, and calls. #14816 diff --git a/promql/functions.go b/promql/functions.go index 182b69b080..04b6848b43 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -415,22 +415,12 @@ func funcSortDesc(vals []parser.Value, args parser.Expressions, enh *EvalNodeHel // === sort_by_label(vector parser.ValueTypeVector, label parser.ValueTypeString...) (Vector, Annotations) === func funcSortByLabel(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - // First, sort by the full label set. This ensures a consistent ordering in case sorting by the - // labels provided as arguments is not conclusive. + lbls := stringSliceFromArgs(args[1:]) slices.SortFunc(vals[0].(Vector), func(a, b Sample) int { - return labels.Compare(a.Metric, b.Metric) - }) - - labels := stringSliceFromArgs(args[1:]) - // Next, sort by the labels provided as arguments. - slices.SortFunc(vals[0].(Vector), func(a, b Sample) int { - // Iterate over each given label. - for _, label := range labels { + for _, label := range lbls { lv1 := a.Metric.Get(label) lv2 := b.Metric.Get(label) - // If we encounter multiple samples with the same label values, the sorting which was - // performed in the first step will act as a "tie breaker". if lv1 == lv2 { continue } @@ -442,7 +432,8 @@ func funcSortByLabel(vals []parser.Value, args parser.Expressions, enh *EvalNode return +1 } - return 0 + // If all labels provided as arguments were equal, sort by the full label set. This ensures a consistent ordering. + return labels.Compare(a.Metric, b.Metric) }) return vals[0].(Vector), nil @@ -450,22 +441,12 @@ func funcSortByLabel(vals []parser.Value, args parser.Expressions, enh *EvalNode // === sort_by_label_desc(vector parser.ValueTypeVector, label parser.ValueTypeString...) 
(Vector, Annotations) === func funcSortByLabelDesc(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - // First, sort by the full label set. This ensures a consistent ordering in case sorting by the - // labels provided as arguments is not conclusive. + lbls := stringSliceFromArgs(args[1:]) slices.SortFunc(vals[0].(Vector), func(a, b Sample) int { - return labels.Compare(b.Metric, a.Metric) - }) - - labels := stringSliceFromArgs(args[1:]) - // Next, sort by the labels provided as arguments. - slices.SortFunc(vals[0].(Vector), func(a, b Sample) int { - // Iterate over each given label. - for _, label := range labels { + for _, label := range lbls { lv1 := a.Metric.Get(label) lv2 := b.Metric.Get(label) - // If we encounter multiple samples with the same label values, the sorting which was - // performed in the first step will act as a "tie breaker". if lv1 == lv2 { continue } @@ -477,7 +458,8 @@ func funcSortByLabelDesc(vals []parser.Value, args parser.Expressions, enh *Eval return -1 } - return 0 + // If all labels provided as arguments were equal, sort by the full label set. This ensures a consistent ordering. + return -labels.Compare(a.Metric, b.Metric) }) return vals[0].(Vector), nil From 6b247c50d236c936e3499ccbb164749268537030 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Mon, 30 Sep 2024 17:09:01 -0400 Subject: [PATCH 02/15] Revert "Merge pull request #14769 from roidelapluie/autoreload" This reverts commit 50f5327f83de448354a5873b7934b7f6bb662ba1, reversing changes made to eb4004c344bf78b9e6f7e62a464a33db39a147cb. Signed-off-by: Bryan Boreham --- CHANGELOG.md | 1 - cmd/prometheus/main.go | 55 +------- config/reload.go | 92 ------------- config/reload_test.go | 222 -------------------------------- docs/command-line/prometheus.md | 3 +- docs/feature_flags.md | 13 -- 6 files changed, 2 insertions(+), 384 deletions(-) delete mode 100644 config/reload.go delete mode 100644 config/reload_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index c17eb8cf2e..88c27741b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,6 @@ ## 2.55.0-rc.0 / 2024-09-20 * [FEATURE] Support UTF-8 characters in label names - feature flag `utf8-names`. #14482, #14880, #14736, #14727 -* [FEATURE] Support config reload automatically - feature flag `auto-reload-config`. #14769 * [FEATURE] Scraping: Add the ability to set custom `http_headers` in config. #14817 * [FEATURE] Scraping: Support feature flag `created-timestamp-zero-ingestion` in OpenMetrics. #14356, #14815 * [FEATURE] Scraping: `scrape_failure_log_file` option to log failures to a file. 
#14734 diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index ee6cd0c2eb..b3bcb78b78 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -154,9 +154,6 @@ type flagConfig struct { RemoteFlushDeadline model.Duration nameEscapingScheme string - enableAutoReload bool - autoReloadInterval model.Duration - featureList []string memlimitRatio float64 // These options are extracted from featureList @@ -215,12 +212,6 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error { case "auto-gomaxprocs": c.enableAutoGOMAXPROCS = true level.Info(logger).Log("msg", "Automatically set GOMAXPROCS to match Linux container CPU quota") - case "auto-reload-config": - c.enableAutoReload = true - if s := time.Duration(c.autoReloadInterval).Seconds(); s > 0 && s < 1 { - c.autoReloadInterval, _ = model.ParseDuration("1s") - } - level.Info(logger).Log("msg", fmt.Sprintf("Enabled automatic configuration file reloading. Checking for configuration changes every %s.", c.autoReloadInterval)) case "auto-gomemlimit": c.enableAutoGOMEMLIMIT = true level.Info(logger).Log("msg", "Automatically set GOMEMLIMIT to match Linux container or system memory limit") @@ -311,9 +302,6 @@ func main() { a.Flag("config.file", "Prometheus configuration file path."). Default("prometheus.yml").StringVar(&cfg.configFile) - a.Flag("config.auto-reload-interval", "Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes."). - Default("30s").SetValue(&cfg.autoReloadInterval) - a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry. Can be repeated."). Default("0.0.0.0:9090").StringsVar(&cfg.web.ListenAddresses) @@ -504,7 +492,7 @@ func main() { a.Flag("scrape.name-escaping-scheme", `Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots".`).Default(scrape.DefaultNameEscapingScheme.String()).StringVar(&cfg.nameEscapingScheme) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). + a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). 
Default("").StringsVar(&cfg.featureList) promlogflag.AddFlags(a, &cfg.promlogConfig) @@ -1142,15 +1130,6 @@ func main() { hup := make(chan os.Signal, 1) signal.Notify(hup, syscall.SIGHUP) cancel := make(chan struct{}) - - var checksum string - if cfg.enableAutoReload { - checksum, err = config.GenerateChecksum(cfg.configFile) - if err != nil { - level.Error(logger).Log("msg", "Failed to generate initial checksum for configuration file", "err", err) - } - } - g.Add( func() error { <-reloadReady.C @@ -1160,12 +1139,6 @@ func main() { case <-hup: if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { level.Error(logger).Log("msg", "Error reloading config", "err", err) - } else if cfg.enableAutoReload { - if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { - checksum = currentChecksum - } else { - level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) - } } case rc := <-webHandler.Reload(): if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { @@ -1173,32 +1146,6 @@ func main() { rc <- err } else { rc <- nil - if cfg.enableAutoReload { - if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { - checksum = currentChecksum - } else { - level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) - } - } - } - case <-time.Tick(time.Duration(cfg.autoReloadInterval)): - if !cfg.enableAutoReload { - continue - } - currentChecksum, err := config.GenerateChecksum(cfg.configFile) - if err != nil { - level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) - continue - } - if currentChecksum == checksum { - continue - } - level.Info(logger).Log("msg", "Configuration file change detected, reloading the configuration.") - - if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { - level.Error(logger).Log("msg", "Error reloading config", "err", err) - } else { - checksum = currentChecksum } case <-cancel: return nil diff --git a/config/reload.go b/config/reload.go deleted file mode 100644 index 8be1b28d8a..0000000000 --- a/config/reload.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2024 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package config - -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - "os" - "path/filepath" - - "gopkg.in/yaml.v2" -) - -type ExternalFilesConfig struct { - RuleFiles []string `yaml:"rule_files"` - ScrapeConfigFiles []string `yaml:"scrape_config_files"` -} - -// GenerateChecksum generates a checksum of the YAML file and the files it references. 
-func GenerateChecksum(yamlFilePath string) (string, error) { - hash := sha256.New() - - yamlContent, err := os.ReadFile(yamlFilePath) - if err != nil { - return "", fmt.Errorf("error reading YAML file: %w", err) - } - _, err = hash.Write(yamlContent) - if err != nil { - return "", fmt.Errorf("error writing YAML file to hash: %w", err) - } - - var config ExternalFilesConfig - if err := yaml.Unmarshal(yamlContent, &config); err != nil { - return "", fmt.Errorf("error unmarshalling YAML: %w", err) - } - - dir := filepath.Dir(yamlFilePath) - - for i, file := range config.RuleFiles { - config.RuleFiles[i] = filepath.Join(dir, file) - } - for i, file := range config.ScrapeConfigFiles { - config.ScrapeConfigFiles[i] = filepath.Join(dir, file) - } - - files := map[string][]string{ - "r": config.RuleFiles, // "r" for rule files - "s": config.ScrapeConfigFiles, // "s" for scrape config files - } - - for _, prefix := range []string{"r", "s"} { - for _, pattern := range files[prefix] { - matchingFiles, err := filepath.Glob(pattern) - if err != nil { - return "", fmt.Errorf("error finding files with pattern %q: %w", pattern, err) - } - - for _, file := range matchingFiles { - // Write prefix to the hash ("r" or "s") followed by \0, then - // the file path. - _, err = hash.Write([]byte(prefix + "\x00" + file + "\x00")) - if err != nil { - return "", fmt.Errorf("error writing %q path to hash: %w", file, err) - } - - // Read and hash the content of the file. - content, err := os.ReadFile(file) - if err != nil { - return "", fmt.Errorf("error reading file %s: %w", file, err) - } - _, err = hash.Write(append(content, []byte("\x00")...)) - if err != nil { - return "", fmt.Errorf("error writing %q content to hash: %w", file, err) - } - } - } - } - - return hex.EncodeToString(hash.Sum(nil)), nil -} diff --git a/config/reload_test.go b/config/reload_test.go deleted file mode 100644 index f0f44f3588..0000000000 --- a/config/reload_test.go +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2024 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package config - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestGenerateChecksum(t *testing.T) { - tmpDir := t.TempDir() - - // Define paths for the temporary files. - yamlFilePath := filepath.Join(tmpDir, "test.yml") - ruleFilePath := filepath.Join(tmpDir, "rule_file.yml") - scrapeConfigFilePath := filepath.Join(tmpDir, "scrape_config.yml") - - // Define initial and modified content for the files. - originalRuleContent := "groups:\n- name: example\n rules:\n - alert: ExampleAlert" - modifiedRuleContent := "groups:\n- name: example\n rules:\n - alert: ModifiedAlert" - - originalScrapeConfigContent := "scrape_configs:\n- job_name: example" - modifiedScrapeConfigContent := "scrape_configs:\n- job_name: modified_example" - - // Define YAML content referencing the rule and scrape config files. 
- yamlContent := ` -rule_files: - - rule_file.yml -scrape_config_files: - - scrape_config.yml -` - - // Write initial content to files. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) - require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) - - // Generate the original checksum. - originalChecksum := calculateChecksum(t, yamlFilePath) - - t.Run("Rule File Change", func(t *testing.T) { - // Modify the rule file. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(modifiedRuleContent), 0o644)) - - // Checksum should change. - modifiedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, modifiedChecksum) - - // Revert the rule file. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Scrape Config Change", func(t *testing.T) { - // Modify the scrape config file. - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(modifiedScrapeConfigContent), 0o644)) - - // Checksum should change. - modifiedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, modifiedChecksum) - - // Revert the scrape config file. - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Rule File Deletion", func(t *testing.T) { - // Delete the rule file. - require.NoError(t, os.Remove(ruleFilePath)) - - // Checksum should change. - deletedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, deletedChecksum) - - // Restore the rule file. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Scrape Config Deletion", func(t *testing.T) { - // Delete the scrape config file. - require.NoError(t, os.Remove(scrapeConfigFilePath)) - - // Checksum should change. - deletedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, deletedChecksum) - - // Restore the scrape config file. - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Main File Change", func(t *testing.T) { - // Modify the main YAML file. - modifiedYamlContent := ` -global: - scrape_interval: 3s -rule_files: - - rule_file.yml -scrape_config_files: - - scrape_config.yml -` - require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) - - // Checksum should change. - modifiedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, modifiedChecksum) - - // Revert the main YAML file. - require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) - - // Checksum should return to the original. 
- revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Rule File Removed from YAML Config", func(t *testing.T) { - // Modify the YAML content to remove the rule file. - modifiedYamlContent := ` -scrape_config_files: - - scrape_config.yml -` - require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) - - // Checksum should change. - modifiedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, modifiedChecksum) - - // Revert the YAML content. - require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Scrape Config Removed from YAML Config", func(t *testing.T) { - // Modify the YAML content to remove the scrape config file. - modifiedYamlContent := ` -rule_files: - - rule_file.yml -` - require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) - - // Checksum should change. - modifiedChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, modifiedChecksum) - - // Revert the YAML content. - require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Empty Rule File", func(t *testing.T) { - // Write an empty rule file. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(""), 0o644)) - - // Checksum should change. - emptyChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, emptyChecksum) - - // Restore the rule file. - require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) - - t.Run("Empty Scrape Config File", func(t *testing.T) { - // Write an empty scrape config file. - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(""), 0o644)) - - // Checksum should change. - emptyChecksum := calculateChecksum(t, yamlFilePath) - require.NotEqual(t, originalChecksum, emptyChecksum) - - // Restore the scrape config file. - require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) - - // Checksum should return to the original. - revertedChecksum := calculateChecksum(t, yamlFilePath) - require.Equal(t, originalChecksum, revertedChecksum) - }) -} - -// calculateChecksum generates a checksum for the given YAML file path. -func calculateChecksum(t *testing.T, yamlFilePath string) string { - checksum, err := GenerateChecksum(yamlFilePath) - require.NoError(t, err) - require.NotEmpty(t, checksum) - return checksum -} diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 8fefa8ecc9..7d9e5a3c80 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -15,7 +15,6 @@ The Prometheus monitoring server | -h, --help | Show context-sensitive help (also try --help-long and --help-man). | | | --version | Show application version. | | | --config.file | Prometheus configuration file path. 
| `prometheus.yml` | -| --config.auto-reload-interval | Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes. | `30s` | | --web.listen-address ... | Address to listen on for UI, API, and telemetry. Can be repeated. | `0.0.0.0:9090` | | --auto-gomemlimit.ratio | The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory | `0.9` | | --web.config.file | [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication. | | @@ -58,7 +57,7 @@ The Prometheus monitoring server | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | | --scrape.name-escaping-scheme | Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots". | `values` | -| --enable-feature ... | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 51c2a9b314..7b07a04d0e 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -265,16 +265,3 @@ This allows optionally preserving the `__name__` label via the `label_replace` a When enabled, changes the metric and label name validation scheme inside Prometheus to allow the full UTF-8 character set. By itself, this flag does not enable the request of UTF-8 names via content negotiation. Users will also have to set `metric_name_validation_scheme` in scrape configs to enable the feature either on the global config or on a per-scrape config basis. - -## Auto Reload Config - -`--enable-feature=auto-reload-config` - -When enabled, Prometheus will automatically reload its configuration file at a -specified interval. The interval is defined by the -`--config.auto-reload-interval` flag, which defaults to `30s`. 
-
-Configuration reloads are triggered by detecting changes in the checksum of the
-main configuration file or any referenced files, such as rule and scrape
-configurations. To ensure consistency and avoid issues during reloads, it's
-recommended to update these files atomically.

From 105c692f77db299c0a64f293460643e35b22d75b Mon Sep 17 00:00:00 2001
From: Bryan Boreham
Date: Mon, 7 Oct 2024 13:50:01 +0100
Subject: [PATCH 03/15] [BUGFIX] TSDB: Don't read in-order chunks from before head MinTime

Because we are reimplementing the `IndexReader` to fetch in-order and
out-of-order chunks together, we must reproduce the behaviour of
`Head.indexRange()`, which floors the minimum time queried at
`head.MinTime()`.

Signed-off-by: Bryan Boreham
---
 tsdb/db.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tsdb/db.go b/tsdb/db.go
index 2d2759b61c..a339414c7b 100644
--- a/tsdb/db.go
+++ b/tsdb/db.go
@@ -2043,7 +2043,7 @@ func (db *DB) Querier(mint, maxt int64) (_ storage.Querier, err error) {
 	overlapsOOO := overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime())

 	var headQuerier storage.Querier
-	inoMint := mint
+	inoMint := max(db.head.MinTime(), mint)
 	if maxt >= db.head.MinTime() || overlapsOOO {
 		rh := NewRangeHead(db.head, mint, maxt)
 		var err error
@@ -2121,7 +2121,7 @@ func (db *DB) blockChunkQuerierForRange(mint, maxt int64) (_ []storage.ChunkQuer
 	overlapsOOO := overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime())

 	var headQuerier storage.ChunkQuerier
-	inoMint := mint
+	inoMint := max(db.head.MinTime(), mint)
 	if maxt >= db.head.MinTime() || overlapsOOO {
 		rh := NewRangeHead(db.head, mint, maxt)
 		headQuerier, err = db.blockChunkQuerierFunc(rh, mint, maxt)

From f7b396a1dc6c417ee5dae6457d58b114ecb65a64 Mon Sep 17 00:00:00 2001
From: Arve Knudsen
Date: Tue, 24 Sep 2024 12:03:56 +0200
Subject: [PATCH 04/15] promql.Engine: Refactor vector selector evaluation into a method (#14900)

New method is named `evalVectorSelector`.

---------

Signed-off-by: Arve Knudsen
---
 promql/engine.go | 117 +++++++++++++++++++++++++++--------------------
 1 file changed, 67 insertions(+), 50 deletions(-)

diff --git a/promql/engine.go b/promql/engine.go
index b583e12d57..86e76bb70d 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -998,6 +998,8 @@ func extractGroupsFromPath(p []parser.Node) (bool, []string) {
 	return false, nil
 }

+// checkAndExpandSeriesSet expands expr's UnexpandedSeriesSet into expr's Series.
+// If the Series field is already non-nil, it's a no-op.
 func checkAndExpandSeriesSet(ctx context.Context, expr parser.Expr) (annotations.Annotations, error) {
 	switch e := expr.(type) {
 	case *parser.MatrixSelector:
@@ -1449,6 +1451,70 @@ func (ev *evaluator) rangeEvalAgg(ctx context.Context, aggExpr *parser.Aggregate
 	return result, warnings
 }

+// evalVectorSelector generates a Matrix between ev.startTimestamp and ev.endTimestamp (inclusive), each point spaced ev.interval apart, from vs.
+// vs.Series has to be expanded before calling this method.
+// For every series iterator in vs.Series, the method iterates in ev.interval-sized steps from ev.startTimestamp up to and including ev.endTimestamp,
+// collecting every corresponding sample (obtained via ev.vectorSelectorSingle) into a Series.
+// All of the generated Series are collected into a Matrix, which is returned.
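+// While gathering, samples are counted against ev.maxSamples (a histogram point counts as its full size), and the query fails with ErrTooManySamples once the limit is exceeded.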
+func (ev *evaluator) evalVectorSelector(ctx context.Context, vs *parser.VectorSelector) Matrix { + numSteps := int((ev.endTimestamp-ev.startTimestamp)/ev.interval) + 1 + + mat := make(Matrix, 0, len(vs.Series)) + var prevSS *Series + it := storage.NewMemoizedEmptyIterator(durationMilliseconds(ev.lookbackDelta)) + var chkIter chunkenc.Iterator + for _, s := range vs.Series { + if err := contextDone(ctx, "expression evaluation"); err != nil { + ev.error(err) + } + + chkIter = s.Iterator(chkIter) + it.Reset(chkIter) + ss := Series{ + Metric: s.Labels(), + } + + for ts, step := ev.startTimestamp, -1; ts <= ev.endTimestamp; ts += ev.interval { + step++ + _, f, h, ok := ev.vectorSelectorSingle(it, vs, ts) + if !ok { + continue + } + + if h == nil { + ev.currentSamples++ + ev.samplesStats.IncrementSamplesAtStep(step, 1) + if ev.currentSamples > ev.maxSamples { + ev.error(ErrTooManySamples(env)) + } + if ss.Floats == nil { + ss.Floats = reuseOrGetFPointSlices(prevSS, numSteps) + } + ss.Floats = append(ss.Floats, FPoint{F: f, T: ts}) + } else { + point := HPoint{H: h, T: ts} + histSize := point.size() + ev.currentSamples += histSize + ev.samplesStats.IncrementSamplesAtStep(step, int64(histSize)) + if ev.currentSamples > ev.maxSamples { + ev.error(ErrTooManySamples(env)) + } + if ss.Histograms == nil { + ss.Histograms = reuseOrGetHPointSlices(prevSS, numSteps) + } + ss.Histograms = append(ss.Histograms, point) + } + } + + if len(ss.Floats)+len(ss.Histograms) > 0 { + mat = append(mat, ss) + prevSS = &mat[len(mat)-1] + } + } + ev.samplesStats.UpdatePeak(ev.currentSamples) + return mat +} + // evalSubquery evaluates given SubqueryExpr and returns an equivalent // evaluated MatrixSelector in its place. Note that the Name and LabelMatchers are not set. func (ev *evaluator) evalSubquery(ctx context.Context, subq *parser.SubqueryExpr) (*parser.MatrixSelector, int, annotations.Annotations) { @@ -1887,56 +1953,7 @@ func (ev *evaluator) eval(ctx context.Context, expr parser.Expr) (parser.Value, if err != nil { ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), ws}) } - mat := make(Matrix, 0, len(e.Series)) - var prevSS *Series - it := storage.NewMemoizedEmptyIterator(durationMilliseconds(ev.lookbackDelta)) - var chkIter chunkenc.Iterator - for i, s := range e.Series { - if err := contextDone(ctx, "expression evaluation"); err != nil { - ev.error(err) - } - chkIter = s.Iterator(chkIter) - it.Reset(chkIter) - ss := Series{ - Metric: e.Series[i].Labels(), - } - - for ts, step := ev.startTimestamp, -1; ts <= ev.endTimestamp; ts += ev.interval { - step++ - _, f, h, ok := ev.vectorSelectorSingle(it, e, ts) - if ok { - if h == nil { - ev.currentSamples++ - ev.samplesStats.IncrementSamplesAtStep(step, 1) - if ev.currentSamples > ev.maxSamples { - ev.error(ErrTooManySamples(env)) - } - if ss.Floats == nil { - ss.Floats = reuseOrGetFPointSlices(prevSS, numSteps) - } - ss.Floats = append(ss.Floats, FPoint{F: f, T: ts}) - } else { - point := HPoint{H: h, T: ts} - histSize := point.size() - ev.currentSamples += histSize - ev.samplesStats.IncrementSamplesAtStep(step, int64(histSize)) - if ev.currentSamples > ev.maxSamples { - ev.error(ErrTooManySamples(env)) - } - if ss.Histograms == nil { - ss.Histograms = reuseOrGetHPointSlices(prevSS, numSteps) - } - ss.Histograms = append(ss.Histograms, point) - } - } - } - - if len(ss.Floats)+len(ss.Histograms) > 0 { - mat = append(mat, ss) - prevSS = &mat[len(mat)-1] - } - } - ev.samplesStats.UpdatePeak(ev.currentSamples) + mat := ev.evalVectorSelector(ctx, e) return 
mat, ws case *parser.MatrixSelector: From 90cc7e572327b194af39abcb1b9456f66e8eeae5 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 15 Oct 2024 16:37:58 +0200 Subject: [PATCH 05/15] Upgrade github.com/googleapis/enterprise-certificate-proxy to v0.3.4 Signed-off-by: Arve Knudsen --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 4a2dd1c779..0631611234 100644 --- a/go.mod +++ b/go.mod @@ -140,7 +140,7 @@ require ( github.com/google/go-querystring v1.1.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/s2a-go v0.1.8 // indirect - github.com/googleapis/enterprise-certificate-proxy v0.3.3 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go/v2 v2.13.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect diff --git a/go.sum b/go.sum index 4fc4f93bd8..0246a377d2 100644 --- a/go.sum +++ b/go.sum @@ -328,8 +328,8 @@ github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/enterprise-certificate-proxy v0.3.3 h1:QRje2j5GZimBzlbhGA2V2QlGNgL8G6e+wGo/+/2bWI0= -github.com/googleapis/enterprise-certificate-proxy v0.3.3/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= +github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= +github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s= From b1c356beea0879d8f89250fe63a55b66612ec126 Mon Sep 17 00:00:00 2001 From: machine424 Date: Tue, 15 Oct 2024 20:29:07 +0200 Subject: [PATCH 06/15] fix(discovery): Handle cache.DeletedFinalStateUnknown in node informers' DeleteFunc Signed-off-by: machine424 --- discovery/kubernetes/endpoints.go | 7 +++++-- discovery/kubernetes/endpointslice.go | 7 +++++-- discovery/kubernetes/kubernetes.go | 10 ++++++++++ discovery/kubernetes/kubernetes_test.go | 17 +++++++++++++++++ discovery/kubernetes/node.go | 2 +- discovery/kubernetes/pod.go | 7 +++++-- 6 files changed, 43 insertions(+), 7 deletions(-) diff --git a/discovery/kubernetes/endpoints.go b/discovery/kubernetes/endpoints.go index 75da67f1c6..5ba9df6276 100644 --- a/discovery/kubernetes/endpoints.go +++ b/discovery/kubernetes/endpoints.go @@ -167,8 +167,11 @@ func NewEndpoints(l *slog.Logger, eps cache.SharedIndexInformer, svc, pod, node e.enqueueNode(node.Name) }, DeleteFunc: func(o interface{}) { - node := o.(*apiv1.Node) - e.enqueueNode(node.Name) + nodeName, err := nodeName(o) + if err != nil { + l.Error("Error getting Node name", "err", err) + } + e.enqueueNode(nodeName) }, }) if err != nil { diff --git a/discovery/kubernetes/endpointslice.go b/discovery/kubernetes/endpointslice.go index efd1c72167..8f58ba3535 100644 --- a/discovery/kubernetes/endpointslice.go +++ b/discovery/kubernetes/endpointslice.go @@ -145,8 +145,11 @@ func NewEndpointSlice(l *slog.Logger, eps 
cache.SharedIndexInformer, svc, pod, n e.enqueueNode(node.Name) }, DeleteFunc: func(o interface{}) { - node := o.(*apiv1.Node) - e.enqueueNode(node.Name) + nodeName, err := nodeName(o) + if err != nil { + l.Error("Error getting Node name", "err", err) + } + e.enqueueNode(nodeName) }, }) if err != nil { diff --git a/discovery/kubernetes/kubernetes.go b/discovery/kubernetes/kubernetes.go index be1c77c205..64e8886cfd 100644 --- a/discovery/kubernetes/kubernetes.go +++ b/discovery/kubernetes/kubernetes.go @@ -804,3 +804,13 @@ func addObjectMetaLabels(labelSet model.LabelSet, objectMeta metav1.ObjectMeta, func namespacedName(namespace, name string) string { return namespace + "/" + name } + +// nodeName knows how to handle the cache.DeletedFinalStateUnknown tombstone. +// It assumes the MetaNamespaceKeyFunc keyFunc is used, which uses the node name as the tombstone key. +func nodeName(o interface{}) (string, error) { + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(o) + if err != nil { + return "", err + } + return key, nil +} diff --git a/discovery/kubernetes/kubernetes_test.go b/discovery/kubernetes/kubernetes_test.go index fbbd77c3c3..a14f2b3d1b 100644 --- a/discovery/kubernetes/kubernetes_test.go +++ b/discovery/kubernetes/kubernetes_test.go @@ -23,7 +23,9 @@ import ( prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/promslog" "github.com/stretchr/testify/require" + apiv1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/version" "k8s.io/apimachinery/pkg/watch" @@ -320,3 +322,18 @@ func TestFailuresCountMetric(t *testing.T) { }) } } + +func TestNodeName(t *testing.T) { + node := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + } + name, err := nodeName(node) + require.NoError(t, err) + require.Equal(t, "foo", name) + + name, err = nodeName(cache.DeletedFinalStateUnknown{Key: "bar"}) + require.NoError(t, err) + require.Equal(t, "bar", name) +} diff --git a/discovery/kubernetes/node.go b/discovery/kubernetes/node.go index eecb52ab50..0e0c5745f2 100644 --- a/discovery/kubernetes/node.go +++ b/discovery/kubernetes/node.go @@ -82,7 +82,7 @@ func NewNode(l *slog.Logger, inf cache.SharedInformer, eventCount *prometheus.Co } func (n *Node) enqueue(obj interface{}) { - key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + key, err := nodeName(obj) if err != nil { return } diff --git a/discovery/kubernetes/pod.go b/discovery/kubernetes/pod.go index 73568e51c8..8704a66239 100644 --- a/discovery/kubernetes/pod.go +++ b/discovery/kubernetes/pod.go @@ -95,8 +95,11 @@ func NewPod(l *slog.Logger, pods cache.SharedIndexInformer, nodes cache.SharedIn p.enqueuePodsForNode(node.Name) }, DeleteFunc: func(o interface{}) { - node := o.(*apiv1.Node) - p.enqueuePodsForNode(node.Name) + nodeName, err := nodeName(o) + if err != nil { + l.Error("Error getting Node name", "err", err) + } + p.enqueuePodsForNode(nodeName) }, }) if err != nil { From e05e97cdd7fc684442c2d8e03e6967641933de51 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 15 Oct 2024 17:41:20 +0200 Subject: [PATCH 07/15] evaluator.rangeEval: Split out gatherVector method Signed-off-by: Arve Knudsen --- promql/engine.go | 75 +++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index 86e76bb70d..ef316e088d 100644 --- a/promql/engine.go +++ 
b/promql/engine.go
@@ -1219,38 +1219,17 @@ func (ev *evaluator) rangeEval(ctx context.Context, prepSeries func(labels.Label
 		ev.currentSamples = tempNumSamples
 		// Gather input vectors for this timestamp.
 		for i := range exprs {
-			vectors[i] = vectors[i][:0]
-
+			var bh []EvalSeriesHelper
+			var sh []EvalSeriesHelper
 			if prepSeries != nil {
-				bufHelpers[i] = bufHelpers[i][:0]
-			}
-
-			for si, series := range matrixes[i] {
-				switch {
-				case len(series.Floats) > 0 && series.Floats[0].T == ts:
-					vectors[i] = append(vectors[i], Sample{Metric: series.Metric, F: series.Floats[0].F, T: ts, DropName: series.DropName})
-					// Move input vectors forward so we don't have to re-scan the same
-					// past points at the next step.
-					matrixes[i][si].Floats = series.Floats[1:]
-				case len(series.Histograms) > 0 && series.Histograms[0].T == ts:
-					vectors[i] = append(vectors[i], Sample{Metric: series.Metric, H: series.Histograms[0].H, T: ts, DropName: series.DropName})
-					matrixes[i][si].Histograms = series.Histograms[1:]
-				default:
-					continue
-				}
-				if prepSeries != nil {
-					bufHelpers[i] = append(bufHelpers[i], seriesHelpers[i][si])
-				}
-				// Don't add histogram size here because we only
-				// copy the pointer above, not the whole
-				// histogram.
-				ev.currentSamples++
-				if ev.currentSamples > ev.maxSamples {
-					ev.error(ErrTooManySamples(env))
-				}
+				bh = bufHelpers[i][:0]
+				sh = seriesHelpers[i]
 			}
+			vectors[i], bh = ev.gatherVector(ts, matrixes[i], vectors[i], bh, sh)
 			args[i] = vectors[i]
-			ev.samplesStats.UpdatePeak(ev.currentSamples)
+			if prepSeries != nil {
+				bufHelpers[i] = bh
+			}
 		}

 		// Make the function call.
@@ -3682,3 +3661,41 @@ func newHistogramStatsSeries(series storage.Series) *histogramStatsSeries {
 func (s histogramStatsSeries) Iterator(it chunkenc.Iterator) chunkenc.Iterator {
 	return NewHistogramStatsIterator(s.Series.Iterator(it))
 }
+
+// gatherVector gathers a Vector for ts from the series in input.
+// output is used as a buffer.
+// If bufHelpers and seriesHelpers are provided, seriesHelpers[i] is appended to bufHelpers for every input index i.
+// The gathered Vector and bufHelpers are returned.
+func (ev *evaluator) gatherVector(ts int64, input Matrix, output Vector, bufHelpers, seriesHelpers []EvalSeriesHelper) (Vector, []EvalSeriesHelper) {
+	output = output[:0]
+	for i, series := range input {
+		switch {
+		case len(series.Floats) > 0 && series.Floats[0].T == ts:
+			s := series.Floats[0]
+			output = append(output, Sample{Metric: series.Metric, F: s.F, T: ts, DropName: series.DropName})
+			// Move input vectors forward so we don't have to re-scan the same
+			// past points at the next step.
+			input[i].Floats = series.Floats[1:]
+		case len(series.Histograms) > 0 && series.Histograms[0].T == ts:
+			s := series.Histograms[0]
+			output = append(output, Sample{Metric: series.Metric, H: s.H, T: ts, DropName: series.DropName})
+			input[i].Histograms = series.Histograms[1:]
+		default:
+			continue
+		}
+		if len(seriesHelpers) > 0 {
+			bufHelpers = append(bufHelpers, seriesHelpers[i])
+		}
+
+		// Don't add histogram size here because we only
+		// copy the pointer above, not the whole
+		// histogram.
+		ev.currentSamples++
+		if ev.currentSamples > ev.maxSamples {
+			ev.error(ErrTooManySamples(env))
+		}
+	}
+	ev.samplesStats.UpdatePeak(ev.currentSamples)
+
+	return output, bufHelpers
+}

From efc43d0714210884de4e556fcf206930704d2b42 Mon Sep 17 00:00:00 2001
From: Bartlomiej Plotka
Date: Fri, 18 Oct 2024 09:32:15 +0200
Subject: [PATCH 08/15] s/scrape_classic_histograms/always_scrape_classic_histograms (3.0 breaking change) (#15178)

This is for readability, especially when converting to the nhcb option.

See discussion https://cloud-native.slack.com/archives/C077Z4V13AM/p1729155873397889

Signed-off-by: bwplotka
---
 config/config.go                    | 18 +++++++++---------
 docs/configuration/configuration.md |  4 ++--
 docs/feature_flags.md               |  2 +-
 scrape/scrape.go                    | 16 ++++++++--------
 scrape/scrape_test.go               |  6 +++---
 5 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/config/config.go b/config/config.go
index 3eb6898d5c..a88b0d32ff 100644
--- a/config/config.go
+++ b/config/config.go
@@ -163,13 +163,13 @@ var (
 	// DefaultScrapeConfig is the default scrape configuration.
 	DefaultScrapeConfig = ScrapeConfig{
 		// ScrapeTimeout, ScrapeInterval and ScrapeProtocols default to the configured globals.
-		ScrapeClassicHistograms: false,
-		MetricsPath:             "/metrics",
-		Scheme:                  "http",
-		HonorLabels:             false,
-		HonorTimestamps:         true,
-		HTTPClientConfig:        config.DefaultHTTPClientConfig,
-		EnableCompression:       true,
+		AlwaysScrapeClassicHistograms: false,
+		MetricsPath:                   "/metrics",
+		Scheme:                        "http",
+		HonorLabels:                   false,
+		HonorTimestamps:               true,
+		HTTPClientConfig:              config.DefaultHTTPClientConfig,
+		EnableCompression:             true,
 	}

 	// DefaultAlertmanagerConfig is the default alertmanager configuration.
@@ -631,8 +631,8 @@ type ScrapeConfig struct {
 	// Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
 	// OpenMetricsText1.0.0, PrometheusText0.0.4.
 	ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
-	// Whether to scrape a classic histogram that is also exposed as a native histogram.
-	ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"`
+	// Whether to scrape a classic histogram, even if it is also exposed as a native histogram.
+	AlwaysScrapeClassicHistograms bool `yaml:"always_scrape_classic_histograms,omitempty"`
 	// File to which scrape failures are logged.
 	ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
 	// The HTTP resource path on which to fetch metrics from targets.
diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md
index 6e0cf431cb..4a681c7973 100644
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@@ -215,9 +215,9 @@ job_name:
 # OpenMetricsText1.0.0, PrometheusText0.0.4.
 [ scrape_protocols: [, ...] | default = ]

-# Whether to scrape a classic histogram that is also exposed as a native
+# Whether to scrape a classic histogram, even if it is also exposed as a native
 # histogram (has no effect without --enable-feature=native-histograms).
-[ scrape_classic_histograms: | default = false ]
+[ always_scrape_classic_histograms: | default = false ]

 # The HTTP resource path on which to fetch metrics from targets.
 [ metrics_path: | default = /metrics ]
diff --git a/docs/feature_flags.md b/docs/feature_flags.md
index 65eb60eaf1..0d6e23972a 100644
--- a/docs/feature_flags.md
+++ b/docs/feature_flags.md
@@ -84,7 +84,7 @@ those classic histograms that do not come with a corresponding native
 histogram.
However, if a native histogram is present, Prometheus will ignore the corresponding classic histogram, with the notable exception of exemplars, which are always ingested. To keep the classic histograms as well, enable -`scrape_classic_histograms` in the scrape job. +`always_scrape_classic_histograms` in the scrape job. _Note about the format of `le` and `quantile` label values:_ diff --git a/scrape/scrape.go b/scrape/scrape.go index d40e0be2e2..4273f4cb64 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -112,7 +112,7 @@ type scrapeLoopOptions struct { trackTimestampsStaleness bool interval time.Duration timeout time.Duration - scrapeClassicHistograms bool + alwaysScrapeClassicHist bool validationScheme model.ValidationScheme mrc []*relabel.Config @@ -179,7 +179,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed opts.labelLimits, opts.interval, opts.timeout, - opts.scrapeClassicHistograms, + opts.alwaysScrapeClassicHist, options.EnableNativeHistogramsIngestion, options.EnableCreatedTimestampZeroIngestion, options.ExtraMetrics, @@ -480,7 +480,7 @@ func (sp *scrapePool) sync(targets []*Target) { enableCompression = sp.config.EnableCompression trackTimestampsStaleness = sp.config.TrackTimestampsStaleness mrc = sp.config.MetricRelabelConfigs - scrapeClassicHistograms = sp.config.ScrapeClassicHistograms + alwaysScrapeClassicHist = sp.config.AlwaysScrapeClassicHistograms ) validationScheme := model.UTF8Validation @@ -521,7 +521,7 @@ func (sp *scrapePool) sync(targets []*Target) { mrc: mrc, interval: interval, timeout: timeout, - scrapeClassicHistograms: scrapeClassicHistograms, + alwaysScrapeClassicHist: alwaysScrapeClassicHist, validationScheme: validationScheme, }) if err != nil { @@ -883,7 +883,7 @@ type scrapeLoop struct { labelLimits *labelLimits interval time.Duration timeout time.Duration - scrapeClassicHistograms bool + alwaysScrapeClassicHist bool validationScheme model.ValidationScheme // Feature flagged options. 
@@ -1183,7 +1183,7 @@ func newScrapeLoop(ctx context.Context, labelLimits *labelLimits, interval time.Duration, timeout time.Duration, - scrapeClassicHistograms bool, + alwaysScrapeClassicHist bool, enableNativeHistogramIngestion bool, enableCTZeroIngestion bool, reportExtraMetrics bool, @@ -1237,7 +1237,7 @@ func newScrapeLoop(ctx context.Context, labelLimits: labelLimits, interval: interval, timeout: timeout, - scrapeClassicHistograms: scrapeClassicHistograms, + alwaysScrapeClassicHist: alwaysScrapeClassicHist, enableNativeHistogramIngestion: enableNativeHistogramIngestion, enableCTZeroIngestion: enableCTZeroIngestion, reportExtraMetrics: reportExtraMetrics, @@ -1537,7 +1537,7 @@ type appendErrors struct { } func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ts time.Time) (total, added, seriesAdded int, err error) { - p, err := textparse.New(b, contentType, sl.scrapeClassicHistograms, sl.enableCTZeroIngestion, sl.symbolTable) + p, err := textparse.New(b, contentType, sl.alwaysScrapeClassicHist, sl.enableCTZeroIngestion, sl.symbolTable) if err != nil { sl.l.Debug( "Invalid content type on scrape, using prometheus parser as fallback.", diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index e0e094da54..f65d41a84a 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -1846,7 +1846,7 @@ func TestScrapeLoopAppendStalenessIfTrackTimestampStaleness(t *testing.T) { func TestScrapeLoopAppendExemplar(t *testing.T) { tests := []struct { title string - scrapeClassicHistograms bool + alwaysScrapeClassicHist bool enableNativeHistogramsIngestion bool scrapeText string contentType string @@ -2115,7 +2115,7 @@ metric: < > `, - scrapeClassicHistograms: true, + alwaysScrapeClassicHist: true, contentType: "application/vnd.google.protobuf", floats: []floatSample{ {metric: labels.FromStrings("__name__", "test_histogram_count"), t: 1234568, f: 175}, @@ -2177,7 +2177,7 @@ metric: < sl.reportSampleMutator = func(l labels.Labels) labels.Labels { return mutateReportSampleLabels(l, discoveryLabels) } - sl.scrapeClassicHistograms = test.scrapeClassicHistograms + sl.alwaysScrapeClassicHist = test.alwaysScrapeClassicHist now := time.Now() From e6a682f046f946ef73c17dc2840c1906139fda90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 18 Oct 2024 08:54:37 +0200 Subject: [PATCH 09/15] Reproduce populateWithDelChunkSeriesIterator corrupting chunk meta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When handling recoded histogram chunks the min time of the chunk is updated by mistake. It should only update when the chunk is completely new. Signed-off-by: György Krajcsovits --- tsdb/db_test.go | 64 +++++++++++++++++++++++++++++++++----- tsdb/head_test.go | 2 +- tsdb/ooo_head_read_test.go | 4 +-- tsdb/testutil.go | 33 ++++++++++++++++++-- 4 files changed, 91 insertions(+), 12 deletions(-) diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 08417e889e..3f0fc0c841 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -4757,7 +4757,7 @@ func TestMultipleEncodingsCommitOrder(t *testing.T) { seriesSet := query(t, querier, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar1")) require.Len(t, seriesSet, 1) gotSamples := seriesSet[series1.String()] - requireEqualSamples(t, series1.String(), expSamples, gotSamples, true) + requireEqualSamples(t, series1.String(), expSamples, gotSamples, requireEqualSamplesIgnoreCounterResets) // Verify chunks querier. 
chunkQuerier, err := db.ChunkQuerier(minT, maxT) @@ -4775,7 +4775,7 @@ func TestMultipleEncodingsCommitOrder(t *testing.T) { gotChunkSamples = append(gotChunkSamples, smpls...) require.NoError(t, it.Err()) } - requireEqualSamples(t, series1.String(), expSamples, gotChunkSamples, true) + requireEqualSamples(t, series1.String(), expSamples, gotChunkSamples, requireEqualSamplesIgnoreCounterResets) } var expSamples []chunks.Sample @@ -5704,16 +5704,33 @@ func testQuerierOOOQuery(t *testing.T, gotSamples := seriesSet[series1.String()] require.NotNil(t, gotSamples) require.Len(t, seriesSet, 1) - requireEqualSamples(t, series1.String(), expSamples, gotSamples, true) + requireEqualSamples(t, series1.String(), expSamples, gotSamples, requireEqualSamplesIgnoreCounterResets) requireEqualOOOSamples(t, oooSamples, db) }) } } func TestChunkQuerierOOOQuery(t *testing.T) { + nBucketHistogram := func(n int64) *histogram.Histogram { + h := &histogram.Histogram{ + Count: uint64(n), + Sum: float64(n), + } + if n == 0 { + h.PositiveSpans = []histogram.Span{} + h.PositiveBuckets = []int64{} + return h + } + h.PositiveSpans = []histogram.Span{{Offset: 0, Length: uint32(n)}} + h.PositiveBuckets = make([]int64, n) + h.PositiveBuckets[0] = 1 + return h + } + scenarios := map[string]struct { - appendFunc func(app storage.Appender, ts int64, counterReset bool) (storage.SeriesRef, error) - sampleFunc func(ts int64) chunks.Sample + appendFunc func(app storage.Appender, ts int64, counterReset bool) (storage.SeriesRef, error) + sampleFunc func(ts int64) chunks.Sample + checkInUseBucket bool }{ "float": { appendFunc: func(app storage.Appender, ts int64, counterReset bool) (storage.SeriesRef, error) { @@ -5758,10 +5775,24 @@ func TestChunkQuerierOOOQuery(t *testing.T) { return sample{t: ts, h: tsdbutil.GenerateTestHistogram(int(ts))} }, }, + "integer histogram with recode": { + // Histograms have increasing number of buckets so their chunks are recoded. + appendFunc: func(app storage.Appender, ts int64, counterReset bool) (storage.SeriesRef, error) { + n := ts / time.Minute.Milliseconds() + return app.AppendHistogram(0, labels.FromStrings("foo", "bar1"), ts, nBucketHistogram(n), nil) + }, + sampleFunc: func(ts int64) chunks.Sample { + n := ts / time.Minute.Milliseconds() + return sample{t: ts, h: nBucketHistogram(n)} + }, + // Only check in-use buckets for this scenario. + // Recoding adds empty buckets. + checkInUseBucket: true, + }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { - testChunkQuerierOOOQuery(t, scenario.appendFunc, scenario.sampleFunc) + testChunkQuerierOOOQuery(t, scenario.appendFunc, scenario.sampleFunc, scenario.checkInUseBucket) }) } } @@ -5769,6 +5800,7 @@ func TestChunkQuerierOOOQuery(t *testing.T) { func testChunkQuerierOOOQuery(t *testing.T, appendFunc func(app storage.Appender, ts int64, counterReset bool) (storage.SeriesRef, error), sampleFunc func(ts int64) chunks.Sample, + checkInUseBuckets bool, ) { opts := DefaultOptions() opts.OutOfOrderCapMax = 30 @@ -6008,10 +6040,28 @@ func testChunkQuerierOOOQuery(t *testing.T, it := chunk.Chunk.Iterator(nil) smpls, err := storage.ExpandSamples(it, newSample) require.NoError(t, err) + + // Verify that no sample is outside the chunk's time range. 
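+				// In particular, the first and last samples must line up exactly with the
+				// chunk's MinTime and MaxTime; a chunk whose meta was corrupted during
+				// recoding would fail here.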
+ for i, s := range smpls { + switch i { + case 0: + require.Equal(t, chunk.MinTime, s.T(), "first sample %v not at chunk min time %v", s, chunk.MinTime) + case len(smpls) - 1: + require.Equal(t, chunk.MaxTime, s.T(), "last sample %v not at chunk max time %v", s, chunk.MaxTime) + default: + require.GreaterOrEqual(t, s.T(), chunk.MinTime, "sample %v before chunk min time %v", s, chunk.MinTime) + require.LessOrEqual(t, s.T(), chunk.MaxTime, "sample %v after chunk max time %v", s, chunk.MaxTime) + } + } + gotSamples = append(gotSamples, smpls...) require.NoError(t, it.Err()) } - requireEqualSamples(t, series1.String(), expSamples, gotSamples, true) + if checkInUseBuckets { + requireEqualSamples(t, series1.String(), expSamples, gotSamples, requireEqualSamplesIgnoreCounterResets, requireEqualSamplesInUseBucketCompare) + } else { + requireEqualSamples(t, series1.String(), expSamples, gotSamples, requireEqualSamplesIgnoreCounterResets) + } }) } } diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 671e85cd78..cc9daa97fe 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -5178,7 +5178,7 @@ func testWBLReplay(t *testing.T, scenario sampleTypeScenario) { // Passing in true for the 'ignoreCounterResets' parameter prevents differences in counter reset headers // from being factored in to the sample comparison // TODO(fionaliao): understand counter reset behaviour, might want to modify this later - requireEqualSamples(t, l.String(), expOOOSamples, actOOOSamples, true) + requireEqualSamples(t, l.String(), expOOOSamples, actOOOSamples, requireEqualSamplesIgnoreCounterResets) require.NoError(t, h.Close()) } diff --git a/tsdb/ooo_head_read_test.go b/tsdb/ooo_head_read_test.go index 8d1527e05c..17f551dd7d 100644 --- a/tsdb/ooo_head_read_test.go +++ b/tsdb/ooo_head_read_test.go @@ -878,7 +878,7 @@ func testOOOHeadChunkReader_Chunk(t *testing.T, scenario sampleTypeScenario) { } resultSamples, err := storage.ExpandSamples(it, nil) require.NoError(t, err) - requireEqualSamples(t, s1.String(), tc.expChunksSamples[i], resultSamples, true) + requireEqualSamples(t, s1.String(), tc.expChunksSamples[i], resultSamples, requireEqualSamplesIgnoreCounterResets) } }) } @@ -1054,7 +1054,7 @@ func testOOOHeadChunkReader_Chunk_ConsistentQueryResponseDespiteOfHeadExpanding( it := iterable.Iterator(nil) resultSamples, err := storage.ExpandSamples(it, nil) require.NoError(t, err) - requireEqualSamples(t, s1.String(), tc.expChunksSamples[i], resultSamples, true) + requireEqualSamples(t, s1.String(), tc.expChunksSamples[i], resultSamples, requireEqualSamplesIgnoreCounterResets) } }) } diff --git a/tsdb/testutil.go b/tsdb/testutil.go index ab6aab79f4..03587f4e2c 100644 --- a/tsdb/testutil.go +++ b/tsdb/testutil.go @@ -111,7 +111,7 @@ func requireEqualSeries(t *testing.T, expected, actual map[string][]chunks.Sampl for name, expectedItem := range expected { actualItem, ok := actual[name] require.True(t, ok, "Expected series %s not found", name) - requireEqualSamples(t, name, expectedItem, actualItem, ignoreCounterResets) + requireEqualSamples(t, name, expectedItem, actualItem, requireEqualSamplesIgnoreCounterResets) } for name := range actual { _, ok := expected[name] @@ -126,7 +126,28 @@ func requireEqualOOOSamples(t *testing.T, expectedSamples int, db *DB) { "number of ooo appended samples mismatch") } -func requireEqualSamples(t *testing.T, name string, expected, actual []chunks.Sample, ignoreCounterResets bool) { +type requireEqualSamplesOption int + +const ( + requireEqualSamplesNoOption requireEqualSamplesOption = 
iota + requireEqualSamplesIgnoreCounterResets + requireEqualSamplesInUseBucketCompare +) + +func requireEqualSamples(t *testing.T, name string, expected, actual []chunks.Sample, options ...requireEqualSamplesOption) { + var ( + ignoreCounterResets bool + inUseBucketCompare bool + ) + for _, option := range options { + switch option { + case requireEqualSamplesIgnoreCounterResets: + ignoreCounterResets = true + case requireEqualSamplesInUseBucketCompare: + inUseBucketCompare = true + } + } + require.Equal(t, len(expected), len(actual), "Length not equal to expected for %s", name) for i, s := range expected { expectedSample := s @@ -144,6 +165,10 @@ func requireEqualSamples(t *testing.T, name string, expected, actual []chunks.Sa } else { require.Equal(t, expectedHist.CounterResetHint, actualHist.CounterResetHint, "Sample header doesn't match for %s[%d] at ts %d, expected: %s, actual: %s", name, i, expectedSample.T(), counterResetAsString(expectedHist.CounterResetHint), counterResetAsString(actualHist.CounterResetHint)) } + if inUseBucketCompare { + expectedSample.H().Compact(0) + actualSample.H().Compact(0) + } require.Equal(t, expectedHist, actualHist, "Sample doesn't match for %s[%d] at ts %d", name, i, expectedSample.T()) } case s.FH() != nil: @@ -156,6 +181,10 @@ func requireEqualSamples(t *testing.T, name string, expected, actual []chunks.Sa } else { require.Equal(t, expectedHist.CounterResetHint, actualHist.CounterResetHint, "Sample header doesn't match for %s[%d] at ts %d, expected: %s, actual: %s", name, i, expectedSample.T(), counterResetAsString(expectedHist.CounterResetHint), counterResetAsString(actualHist.CounterResetHint)) } + if inUseBucketCompare { + expectedSample.FH().Compact(0) + actualSample.FH().Compact(0) + } require.Equal(t, expectedHist, actualHist, "Sample doesn't match for %s[%d] at ts %d", name, i, expectedSample.T()) } default: From a4083f14e866223cab66125beef5c347ae51dcf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 18 Oct 2024 09:06:37 +0200 Subject: [PATCH 10/15] Fix populateWithDelChunkSeriesIterator corrupting chunk meta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When handling recoded histogram chunks the min time of the chunk is updated by mistake. It should only update when the chunk is completely new. Otherwise the ongoing chunk's meta will be later than the previously written samples in it. 
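
To make the fix concrete: below is a minimal, self-contained sketch of the bookkeeping (illustrative types and event shapes only, not the real tsdb iterator), showing that `cmint` may only advance when the previous chunk is actually flushed:

```go
package main

import "fmt"

// chunkMeta mirrors only the MinTime/MaxTime bookkeeping of tsdb's
// populateWithDelChunkSeriesIterator; the types here are illustrative.
type chunkMeta struct{ minTime, maxTime int64 }

func main() {
	var (
		out          []chunkMeta
		cmint, cmaxt int64 = 100, 100 // first sample at t=100
	)
	// A "recoded" event replaces the ongoing chunk's encoding (e.g. a
	// histogram layout change) but must keep the chunk's min time.
	events := []struct {
		t                 int64
		newChunk, recoded bool
	}{
		{t: 200, newChunk: true, recoded: true},  // recode: cmint stays 100
		{t: 300, newChunk: true, recoded: false}, // cut: flush, then cmint = 300
		{t: 400},
	}
	for _, e := range events {
		if e.newChunk && !e.recoded {
			// Only a genuinely new chunk closes the old time range.
			out = append(out, chunkMeta{cmint, cmaxt})
			cmint = e.t
		}
		cmaxt = e.t
	}
	out = append(out, chunkMeta{cmint, cmaxt})
	fmt.Println(out) // [{100 200} {300 400}]
}
```

With the buggy unconditional `cmint = t`, the first flushed meta would read `{200 200}` even though that chunk still contains the sample at t=100, which is exactly the corruption described above.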
Same bug as https://github.com/prometheus/prometheus/pull/14629 Signed-off-by: György Krajcsovits --- tsdb/querier.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/querier.go b/tsdb/querier.go index 1083cbba0e..b80faf881e 100644 --- a/tsdb/querier.go +++ b/tsdb/querier.go @@ -1022,9 +1022,9 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { if newChunk != nil { if !recoded { p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt}) + cmint = t } currentChunk = newChunk - cmint = t } cmaxt = t From 18b81ad79d5ebdb2a6592442182603f12022b22f Mon Sep 17 00:00:00 2001 From: machine424 Date: Tue, 15 Oct 2024 17:28:56 +0200 Subject: [PATCH 11/15] feat: ProtobufParse.formatOpenMetricsFloat: improve float formatting by using strconv.AppendFloat instead of fmt.Sprint Signed-off-by: machine424 --- model/textparse/protobufparse.go | 23 +++- model/textparse/protobufparse_test.go | 163 ++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 4 deletions(-) diff --git a/model/textparse/protobufparse.go b/model/textparse/protobufparse.go index b3dfdfca1c..a77e1d728f 100644 --- a/model/textparse/protobufparse.go +++ b/model/textparse/protobufparse.go @@ -20,7 +20,9 @@ import ( "fmt" "io" "math" + "strconv" "strings" + "sync" "unicode/utf8" "github.com/gogo/protobuf/proto" @@ -34,6 +36,15 @@ import ( dto "github.com/prometheus/prometheus/prompb/io/prometheus/client" ) +// floatFormatBufPool is exclusively used in formatOpenMetricsFloat. +var floatFormatBufPool = sync.Pool{ + New: func() interface{} { + // To contain at most 17 digits and additional syntax for a float64. + b := make([]byte, 0, 24) + return &b + }, +} + // ProtobufParser is a very inefficient way of unmarshaling the old Prometheus // protobuf format and then present it as it if were parsed by a // Prometheus-2-style text parser. This is only done so that we can easily plug @@ -629,11 +640,15 @@ func formatOpenMetricsFloat(f float64) string { case math.IsInf(f, -1): return "-Inf" } - s := fmt.Sprint(f) - if strings.ContainsAny(s, "e.") { - return s + bp := floatFormatBufPool.Get().(*[]byte) + defer floatFormatBufPool.Put(bp) + + *bp = strconv.AppendFloat((*bp)[:0], f, 'g', -1, 64) + if bytes.ContainsAny(*bp, "e.") { + return string(*bp) } - return s + ".0" + *bp = append(*bp, '.', '0') + return string(*bp) } // isNativeHistogram returns false iff the provided histograms has no spans at diff --git a/model/textparse/protobufparse_test.go b/model/textparse/protobufparse_test.go index 0c09279fed..065459a69a 100644 --- a/model/textparse/protobufparse_test.go +++ b/model/textparse/protobufparse_test.go @@ -409,6 +409,49 @@ metric: < > > +`, + `name: "test_histogram3" +help: "Similar histogram as before but now with integer buckets." +type: HISTOGRAM +metric: < + histogram: < + sample_count: 6 + sample_sum: 50 + bucket: < + cumulative_count: 2 + upper_bound: -20 + > + bucket: < + cumulative_count: 4 + upper_bound: 20 + exemplar: < + label: < + name: "dummyID" + value: "59727" + > + value: 15 + timestamp: < + seconds: 1625851153 + nanos: 146848499 + > + > + > + bucket: < + cumulative_count: 6 + upper_bound: 30 + exemplar: < + label: < + name: "dummyID" + value: "5617" + > + value: 25 + > + > + schema: 0 + zero_threshold: 0 + > +> + `, `name: "test_histogram_family" help: "Test histogram metric family with two very simple histograms." 
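
For readers skimming the diff above, here is the allocation-avoiding formatting pattern in isolation. This is a simplified stand-in using only the standard library; the real `formatOpenMetricsFloat` additionally special-cases 0, ±1, NaN and ±Inf, and appends `.0` to integer-looking output:

```go
package main

import (
	"fmt"
	"strconv"
	"sync"
)

// A pooled buffer avoids one allocation per formatted float, which adds up
// when every "le" and "quantile" label value is formatted during parsing.
// 24 bytes comfortably holds a float64 in 'g' format (up to 17 significant
// digits plus sign, decimal point, and exponent).
var bufPool = sync.Pool{
	New: func() interface{} {
		b := make([]byte, 0, 24)
		return &b
	},
}

func formatFloat(f float64) string {
	bp := bufPool.Get().(*[]byte)
	defer bufPool.Put(bp)
	// AppendFloat reuses the pooled backing array instead of allocating.
	*bp = strconv.AppendFloat((*bp)[:0], f, 'g', -1, 64)
	return string(*bp)
}

func main() {
	fmt.Println(formatFloat(20))      // "20" (the caller appends ".0" if needed)
	fmt.Println(formatFloat(0.00004)) // "4e-05"
}
```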
@@ -1050,6 +1093,66 @@ func TestProtobufParse(t *testing.T) { "le", "+Inf", ), }, + { + m: "test_histogram3", + help: "Similar histogram as before but now with integer buckets.", + }, + { + m: "test_histogram3", + typ: model.MetricTypeHistogram, + }, + { + m: "test_histogram3_count", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_count", + ), + }, + { + m: "test_histogram3_sum", + v: 50, + lset: labels.FromStrings( + "__name__", "test_histogram3_sum", + ), + }, + { + m: "test_histogram3_bucket\xffle\xff-20.0", + v: 2, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "-20.0", + ), + }, + { + m: "test_histogram3_bucket\xffle\xff20.0", + v: 4, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "20.0", + ), + es: []exemplar.Exemplar{ + {Labels: labels.FromStrings("dummyID", "59727"), Value: 15, HasTs: true, Ts: 1625851153146}, + }, + }, + { + m: "test_histogram3_bucket\xffle\xff30.0", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "30.0", + ), + es: []exemplar.Exemplar{ + {Labels: labels.FromStrings("dummyID", "5617"), Value: 25, HasTs: false}, + }, + }, + { + m: "test_histogram3_bucket\xffle\xff+Inf", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "+Inf", + ), + }, { m: "test_histogram_family", help: "Test histogram metric family with two very simple histograms.", @@ -1857,6 +1960,66 @@ func TestProtobufParse(t *testing.T) { "le", "+Inf", ), }, + { + m: "test_histogram3", + help: "Similar histogram as before but now with integer buckets.", + }, + { + m: "test_histogram3", + typ: model.MetricTypeHistogram, + }, + { + m: "test_histogram3_count", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_count", + ), + }, + { + m: "test_histogram3_sum", + v: 50, + lset: labels.FromStrings( + "__name__", "test_histogram3_sum", + ), + }, + { + m: "test_histogram3_bucket\xffle\xff-20.0", + v: 2, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "-20.0", + ), + }, + { + m: "test_histogram3_bucket\xffle\xff20.0", + v: 4, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "20.0", + ), + es: []exemplar.Exemplar{ + {Labels: labels.FromStrings("dummyID", "59727"), Value: 15, HasTs: true, Ts: 1625851153146}, + }, + }, + { + m: "test_histogram3_bucket\xffle\xff30.0", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "30.0", + ), + es: []exemplar.Exemplar{ + {Labels: labels.FromStrings("dummyID", "5617"), Value: 25, HasTs: false}, + }, + }, + { + m: "test_histogram3_bucket\xffle\xff+Inf", + v: 6, + lset: labels.FromStrings( + "__name__", "test_histogram3_bucket", + "le", "+Inf", + ), + }, { m: "test_histogram_family", help: "Test histogram metric family with two very simple histograms.", From c78d5b94af29fe2a7ed7e265629b91403c9b5619 Mon Sep 17 00:00:00 2001 From: Alan Protasio Date: Fri, 18 Oct 2024 06:23:14 -0700 Subject: [PATCH 12/15] Disallowing configure AM with the v1 api (#13883) * Stop supporting Alertmanager v1 * Disallowing configure AM with the v1 api Signed-off-by: alanprot * Update config/config_test.go Co-authored-by: Ayoub Mrini Signed-off-by: Alan Protasio * Update config/config.go Co-authored-by: Ayoub Mrini Signed-off-by: Alan Protasio * Addressing coments Signed-off-by: alanprot * Update notifier/notifier.go Co-authored-by: Ayoub Mrini Signed-off-by: Alan Protasio * Update config/config_test.go Co-authored-by: Jan Fajerski Signed-off-by: Alan Protasio 
---------

Signed-off-by: alanprot 
Signed-off-by: Alan Protasio 
Co-authored-by: Ayoub Mrini 
Co-authored-by: Jan Fajerski 
---
 .../config_with_service_discovery_files.yml  |  2 +-
 config/config.go                             |  3 +-
 config/config_test.go                        |  5 ++++
 .../config_with_deprecated_am_api_config.yml |  7 +++++
 notifier/notifier.go                         | 29 +++++--------------
 notifier/notifier_test.go                    | 12 ++++----
 6 files changed, 29 insertions(+), 29 deletions(-)
 create mode 100644 config/testdata/config_with_deprecated_am_api_config.yml

diff --git a/cmd/promtool/testdata/config_with_service_discovery_files.yml b/cmd/promtool/testdata/config_with_service_discovery_files.yml
index 13b6d7faff..6a550a8403 100644
--- a/cmd/promtool/testdata/config_with_service_discovery_files.yml
+++ b/cmd/promtool/testdata/config_with_service_discovery_files.yml
@@ -6,7 +6,7 @@ scrape_configs:
 alerting:
   alertmanagers:
     - scheme: http
-      api_version: v1
+      api_version: v2
       file_sd_configs:
         - files:
             - nonexistent_file.yml
diff --git a/config/config.go b/config/config.go
index a88b0d32ff..17405309b0 100644
--- a/config/config.go
+++ b/config/config.go
@@ -955,6 +955,7 @@ func (a AlertmanagerConfigs) ToMap() map[string]*AlertmanagerConfig {

 // AlertmanagerAPIVersion represents a version of the
 // github.com/prometheus/alertmanager/api, e.g. 'v1' or 'v2'.
+// 'v1' is no longer supported.
 type AlertmanagerAPIVersion string

 // UnmarshalYAML implements the yaml.Unmarshaler interface.
@@ -984,7 +985,7 @@ const (
 )

 var SupportedAlertmanagerAPIVersions = []AlertmanagerAPIVersion{
-	AlertmanagerAPIVersionV1, AlertmanagerAPIVersionV2,
+	AlertmanagerAPIVersionV2,
 }

 // AlertmanagerConfig configures how Alertmanagers can be discovered and communicated with.
diff --git a/config/config_test.go b/config/config_test.go
index 547070dacc..9734847f31 100644
--- a/config/config_test.go
+++ b/config/config_test.go
@@ -1500,6 +1500,11 @@ var expectedConf = &Config{
 	},
 }

+func TestYAMLNotLongerSupportedAMApi(t *testing.T) {
+	_, err := LoadFile("testdata/config_with_deprecated_am_api_config.yml", false, promslog.NewNopLogger())
+	require.Error(t, err)
+}
+
 func TestYAMLRoundtrip(t *testing.T) {
 	want, err := LoadFile("testdata/roundtrip.good.yml", false, promslog.NewNopLogger())
 	require.NoError(t, err)
diff --git a/config/testdata/config_with_deprecated_am_api_config.yml b/config/testdata/config_with_deprecated_am_api_config.yml
new file mode 100644
index 0000000000..ac89537ff1
--- /dev/null
+++ b/config/testdata/config_with_deprecated_am_api_config.yml
@@ -0,0 +1,7 @@
+alerting:
+  alertmanagers:
+    - scheme: http
+      api_version: v1
+      file_sd_configs:
+        - files:
+            - nonexistent_file.yml
diff --git a/notifier/notifier.go b/notifier/notifier.go
index 482d2fdaab..e970b67e6d 100644
--- a/notifier/notifier.go
+++ b/notifier/notifier.go
@@ -542,10 +542,10 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {

 	begin := time.Now()

-	// v1Payload and v2Payload represent 'alerts' marshaled for Alertmanager API
-	// v1 or v2. Marshaling happens below. Reference here is for caching between
+	// cachedPayload represents 'alerts' marshaled for Alertmanager API v2.
+	// Marshaling happens below. Reference here is for caching between
 	// for loop iterations.
-	var v1Payload, v2Payload []byte
+	var cachedPayload []byte

 	n.mtx.RLock()
 	amSets := n.alertmanagers
@@ -576,29 +576,16 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 			continue
 		}
 		// We can't use the cached values from previous iteration.
- v1Payload, v2Payload = nil, nil + cachedPayload = nil } switch ams.cfg.APIVersion { - case config.AlertmanagerAPIVersionV1: - { - if v1Payload == nil { - v1Payload, err = json.Marshal(amAlerts) - if err != nil { - n.logger.Error("Encoding alerts for Alertmanager API v1 failed", "err", err) - ams.mtx.RUnlock() - return false - } - } - - payload = v1Payload - } case config.AlertmanagerAPIVersionV2: { - if v2Payload == nil { + if cachedPayload == nil { openAPIAlerts := alertsToOpenAPIAlerts(amAlerts) - v2Payload, err = json.Marshal(openAPIAlerts) + cachedPayload, err = json.Marshal(openAPIAlerts) if err != nil { n.logger.Error("Encoding alerts for Alertmanager API v2 failed", "err", err) ams.mtx.RUnlock() @@ -606,7 +593,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool { } } - payload = v2Payload + payload = cachedPayload } default: { @@ -621,7 +608,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool { if len(ams.cfg.AlertRelabelConfigs) > 0 { // We can't use the cached values on the next iteration. - v1Payload, v2Payload = nil, nil + cachedPayload = nil } for _, am := range ams.ams { diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go index 83eaf8168b..97b0274f29 100644 --- a/notifier/notifier_test.go +++ b/notifier/notifier_test.go @@ -50,27 +50,27 @@ func TestPostPath(t *testing.T) { }{ { in: "", - out: "/api/v1/alerts", + out: "/api/v2/alerts", }, { in: "/", - out: "/api/v1/alerts", + out: "/api/v2/alerts", }, { in: "/prefix", - out: "/prefix/api/v1/alerts", + out: "/prefix/api/v2/alerts", }, { in: "/prefix//", - out: "/prefix/api/v1/alerts", + out: "/prefix/api/v2/alerts", }, { in: "prefix//", - out: "/prefix/api/v1/alerts", + out: "/prefix/api/v2/alerts", }, } for _, c := range cases { - require.Equal(t, c.out, postPath(c.in, config.AlertmanagerAPIVersionV1)) + require.Equal(t, c.out, postPath(c.in, config.AlertmanagerAPIVersionV2)) } } From 421a3c22ea06d6966bff5127c65c94fd8855e190 Mon Sep 17 00:00:00 2001 From: Alex Greenbank Date: Fri, 18 Oct 2024 16:12:31 +0100 Subject: [PATCH 13/15] scrape: provide a fallback format (#15136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scrape: Remove implicit fallback to the Prometheus text format Remove implicit fallback to the Prometheus text format in case of invalid/missing Content-Type and fail the scrape instead. Add ability to specify a `fallback_scrape_protocol` in the scrape config. --------- Signed-off-by: alexgreenbank Signed-off-by: Alex Greenbank Co-authored-by: Björn Rabenstein --- CHANGELOG.md | 1 + config/config.go | 32 ++++++- config/config_test.go | 88 ++++++++++++++++--- config/testdata/conf.good.yml | 2 + ...ig_files_fallback_scrape_protocol1.bad.yml | 5 ++ ...ig_files_fallback_scrape_protocol2.bad.yml | 5 ++ docs/configuration/configuration.md | 8 +- model/textparse/interface.go | 59 ++++++++++--- model/textparse/interface_test.go | 87 +++++++++++++++--- promql/fuzz.go | 6 +- scrape/scrape.go | 23 ++++- scrape/scrape_test.go | 6 +- 12 files changed, 280 insertions(+), 42 deletions(-) create mode 100644 config/testdata/scrape_config_files_fallback_scrape_protocol1.bad.yml create mode 100644 config/testdata/scrape_config_files_fallback_scrape_protocol2.bad.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 250533bab7..f1321829e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## unreleased +* [CHANGE] Scraping: Remove implicit fallback to the Prometheus text format in case of invalid/missing Content-Type and fail the scrape instead. 
Add ability to specify a `fallback_scrape_protocol` in the scrape config. #15136 * [BUGFIX] PromQL: Fix stddev+stdvar aggregations to always ignore native histograms. #14941 * [BUGFIX] PromQL: Fix stddev+stdvar aggregations to treat Infinity consistently. #14941 diff --git a/config/config.go b/config/config.go index 17405309b0..f3403654c2 100644 --- a/config/config.go +++ b/config/config.go @@ -17,6 +17,7 @@ import ( "errors" "fmt" "log/slog" + "mime" "net/url" "os" "path/filepath" @@ -473,9 +474,22 @@ func (s ScrapeProtocol) Validate() error { return nil } +// HeaderMediaType returns the MIME mediaType for a particular ScrapeProtocol. +func (s ScrapeProtocol) HeaderMediaType() string { + if _, ok := ScrapeProtocolsHeaders[s]; !ok { + return "" + } + mediaType, _, err := mime.ParseMediaType(ScrapeProtocolsHeaders[s]) + if err != nil { + return "" + } + return mediaType +} + var ( PrometheusProto ScrapeProtocol = "PrometheusProto" PrometheusText0_0_4 ScrapeProtocol = "PrometheusText0.0.4" + PrometheusText1_0_0 ScrapeProtocol = "PrometheusText1.0.0" OpenMetricsText0_0_1 ScrapeProtocol = "OpenMetricsText0.0.1" OpenMetricsText1_0_0 ScrapeProtocol = "OpenMetricsText1.0.0" UTF8NamesHeader string = model.EscapingKey + "=" + model.AllowUTF8 @@ -483,6 +497,7 @@ var ( ScrapeProtocolsHeaders = map[ScrapeProtocol]string{ PrometheusProto: "application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited", PrometheusText0_0_4: "text/plain;version=0.0.4", + PrometheusText1_0_0: "text/plain;version=1.0.0;escaping=allow-utf-8", OpenMetricsText0_0_1: "application/openmetrics-text;version=0.0.1", OpenMetricsText1_0_0: "application/openmetrics-text;version=1.0.0", } @@ -492,6 +507,7 @@ var ( DefaultScrapeProtocols = []ScrapeProtocol{ OpenMetricsText1_0_0, OpenMetricsText0_0_1, + PrometheusText1_0_0, PrometheusText0_0_4, } @@ -503,6 +519,7 @@ var ( PrometheusProto, OpenMetricsText1_0_0, OpenMetricsText0_0_1, + PrometheusText1_0_0, PrometheusText0_0_4, } ) @@ -629,8 +646,15 @@ type ScrapeConfig struct { // The protocols to negotiate during a scrape. It tells clients what // protocol are accepted by Prometheus and with what preference (most wanted is first). // Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1, - // OpenMetricsText1.0.0, PrometheusText0.0.4. + // OpenMetricsText1.0.0, PrometheusText1.0.0, PrometheusText0.0.4. ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"` + // The fallback protocol to use if the Content-Type provided by the target + // is not provided, blank, or not one of the expected values. + // Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1, + // OpenMetricsText1.0.0, PrometheusText1.0.0, PrometheusText0.0.4. + ScrapeFallbackProtocol ScrapeProtocol `yaml:"fallback_scrape_protocol,omitempty"` + // Whether to scrape a classic histogram that is also exposed as a native histogram. + ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"` // Whether to scrape a classic histogram, even if it is also exposed as a native histogram. AlwaysScrapeClassicHistograms bool `yaml:"always_scrape_classic_histograms,omitempty"` // File to which scrape failures are logged. 
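
As an aside on how the new pieces compose: a `ScrapeProtocol` maps to an Accept-header value, and `HeaderMediaType`-style parsing reduces that header to a bare media type that can later serve as the parser fallback. The sketch below is a trimmed-down stand-in; the map contents are copied from the diff above, not the full config package:

```go
package main

import (
	"fmt"
	"mime"
)

// Stand-in for ScrapeProtocolsHeaders: protocol name -> Accept header value.
var scrapeProtocolsHeaders = map[string]string{
	"PrometheusText1.0.0":  "text/plain;version=1.0.0;escaping=allow-utf-8",
	"OpenMetricsText1.0.0": "application/openmetrics-text;version=1.0.0",
	"PrometheusProto":      "application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited",
}

// headerMediaType drops parameters such as version= or escaping= and keeps
// only the media type, as the new HeaderMediaType method does.
func headerMediaType(proto string) string {
	header, ok := scrapeProtocolsHeaders[proto]
	if !ok {
		return "" // unknown protocol: no usable fallback
	}
	mediaType, _, err := mime.ParseMediaType(header)
	if err != nil {
		return ""
	}
	return mediaType
}

func main() {
	fmt.Println(headerMediaType("PrometheusText1.0.0"))  // text/plain
	fmt.Println(headerMediaType("OpenMetricsText1.0.0")) // application/openmetrics-text
	fmt.Println(headerMediaType("invalid"))              // ""
}
```

Returning an empty string for unknown or unparseable protocols is what lets an unset `fallback_scrape_protocol` flow through to the parser as "no fallback configured".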
@@ -780,6 +804,12 @@ func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error { return fmt.Errorf("%w for scrape config with job name %q", err, c.JobName) } + if c.ScrapeFallbackProtocol != "" { + if err := c.ScrapeFallbackProtocol.Validate(); err != nil { + return fmt.Errorf("invalid fallback_scrape_protocol for scrape config with job name %q: %w", c.JobName, err) + } + } + switch globalConfig.MetricNameValidationScheme { case LegacyValidationConfig: case "", UTF8ValidationConfig: diff --git a/config/config_test.go b/config/config_test.go index 9734847f31..8bf664c1f0 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -206,19 +206,20 @@ var expectedConf = &Config{ { JobName: "prometheus", - HonorLabels: true, - HonorTimestamps: true, - ScrapeInterval: model.Duration(15 * time.Second), - ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout, - EnableCompression: true, - BodySizeLimit: globBodySizeLimit, - SampleLimit: globSampleLimit, - TargetLimit: globTargetLimit, - LabelLimit: globLabelLimit, - LabelNameLengthLimit: globLabelNameLengthLimit, - LabelValueLengthLimit: globLabelValueLengthLimit, - ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, - ScrapeFailureLogFile: "testdata/fail_prom.log", + HonorLabels: true, + HonorTimestamps: true, + ScrapeInterval: model.Duration(15 * time.Second), + ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout, + EnableCompression: true, + BodySizeLimit: globBodySizeLimit, + SampleLimit: globSampleLimit, + TargetLimit: globTargetLimit, + LabelLimit: globLabelLimit, + LabelNameLengthLimit: globLabelNameLengthLimit, + LabelValueLengthLimit: globLabelValueLengthLimit, + ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFallbackProtocol: PrometheusText0_0_4, + ScrapeFailureLogFile: "testdata/fail_prom.log", MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -2086,12 +2087,20 @@ var expectedErrors = []struct { }, { filename: "scrape_config_files_scrape_protocols.bad.yml", - errMsg: `parsing YAML file testdata/scrape_config_files_scrape_protocols.bad.yml: scrape_protocols: unknown scrape protocol prometheusproto, supported: [OpenMetricsText0.0.1 OpenMetricsText1.0.0 PrometheusProto PrometheusText0.0.4] for scrape config with job name "node"`, + errMsg: `parsing YAML file testdata/scrape_config_files_scrape_protocols.bad.yml: scrape_protocols: unknown scrape protocol prometheusproto, supported: [OpenMetricsText0.0.1 OpenMetricsText1.0.0 PrometheusProto PrometheusText0.0.4 PrometheusText1.0.0] for scrape config with job name "node"`, }, { filename: "scrape_config_files_scrape_protocols2.bad.yml", errMsg: `parsing YAML file testdata/scrape_config_files_scrape_protocols2.bad.yml: duplicated protocol in scrape_protocols, got [OpenMetricsText1.0.0 PrometheusProto OpenMetricsText1.0.0] for scrape config with job name "node"`, }, + { + filename: "scrape_config_files_fallback_scrape_protocol1.bad.yml", + errMsg: `parsing YAML file testdata/scrape_config_files_fallback_scrape_protocol1.bad.yml: invalid fallback_scrape_protocol for scrape config with job name "node": unknown scrape protocol prometheusproto, supported: [OpenMetricsText0.0.1 OpenMetricsText1.0.0 PrometheusProto PrometheusText0.0.4 PrometheusText1.0.0]`, + }, + { + filename: "scrape_config_files_fallback_scrape_protocol2.bad.yml", + errMsg: `unmarshal errors`, + }, } func TestBadConfigs(t *testing.T) { @@ -2412,3 +2421,54 @@ func TestScrapeConfigNameValidationSettings(t *testing.T) { }) } } + +func TestScrapeProtocolHeader(t *testing.T) { + 
tests := []struct {
+		name          string
+		proto         ScrapeProtocol
+		expectedValue string
+	}{
+		{
+			name:          "blank",
+			proto:         ScrapeProtocol(""),
+			expectedValue: "",
+		},
+		{
+			name:          "invalid",
+			proto:         ScrapeProtocol("invalid"),
+			expectedValue: "",
+		},
+		{
+			name:          "prometheus protobuf",
+			proto:         PrometheusProto,
+			expectedValue: "application/vnd.google.protobuf",
+		},
+		{
+			name:          "prometheus text 0.0.4",
+			proto:         PrometheusText0_0_4,
+			expectedValue: "text/plain",
+		},
+		{
+			name:          "prometheus text 1.0.0",
+			proto:         PrometheusText1_0_0,
+			expectedValue: "text/plain",
+		},
+		{
+			name:          "openmetrics 0.0.1",
+			proto:         OpenMetricsText0_0_1,
+			expectedValue: "application/openmetrics-text",
+		},
+		{
+			name:          "openmetrics 1.0.0",
+			proto:         OpenMetricsText1_0_0,
+			expectedValue: "application/openmetrics-text",
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			mediaType := tc.proto.HeaderMediaType()
+
+			require.Equal(t, tc.expectedValue, mediaType)
+		})
+	}
+}
diff --git a/config/testdata/conf.good.yml b/config/testdata/conf.good.yml
index 9eb7995432..2501652d5b 100644
--- a/config/testdata/conf.good.yml
+++ b/config/testdata/conf.good.yml
@@ -74,6 +74,8 @@ scrape_configs:
     # metrics_path defaults to '/metrics'
     # scheme defaults to 'http'.

+    fallback_scrape_protocol: PrometheusText0.0.4
+
     scrape_failure_log_file: fail_prom.log
     file_sd_configs:
       - files:
diff --git a/config/testdata/scrape_config_files_fallback_scrape_protocol1.bad.yml b/config/testdata/scrape_config_files_fallback_scrape_protocol1.bad.yml
new file mode 100644
index 0000000000..07cfe47594
--- /dev/null
+++ b/config/testdata/scrape_config_files_fallback_scrape_protocol1.bad.yml
@@ -0,0 +1,5 @@
+scrape_configs:
+  - job_name: node
+    fallback_scrape_protocol: "prometheusproto"
+    static_configs:
+      - targets: ['localhost:8080']
diff --git a/config/testdata/scrape_config_files_fallback_scrape_protocol2.bad.yml b/config/testdata/scrape_config_files_fallback_scrape_protocol2.bad.yml
new file mode 100644
index 0000000000..c5d133f9c4
--- /dev/null
+++ b/config/testdata/scrape_config_files_fallback_scrape_protocol2.bad.yml
@@ -0,0 +1,5 @@
+scrape_configs:
+  - job_name: node
+    fallback_scrape_protocol: ["OpenMetricsText1.0.0", "PrometheusText0.0.4"]
+    static_configs:
+      - targets: ['localhost:8080']
diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md
index 4a681c7973..31ceac734f 100644
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@@ -212,9 +212,15 @@ job_name: <job_name>

 # The protocols to negotiate during a scrape with the client.
 # Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
-# OpenMetricsText1.0.0, PrometheusText0.0.4.
+# OpenMetricsText1.0.0, PrometheusText0.0.4, PrometheusText1.0.0.
 [ scrape_protocols: [<string>, ...] | default = <global_config.scrape_protocols> ]

+# Fallback protocol to use if a scrape returns a blank, unparseable, or
+# otherwise invalid Content-Type.
+# Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
+# OpenMetricsText1.0.0, PrometheusText0.0.4, PrometheusText1.0.0.
+[ fallback_scrape_protocol: <string> ]
+
 # Whether to scrape a classic histogram, even if it is also exposed as a native
 # histogram (has no effect without --enable-feature=native-histograms).
[ always_scrape_classic_histograms: <boolean> | default = false ]
diff --git a/model/textparse/interface.go b/model/textparse/interface.go
index 3b0e9a96e1..99755ffd22 100644
--- a/model/textparse/interface.go
+++ b/model/textparse/interface.go
@@ -14,6 +14,8 @@
 package textparse

 import (
+	"errors"
+	"fmt"
 	"mime"

 	"github.com/prometheus/common/model"
@@ -78,28 +80,65 @@ type Parser interface {
 	Next() (Entry, error)
 }

-// New returns a new parser of the byte slice.
-//
-// This function always returns a valid parser, but might additionally
-// return an error if the content type cannot be parsed.
-func New(b []byte, contentType string, parseClassicHistograms, skipOMCTSeries bool, st *labels.SymbolTable) (Parser, error) {
+// extractMediaType returns the mediaType of a required parser. It first tries to
+// extract a valid and supported mediaType from contentType. If that fails,
+// the provided fallbackType (possibly an empty string) is returned, together with
+// an error. fallbackType is used as-is without further validation.
+func extractMediaType(contentType, fallbackType string) (string, error) {
 	if contentType == "" {
-		return NewPromParser(b, st), nil
+		if fallbackType == "" {
+			return "", errors.New("non-compliant scrape target sending blank Content-Type and no fallback_scrape_protocol specified for target")
+		}
+		return fallbackType, fmt.Errorf("non-compliant scrape target sending blank Content-Type, using fallback_scrape_protocol %q", fallbackType)
 	}

+	// We have a contentType, parse it.
 	mediaType, _, err := mime.ParseMediaType(contentType)
 	if err != nil {
-		return NewPromParser(b, st), err
+		if fallbackType == "" {
+			retErr := fmt.Errorf("cannot parse Content-Type %q and no fallback_scrape_protocol for target", contentType)
+			return "", errors.Join(retErr, err)
+		}
+		retErr := fmt.Errorf("could not parse received Content-Type %q, using fallback_scrape_protocol %q", contentType, fallbackType)
+		return fallbackType, errors.Join(retErr, err)
 	}
+
+	// We have a valid media type; either we recognise it and can use it,
+	// or we have to error.
+	switch mediaType {
+	case "application/openmetrics-text", "application/vnd.google.protobuf", "text/plain":
+		return mediaType, nil
+	}
+	// We're here because we have no recognised mediaType.
+	if fallbackType == "" {
+		return "", fmt.Errorf("received unsupported Content-Type %q and no fallback_scrape_protocol specified for target", contentType)
+	}
+	return fallbackType, fmt.Errorf("received unsupported Content-Type %q, using fallback_scrape_protocol %q", contentType, fallbackType)
+}
+
+// New returns a new parser of the byte slice.
+//
+// This function no longer guarantees to return a valid parser.
+//
+// It only returns a valid parser if the supplied contentType and fallbackType allow.
+// An error may also be returned if fallbackType had to be used or there was some
+// other error parsing the supplied Content-Type.
+// If the returned parser is nil, the scrape must fail.
+func New(b []byte, contentType, fallbackType string, parseClassicHistograms, skipOMCTSeries bool, st *labels.SymbolTable) (Parser, error) {
+	mediaType, err := extractMediaType(contentType, fallbackType)
+	// err may be nil or something we want to warn about.
+ switch mediaType { case "application/openmetrics-text": return NewOpenMetricsParser(b, st, func(o *openMetricsParserOptions) { o.SkipCTSeries = skipOMCTSeries - }), nil + }), err case "application/vnd.google.protobuf": - return NewProtobufParser(b, parseClassicHistograms, st), nil + return NewProtobufParser(b, parseClassicHistograms, st), err + case "text/plain": + return NewPromParser(b, st), err default: - return NewPromParser(b, st), nil + return nil, err } } diff --git a/model/textparse/interface_test.go b/model/textparse/interface_test.go index 3f2f758d7e..6136fbc915 100644 --- a/model/textparse/interface_test.go +++ b/model/textparse/interface_test.go @@ -22,6 +22,7 @@ import ( "github.com/prometheus/common/model" "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" @@ -31,6 +32,10 @@ import ( func TestNewParser(t *testing.T) { t.Parallel() + requireNilParser := func(t *testing.T, p Parser) { + require.Nil(t, p) + } + requirePromParser := func(t *testing.T, p Parser) { require.NotNil(t, p) _, ok := p.(*PromParser) @@ -43,34 +48,83 @@ func TestNewParser(t *testing.T) { require.True(t, ok) } + requireProtobufParser := func(t *testing.T, p Parser) { + require.NotNil(t, p) + _, ok := p.(*ProtobufParser) + require.True(t, ok) + } + for name, tt := range map[string]*struct { - contentType string - validateParser func(*testing.T, Parser) - err string + contentType string + fallbackScrapeProtocol config.ScrapeProtocol + validateParser func(*testing.T, Parser) + err string }{ "empty-string": { - validateParser: requirePromParser, + validateParser: requireNilParser, + err: "non-compliant scrape target sending blank Content-Type and no fallback_scrape_protocol specified for target", + }, + "empty-string-fallback-text-plain": { + validateParser: requirePromParser, + fallbackScrapeProtocol: config.PrometheusText0_0_4, + err: "non-compliant scrape target sending blank Content-Type, using fallback_scrape_protocol \"text/plain\"", }, "invalid-content-type-1": { contentType: "invalid/", - validateParser: requirePromParser, + validateParser: requireNilParser, err: "expected token after slash", }, + "invalid-content-type-1-fallback-text-plain": { + contentType: "invalid/", + validateParser: requirePromParser, + fallbackScrapeProtocol: config.PrometheusText0_0_4, + err: "expected token after slash", + }, + "invalid-content-type-1-fallback-openmetrics": { + contentType: "invalid/", + validateParser: requireOpenMetricsParser, + fallbackScrapeProtocol: config.OpenMetricsText0_0_1, + err: "expected token after slash", + }, + "invalid-content-type-1-fallback-protobuf": { + contentType: "invalid/", + validateParser: requireProtobufParser, + fallbackScrapeProtocol: config.PrometheusProto, + err: "expected token after slash", + }, "invalid-content-type-2": { contentType: "invalid/invalid/invalid", - validateParser: requirePromParser, + validateParser: requireNilParser, err: "unexpected content after media subtype", }, + "invalid-content-type-2-fallback-text-plain": { + contentType: "invalid/invalid/invalid", + validateParser: requirePromParser, + fallbackScrapeProtocol: config.PrometheusText1_0_0, + err: "unexpected content after media subtype", + }, "invalid-content-type-3": { contentType: "/", - validateParser: requirePromParser, + validateParser: requireNilParser, err: "no media type", }, + 
"invalid-content-type-3-fallback-text-plain": { + contentType: "/", + validateParser: requirePromParser, + fallbackScrapeProtocol: config.PrometheusText1_0_0, + err: "no media type", + }, "invalid-content-type-4": { contentType: "application/openmetrics-text; charset=UTF-8; charset=utf-8", - validateParser: requirePromParser, + validateParser: requireNilParser, err: "duplicate parameter name", }, + "invalid-content-type-4-fallback-open-metrics": { + contentType: "application/openmetrics-text; charset=UTF-8; charset=utf-8", + validateParser: requireOpenMetricsParser, + fallbackScrapeProtocol: config.OpenMetricsText1_0_0, + err: "duplicate parameter name", + }, "openmetrics": { contentType: "application/openmetrics-text", validateParser: requireOpenMetricsParser, @@ -87,20 +141,33 @@ func TestNewParser(t *testing.T) { contentType: "text/plain", validateParser: requirePromParser, }, + "protobuf": { + contentType: "application/vnd.google.protobuf", + validateParser: requireProtobufParser, + }, "plain-text-with-version": { contentType: "text/plain; version=0.0.4", validateParser: requirePromParser, }, "some-other-valid-content-type": { contentType: "text/html", - validateParser: requirePromParser, + validateParser: requireNilParser, + err: "received unsupported Content-Type \"text/html\" and no fallback_scrape_protocol specified for target", + }, + "some-other-valid-content-type-fallback-text-plain": { + contentType: "text/html", + validateParser: requirePromParser, + fallbackScrapeProtocol: config.PrometheusText0_0_4, + err: "received unsupported Content-Type \"text/html\", using fallback_scrape_protocol \"text/plain\"", }, } { t.Run(name, func(t *testing.T) { tt := tt // Copy to local variable before going parallel. t.Parallel() - p, err := New([]byte{}, tt.contentType, false, false, labels.NewSymbolTable()) + fallbackProtoMediaType := tt.fallbackScrapeProtocol.HeaderMediaType() + + p, err := New([]byte{}, tt.contentType, fallbackProtoMediaType, false, false, labels.NewSymbolTable()) tt.validateParser(t, p) if tt.err == "" { require.NoError(t, err) diff --git a/promql/fuzz.go b/promql/fuzz.go index 57fd1166ac..759055fb0d 100644 --- a/promql/fuzz.go +++ b/promql/fuzz.go @@ -61,8 +61,8 @@ const ( var symbolTable = labels.NewSymbolTable() func fuzzParseMetricWithContentType(in []byte, contentType string) int { - p, warning := textparse.New(in, contentType, false, false, symbolTable) - if warning != nil { + p, warning := textparse.New(in, contentType, "", false, false, symbolTable) + if p == nil || warning != nil { // An invalid content type is being passed, which should not happen // in this context. panic(warning) @@ -91,7 +91,7 @@ func fuzzParseMetricWithContentType(in []byte, contentType string) int { // Note that this is not the parser for the text-based exposition-format; that // lives in github.com/prometheus/client_golang/text. 
func FuzzParseMetric(in []byte) int { - return fuzzParseMetricWithContentType(in, "") + return fuzzParseMetricWithContentType(in, "text/plain") } func FuzzParseOpenMetric(in []byte) int { diff --git a/scrape/scrape.go b/scrape/scrape.go index 4273f4cb64..89d1671354 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -114,6 +114,7 @@ type scrapeLoopOptions struct { timeout time.Duration alwaysScrapeClassicHist bool validationScheme model.ValidationScheme + fallbackScrapeProtocol string mrc []*relabel.Config cache *scrapeCache @@ -189,6 +190,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed metrics, options.skipOffsetting, opts.validationScheme, + opts.fallbackScrapeProtocol, ) } sp.metrics.targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) @@ -325,6 +327,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) { enableCompression = sp.config.EnableCompression trackTimestampsStaleness = sp.config.TrackTimestampsStaleness mrc = sp.config.MetricRelabelConfigs + fallbackScrapeProtocol = sp.config.ScrapeFallbackProtocol.HeaderMediaType() ) validationScheme := model.UTF8Validation @@ -371,6 +374,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) { interval: interval, timeout: timeout, validationScheme: validationScheme, + fallbackScrapeProtocol: fallbackScrapeProtocol, }) ) if err != nil { @@ -480,6 +484,7 @@ func (sp *scrapePool) sync(targets []*Target) { enableCompression = sp.config.EnableCompression trackTimestampsStaleness = sp.config.TrackTimestampsStaleness mrc = sp.config.MetricRelabelConfigs + fallbackScrapeProtocol = sp.config.ScrapeFallbackProtocol.HeaderMediaType() alwaysScrapeClassicHist = sp.config.AlwaysScrapeClassicHistograms ) @@ -523,6 +528,7 @@ func (sp *scrapePool) sync(targets []*Target) { timeout: timeout, alwaysScrapeClassicHist: alwaysScrapeClassicHist, validationScheme: validationScheme, + fallbackScrapeProtocol: fallbackScrapeProtocol, }) if err != nil { l.setForcedError(err) @@ -885,6 +891,7 @@ type scrapeLoop struct { timeout time.Duration alwaysScrapeClassicHist bool validationScheme model.ValidationScheme + fallbackScrapeProtocol string // Feature flagged options. 
enableNativeHistogramIngestion bool @@ -1193,6 +1200,7 @@ func newScrapeLoop(ctx context.Context, metrics *scrapeMetrics, skipOffsetting bool, validationScheme model.ValidationScheme, + fallbackScrapeProtocol string, ) *scrapeLoop { if l == nil { l = promslog.NewNopLogger() @@ -1245,6 +1253,7 @@ func newScrapeLoop(ctx context.Context, metrics: metrics, skipOffsetting: skipOffsetting, validationScheme: validationScheme, + fallbackScrapeProtocol: fallbackScrapeProtocol, } sl.ctx, sl.cancel = context.WithCancel(ctx) @@ -1537,11 +1546,21 @@ type appendErrors struct { } func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ts time.Time) (total, added, seriesAdded int, err error) { - p, err := textparse.New(b, contentType, sl.alwaysScrapeClassicHist, sl.enableCTZeroIngestion, sl.symbolTable) + p, err := textparse.New(b, contentType, sl.fallbackScrapeProtocol, sl.alwaysScrapeClassicHist, sl.enableCTZeroIngestion, sl.symbolTable) + if p == nil { + sl.l.Error( + "Failed to determine correct type of scrape target.", + "content_type", contentType, + "fallback_media_type", sl.fallbackScrapeProtocol, + "err", err, + ) + return + } if err != nil { sl.l.Debug( - "Invalid content type on scrape, using prometheus parser as fallback.", + "Invalid content type on scrape, using fallback setting.", "content_type", contentType, + "fallback_media_type", sl.fallbackScrapeProtocol, "err", err, ) } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index f65d41a84a..82230ce1e9 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -690,6 +690,7 @@ func newBasicScrapeLoop(t testing.TB, ctx context.Context, scraper scraper, app newTestScrapeMetrics(t), false, model.LegacyValidation, + "text/plain", ) } @@ -833,6 +834,7 @@ func TestScrapeLoopRun(t *testing.T) { scrapeMetrics, false, model.LegacyValidation, + "text/plain", ) // The loop must terminate during the initial offset if the context @@ -978,6 +980,7 @@ func TestScrapeLoopMetadata(t *testing.T) { scrapeMetrics, false, model.LegacyValidation, + "text/plain", ) defer cancel() @@ -1526,7 +1529,8 @@ func TestScrapeLoopAppendCacheEntryButErrNotFound(t *testing.T) { fakeRef := storage.SeriesRef(1) expValue := float64(1) metric := []byte(`metric{n="1"} 1`) - p, warning := textparse.New(metric, "", false, false, labels.NewSymbolTable()) + p, warning := textparse.New(metric, "text/plain", "", false, false, labels.NewSymbolTable()) + require.NotNil(t, p) require.NoError(t, warning) var lset labels.Labels From 8bcb4d865d0d63c57cfbbbec4c7002949f238f5d Mon Sep 17 00:00:00 2001 From: machine424 Date: Tue, 15 Oct 2024 16:23:27 +0200 Subject: [PATCH 14/15] feat: normalize "le" and "quantile" labels values upon ingestion Signed-off-by: machine424 Co-authored-by: beorn7 --- docs/feature_flags.md | 52 ------------------------ model/textparse/interface.go | 5 ++- model/textparse/openmetricsparse.go | 15 ++++++- model/textparse/openmetricsparse_test.go | 15 ++++++- model/textparse/promparse.go | 3 +- model/textparse/promparse_test.go | 44 ++++++++++++++++++-- 6 files changed, 73 insertions(+), 61 deletions(-) diff --git a/docs/feature_flags.md b/docs/feature_flags.md index a3e2c0b9e9..2e2d2946fa 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -95,58 +95,6 @@ the corresponding classic histogram, with the notable exception of exemplars, which are always ingested. To keep the classic histograms as well, enable `scrape_classic_histograms` in the scrape job. 
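
The deletion below removes the long documentation workaround because the parsers now normalize these label values at scrape time. In essence, the rule is the following; this is a simplified, self-contained approximation of the new `normalizeFloatsInLabelValues` and `formatOpenMetricsFloat` helpers, not the exact code:

```go
package main

import (
	"fmt"
	"math"
	"strconv"
	"strings"
)

// formatOpenMetricsLikeFloat renders a float the way OpenMetrics expects:
// integer values gain a trailing ".0", while NaN and infinities keep
// their canonical spellings.
func formatOpenMetricsLikeFloat(f float64) string {
	switch {
	case f == 0:
		return "0.0"
	case math.IsNaN(f):
		return "NaN"
	case math.IsInf(f, +1):
		return "+Inf"
	case math.IsInf(f, -1):
		return "-Inf"
	}
	s := strconv.FormatFloat(f, 'g', -1, 64)
	if strings.ContainsAny(s, "e.") {
		return s
	}
	return s + ".0"
}

// normalizeLabelValue only rewrites "le" on histograms and "quantile" on
// summaries; all other labels pass through untouched.
func normalizeLabelValue(metricType, label, v string) string {
	isLe := metricType == "histogram" && label == "le"
	isQuantile := metricType == "summary" && label == "quantile"
	if !isLe && !isQuantile {
		return v
	}
	f, err := strconv.ParseFloat(v, 64)
	if err != nil {
		return v // non-numeric values are left alone
	}
	return formatOpenMetricsLikeFloat(f)
}

func main() {
	fmt.Println(normalizeLabelValue("histogram", "le", "2"))     // 2.0
	fmt.Println(normalizeLabelValue("summary", "quantile", "0")) // 0.0
	fmt.Println(normalizeLabelValue("histogram", "le", "+Inf"))  // +Inf
	fmt.Println(normalizeLabelValue("counter", "le", "1"))       // 1 (untouched)
}
```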
-_Note about the format of `le` and `quantile` label values:_ - -In certain situations, the protobuf parsing changes the number formatting of -the `le` labels of classic histograms and the `quantile` labels of -summaries. Typically, this happens if the scraped target is instrumented with -[client_golang](https://github.com/prometheus/client_golang) provided that -[promhttp.HandlerOpts.EnableOpenMetrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/promhttp#HandlerOpts) -is set to `false`. In such a case, integer label values are represented in the -text format as such, e.g. `quantile="1"` or `le="2"`. However, the protobuf parsing -changes the representation to float-like (following the OpenMetrics -specification), so the examples above become `quantile="1.0"` and `le="2.0"` after -ingestion into Prometheus, which changes the identity of the metric compared to -what was ingested before via the text format. - -The effect of this change is that alerts, recording rules and dashboards that -directly reference label values as whole numbers such as `le="1"` will stop -working. - -Aggregation by the `le` and `quantile` labels for vectors that contain the old and -new formatting will lead to unexpected results, and range vectors that span the -transition between the different formatting will contain additional series. -The most common use case for both is the quantile calculation via -`histogram_quantile`, e.g. -`histogram_quantile(0.95, sum by (le) (rate(histogram_bucket[10m])))`. -The `histogram_quantile` function already tries to mitigate the effects to some -extent, but there will be inaccuracies, in particular for shorter ranges that -cover only a few samples. - -Ways to deal with this change either globally or on a per metric basis: - -- Fix references to integer `le`, `quantile` label values, but otherwise do -nothing and accept that some queries that span the transition time will produce -inaccurate or unexpected results. -_This is the recommended solution, to get consistently normalized label values._ -Also Prometheus 3.0 is expected to enforce normalization of these label values. -- Use `metric_relabel_config` to retain the old labels when scraping targets. -This should **only** be applied to metrics that currently produce such labels. - - -```yaml - metric_relabel_configs: - - source_labels: - - quantile - target_label: quantile - regex: (\d+)\.0+ - - source_labels: - - le - - __name__ - target_label: le - regex: (\d+)\.0+;.*_bucket -``` - ## Experimental PromQL functions `--enable-feature=promql-experimental-functions` diff --git a/model/textparse/interface.go b/model/textparse/interface.go index 3b0e9a96e1..1a8f3dc48f 100644 --- a/model/textparse/interface.go +++ b/model/textparse/interface.go @@ -23,8 +23,7 @@ import ( "github.com/prometheus/prometheus/model/labels" ) -// Parser parses samples from a byte slice of samples in the official -// Prometheus and OpenMetrics text exposition formats. +// Parser parses samples from a byte slice of samples in different exposition formats. type Parser interface { // Series returns the bytes of a series with a simple float64 as a // value, the timestamp if set, and the value of the current sample. @@ -58,6 +57,8 @@ type Parser interface { // Metric writes the labels of the current sample into the passed labels. // It returns the string from which the metric was parsed. + // The values of the "le" labels of classic histograms and "quantile" labels + // of summaries should follow the OpenMetrics formatting rules. 
Metric(l *labels.Labels) string // Exemplar writes the exemplar of the current sample into the passed diff --git a/model/textparse/openmetricsparse.go b/model/textparse/openmetricsparse.go index 13629e66db..8d3ad75c18 100644 --- a/model/textparse/openmetricsparse.go +++ b/model/textparse/openmetricsparse.go @@ -22,6 +22,7 @@ import ( "fmt" "io" "math" + "strconv" "strings" "unicode/utf8" @@ -210,7 +211,7 @@ func (p *OpenMetricsParser) Metric(l *labels.Labels) string { label := unreplace(s[a:b]) c := p.offsets[i+2] - p.start d := p.offsets[i+3] - p.start - value := unreplace(s[c:d]) + value := normalizeFloatsInLabelValues(p.mtype, label, unreplace(s[c:d])) p.builder.Add(label, value) } @@ -724,3 +725,15 @@ func (p *OpenMetricsParser) getFloatValue(t token, after string) (float64, error } return val, nil } + +// normalizeFloatsInLabelValues ensures that values of the "le" labels of classic histograms and "quantile" labels +// of summaries follow OpenMetrics formatting rules. +func normalizeFloatsInLabelValues(t model.MetricType, l, v string) string { + if (t == model.MetricTypeSummary && l == model.QuantileLabel) || (t == model.MetricTypeHistogram && l == model.BucketLabel) { + f, err := strconv.ParseFloat(v, 64) + if err == nil { + return formatOpenMetricsFloat(f) + } + } + return v +} diff --git a/model/textparse/openmetricsparse_test.go b/model/textparse/openmetricsparse_test.go index 467a237718..9c3c679ab5 100644 --- a/model/textparse/openmetricsparse_test.go +++ b/model/textparse/openmetricsparse_test.go @@ -74,6 +74,7 @@ foo_total{a="b"} 17.0 1520879607.789 # {id="counter-test"} 5 foo_created{a="b"} 1520872607.123 foo_total{le="c"} 21.0 foo_created{le="c"} 1520872621.123 +foo_total{le="1"} 10.0 # HELP bar Summary with CT at the end, making sure we find CT even if it's multiple lines a far # TYPE bar summary bar_count 17.0 @@ -97,6 +98,7 @@ something_count 18 something_sum 324789.4 something_created 1520430001 something_bucket{le="0.0"} 1 +something_bucket{le="1"} 2 something_bucket{le="+Inf"} 18 # HELP yum Summary with _created between sum and quantiles # TYPE yum summary @@ -130,7 +132,7 @@ foobar{quantile="0.99"} 150.1` }, { m: `go_gc_duration_seconds{quantile="0"}`, v: 4.9351e-05, - lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0"), + lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.0"), }, { m: `go_gc_duration_seconds{quantile="0.25"}`, v: 7.424100000000001e-05, @@ -302,6 +304,10 @@ foobar{quantile="0.99"} 150.1` v: 21.0, lset: labels.FromStrings("__name__", "foo_total", "le", "c"), ct: int64p(1520872621123), + }, { + m: `foo_total{le="1"}`, + v: 10.0, + lset: labels.FromStrings("__name__", "foo_total", "le", "1"), }, { m: "bar", help: "Summary with CT at the end, making sure we find CT even if it's multiple lines a far", @@ -385,6 +391,11 @@ foobar{quantile="0.99"} 150.1` v: 1, lset: labels.FromStrings("__name__", "something_bucket", "le", "0.0"), ct: int64p(1520430001000), + }, { + m: `something_bucket{le="1"}`, + v: 2, + lset: labels.FromStrings("__name__", "something_bucket", "le", "1.0"), + ct: int64p(1520430001000), }, { m: `something_bucket{le="+Inf"}`, v: 18, @@ -492,7 +503,7 @@ func TestUTF8OpenMetricsParse(t *testing.T) { }, { m: `{"go.gc_duration_seconds",quantile="0"}`, v: 4.9351e-05, - lset: labels.FromStrings("__name__", "go.gc_duration_seconds", "quantile", "0"), + lset: labels.FromStrings("__name__", "go.gc_duration_seconds", "quantile", "0.0"), ct: int64p(1520872607123), }, { m: 
`{"go.gc_duration_seconds",quantile="0.25"}`, diff --git a/model/textparse/promparse.go b/model/textparse/promparse.go index 5759769279..0ab932c665 100644 --- a/model/textparse/promparse.go +++ b/model/textparse/promparse.go @@ -239,7 +239,8 @@ func (p *PromParser) Metric(l *labels.Labels) string { label := unreplace(s[a:b]) c := p.offsets[i+2] - p.start d := p.offsets[i+3] - p.start - value := unreplace(s[c:d]) + value := normalizeFloatsInLabelValues(p.mtype, label, unreplace(s[c:d])) + p.builder.Add(label, value) } diff --git a/model/textparse/promparse_test.go b/model/textparse/promparse_test.go index b726d8847a..e8cf66f539 100644 --- a/model/textparse/promparse_test.go +++ b/model/textparse/promparse_test.go @@ -31,6 +31,13 @@ go_gc_duration_seconds{quantile="0.25",} 7.424100000000001e-05 go_gc_duration_seconds{quantile="0.5",a="b"} 8.3835e-05 go_gc_duration_seconds{quantile="0.8", a="b"} 8.3835e-05 go_gc_duration_seconds{ quantile="0.9", a="b"} 8.3835e-05 +# HELP prometheus_http_request_duration_seconds Histogram of latencies for HTTP requests. +# TYPE prometheus_http_request_duration_seconds histogram +prometheus_http_request_duration_seconds_bucket{handler="/",le="1"} 423 +prometheus_http_request_duration_seconds_bucket{handler="/",le="2"} 1423 +prometheus_http_request_duration_seconds_bucket{handler="/",le="+Inf"} 1423 +prometheus_http_request_duration_seconds_sum{handler="/"} 2000 +prometheus_http_request_duration_seconds_count{handler="/"} 1423 # Hrandom comment starting with prefix of HELP # wind_speed{A="2",c="3"} 12345 @@ -50,7 +57,8 @@ some:aggregate:rate5m{a_b="c"} 1 go_goroutines 33 123123 _metric_starting_with_underscore 1 testmetric{_label_starting_with_underscore="foo"} 1 -testmetric{label="\"bar\""} 1` +testmetric{label="\"bar\""} 1 +testmetric{le="10"} 1` input += "\n# HELP metric foo\x00bar" input += "\nnull_byte_metric{a=\"abc\x00\"} 1" @@ -64,7 +72,7 @@ testmetric{label="\"bar\""} 1` }, { m: `go_gc_duration_seconds{quantile="0"}`, v: 4.9351e-05, - lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0"), + lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.0"), }, { m: `go_gc_duration_seconds{quantile="0.25",}`, v: 7.424100000000001e-05, @@ -81,6 +89,32 @@ testmetric{label="\"bar\""} 1` m: `go_gc_duration_seconds{ quantile="0.9", a="b"}`, v: 8.3835e-05, lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.9", "a", "b"), + }, { + m: "prometheus_http_request_duration_seconds", + help: "Histogram of latencies for HTTP requests.", + }, { + m: "prometheus_http_request_duration_seconds", + typ: model.MetricTypeHistogram, + }, { + m: `prometheus_http_request_duration_seconds_bucket{handler="/",le="1"}`, + v: 423, + lset: labels.FromStrings("__name__", "prometheus_http_request_duration_seconds_bucket", "handler", "/", "le", "1.0"), + }, { + m: `prometheus_http_request_duration_seconds_bucket{handler="/",le="2"}`, + v: 1423, + lset: labels.FromStrings("__name__", "prometheus_http_request_duration_seconds_bucket", "handler", "/", "le", "2.0"), + }, { + m: `prometheus_http_request_duration_seconds_bucket{handler="/",le="+Inf"}`, + v: 1423, + lset: labels.FromStrings("__name__", "prometheus_http_request_duration_seconds_bucket", "handler", "/", "le", "+Inf"), + }, { + m: `prometheus_http_request_duration_seconds_sum{handler="/"}`, + v: 2000, + lset: labels.FromStrings("__name__", "prometheus_http_request_duration_seconds_sum", "handler", "/"), + }, { + m: 
`prometheus_http_request_duration_seconds_count{handler="/"}`, + v: 1423, + lset: labels.FromStrings("__name__", "prometheus_http_request_duration_seconds_count", "handler", "/"), }, { comment: "# Hrandom comment starting with prefix of HELP", }, { @@ -151,6 +185,10 @@ testmetric{label="\"bar\""} 1` m: "testmetric{label=\"\\\"bar\\\"\"}", v: 1, lset: labels.FromStrings("__name__", "testmetric", "label", `"bar"`), + }, { + m: `testmetric{le="10"}`, + v: 1, + lset: labels.FromStrings("__name__", "testmetric", "le", "10"), }, { m: "metric", help: "foo\x00bar", @@ -197,7 +235,7 @@ func TestUTF8PromParse(t *testing.T) { }, { m: `{"go.gc_duration_seconds",quantile="0"}`, v: 4.9351e-05, - lset: labels.FromStrings("__name__", "go.gc_duration_seconds", "quantile", "0"), + lset: labels.FromStrings("__name__", "go.gc_duration_seconds", "quantile", "0.0"), }, { m: `{"go.gc_duration_seconds",quantile="0.25",}`, v: 7.424100000000001e-05, From cf128a04727cc232d8d96a1be8e089c11b7e3c88 Mon Sep 17 00:00:00 2001 From: machine424 Date: Wed, 7 Aug 2024 19:14:59 +0200 Subject: [PATCH 15/15] test(cmd/prometheus): speed up test execution by t.Parallel() when possible turn some loops into subtests to make use of t.Parallel() requires Go 1.22 to make use of https://go.dev/blog/loopvar-preview Signed-off-by: machine424 --- cmd/prometheus/main_test.go | 171 ++++++++++++++++++++----------- cmd/prometheus/main_unix_test.go | 1 + cmd/prometheus/query_log_test.go | 2 + 3 files changed, 116 insertions(+), 58 deletions(-) diff --git a/cmd/prometheus/main_test.go b/cmd/prometheus/main_test.go index d0c2846bec..4bd1c71b2d 100644 --- a/cmd/prometheus/main_test.go +++ b/cmd/prometheus/main_test.go @@ -125,6 +125,7 @@ func TestFailedStartupExitCode(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() fakeInputFile := "fake-input-file" expectedExitStatus := 2 @@ -211,83 +212,125 @@ func TestWALSegmentSizeBounds(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() - for size, expectedExitStatus := range map[string]int{"9MB": 1, "257MB": 1, "10": 2, "1GB": 1, "12MB": 0} { - prom := exec.Command(promPath, "-test.main", "--storage.tsdb.wal-segment-size="+size, "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig, "--storage.tsdb.path="+filepath.Join(t.TempDir(), "data")) + for _, tc := range []struct { + size string + exitCode int + }{ + { + size: "9MB", + exitCode: 1, + }, + { + size: "257MB", + exitCode: 1, + }, + { + size: "10", + exitCode: 2, + }, + { + size: "1GB", + exitCode: 1, + }, + { + size: "12MB", + exitCode: 0, + }, + } { + t.Run(tc.size, func(t *testing.T) { + t.Parallel() + prom := exec.Command(promPath, "-test.main", "--storage.tsdb.wal-segment-size="+tc.size, "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig, "--storage.tsdb.path="+filepath.Join(t.TempDir(), "data")) - // Log stderr in case of failure. - stderr, err := prom.StderrPipe() - require.NoError(t, err) - go func() { - slurp, _ := io.ReadAll(stderr) - t.Log(string(slurp)) - }() + // Log stderr in case of failure. 
+ stderr, err := prom.StderrPipe() + require.NoError(t, err) + go func() { + slurp, _ := io.ReadAll(stderr) + t.Log(string(slurp)) + }() - err = prom.Start() - require.NoError(t, err) + err = prom.Start() + require.NoError(t, err) - if expectedExitStatus == 0 { - done := make(chan error, 1) - go func() { done <- prom.Wait() }() - select { - case err := <-done: - require.Fail(t, "prometheus should be still running: %v", err) - case <-time.After(startupTime): - prom.Process.Kill() - <-done + if tc.exitCode == 0 { + done := make(chan error, 1) + go func() { done <- prom.Wait() }() + select { + case err := <-done: + require.Fail(t, "prometheus should be still running: %v", err) + case <-time.After(startupTime): + prom.Process.Kill() + <-done + } + return } - continue - } - err = prom.Wait() - require.Error(t, err) - var exitError *exec.ExitError - require.ErrorAs(t, err, &exitError) - status := exitError.Sys().(syscall.WaitStatus) - require.Equal(t, expectedExitStatus, status.ExitStatus()) + err = prom.Wait() + require.Error(t, err) + var exitError *exec.ExitError + require.ErrorAs(t, err, &exitError) + status := exitError.Sys().(syscall.WaitStatus) + require.Equal(t, tc.exitCode, status.ExitStatus()) + }) } } func TestMaxBlockChunkSegmentSizeBounds(t *testing.T) { - t.Parallel() - if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() - for size, expectedExitStatus := range map[string]int{"512KB": 1, "1MB": 0} { - prom := exec.Command(promPath, "-test.main", "--storage.tsdb.max-block-chunk-segment-size="+size, "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig, "--storage.tsdb.path="+filepath.Join(t.TempDir(), "data")) + for _, tc := range []struct { + size string + exitCode int + }{ + { + size: "512KB", + exitCode: 1, + }, + { + size: "1MB", + exitCode: 0, + }, + } { + t.Run(tc.size, func(t *testing.T) { + t.Parallel() + prom := exec.Command(promPath, "-test.main", "--storage.tsdb.max-block-chunk-segment-size="+tc.size, "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig, "--storage.tsdb.path="+filepath.Join(t.TempDir(), "data")) - // Log stderr in case of failure. - stderr, err := prom.StderrPipe() - require.NoError(t, err) - go func() { - slurp, _ := io.ReadAll(stderr) - t.Log(string(slurp)) - }() + // Log stderr in case of failure. 
+ stderr, err := prom.StderrPipe() + require.NoError(t, err) + go func() { + slurp, _ := io.ReadAll(stderr) + t.Log(string(slurp)) + }() - err = prom.Start() - require.NoError(t, err) + err = prom.Start() + require.NoError(t, err) - if expectedExitStatus == 0 { - done := make(chan error, 1) - go func() { done <- prom.Wait() }() - select { - case err := <-done: - require.Fail(t, "prometheus should be still running: %v", err) - case <-time.After(startupTime): - prom.Process.Kill() - <-done + if tc.exitCode == 0 { + done := make(chan error, 1) + go func() { done <- prom.Wait() }() + select { + case err := <-done: + require.Fail(t, "prometheus should be still running: %v", err) + case <-time.After(startupTime): + prom.Process.Kill() + <-done + } + return } - continue - } - err = prom.Wait() - require.Error(t, err) - var exitError *exec.ExitError - require.ErrorAs(t, err, &exitError) - status := exitError.Sys().(syscall.WaitStatus) - require.Equal(t, expectedExitStatus, status.ExitStatus()) + err = prom.Wait() + require.Error(t, err) + var exitError *exec.ExitError + require.ErrorAs(t, err, &exitError) + status := exitError.Sys().(syscall.WaitStatus) + require.Equal(t, tc.exitCode, status.ExitStatus()) + }) } } @@ -353,6 +396,8 @@ func getCurrentGaugeValuesFor(t *testing.T, reg prometheus.Gatherer, metricNames } func TestAgentSuccessfulStartup(t *testing.T) { + t.Parallel() + prom := exec.Command(promPath, "-test.main", "--agent", "--web.listen-address=0.0.0.0:0", "--config.file="+agentConfig) require.NoError(t, prom.Start()) @@ -371,6 +416,8 @@ func TestAgentSuccessfulStartup(t *testing.T) { } func TestAgentFailedStartupWithServerFlag(t *testing.T) { + t.Parallel() + prom := exec.Command(promPath, "-test.main", "--agent", "--storage.tsdb.path=.", "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig) output := bytes.Buffer{} @@ -398,6 +445,8 @@ func TestAgentFailedStartupWithServerFlag(t *testing.T) { } func TestAgentFailedStartupWithInvalidConfig(t *testing.T) { + t.Parallel() + prom := exec.Command(promPath, "-test.main", "--agent", "--web.listen-address=0.0.0.0:0", "--config.file="+promConfig) require.NoError(t, prom.Start()) @@ -419,6 +468,7 @@ func TestModeSpecificFlags(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() testcases := []struct { mode string @@ -433,6 +483,7 @@ func TestModeSpecificFlags(t *testing.T) { for _, tc := range testcases { t.Run(fmt.Sprintf("%s mode with option %s", tc.mode, tc.arg), func(t *testing.T) { + t.Parallel() args := []string{"-test.main", tc.arg, t.TempDir(), "--web.listen-address=0.0.0.0:0"} if tc.mode == "agent" { @@ -484,6 +535,8 @@ func TestDocumentation(t *testing.T) { if runtime.GOOS == "windows" { t.SkipNow() } + t.Parallel() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() @@ -508,6 +561,8 @@ func TestDocumentation(t *testing.T) { } func TestRwProtoMsgFlagParser(t *testing.T) { + t.Parallel() + defaultOpts := config.RemoteWriteProtoMsgs{ config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2, } diff --git a/cmd/prometheus/main_unix_test.go b/cmd/prometheus/main_unix_test.go index 2011fb123f..94eec27e79 100644 --- a/cmd/prometheus/main_unix_test.go +++ b/cmd/prometheus/main_unix_test.go @@ -34,6 +34,7 @@ func TestStartupInterrupt(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() port := fmt.Sprintf(":%d", testutil.RandomUnprivilegedPort(t)) diff --git a/cmd/prometheus/query_log_test.go 
b/cmd/prometheus/query_log_test.go index f05ad9df2a..25abf5e965 100644 --- a/cmd/prometheus/query_log_test.go +++ b/cmd/prometheus/query_log_test.go @@ -456,6 +456,7 @@ func TestQueryLog(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } + t.Parallel() cwd, err := os.Getwd() require.NoError(t, err) @@ -474,6 +475,7 @@ func TestQueryLog(t *testing.T) { } t.Run(p.String(), func(t *testing.T) { + t.Parallel() p.run(t) }) }
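
For readers unfamiliar with the pattern the last patch applies throughout main_test.go: a map-range loop becomes a slice of named cases, each executed as a parallel subtest via t.Run. The sketch below is illustrative only — validateSize, TestSegmentSizes, and the case values are stand-ins, not Prometheus code — and it assumes Go 1.22's per-iteration loop variables, which is why the closure can capture tc without a manual `tc := tc` copy.

package sizes

import "testing"

// validateSize is a hypothetical stand-in for the behaviour under test;
// the real tests launch the Prometheus test binary instead.
func validateSize(size string) int {
	if size == "12MB" {
		return 0
	}
	return 1
}

func TestSegmentSizes(t *testing.T) {
	t.Parallel() // let this test run alongside other parallel tests

	for _, tc := range []struct {
		size     string
		exitCode int
	}{
		{size: "9MB", exitCode: 1},
		{size: "12MB", exitCode: 0},
	} {
		t.Run(tc.size, func(t *testing.T) {
			t.Parallel() // the loop's cases now run concurrently
			if got := validateSize(tc.size); got != tc.exitCode {
				t.Errorf("validateSize(%q) = %d, want %d", tc.size, got, tc.exitCode)
			}
		})
	}
}

Named subtests also improve failure output: a broken case reports as TestSegmentSizes/9MB rather than an anonymous iteration, and `go test -run TestSegmentSizes/9MB` re-runs it in isolation.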
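
The converted cases keep two checks worth calling out: when the expected exit code is zero, a select between Wait() and a timer asserts the child is still alive after the startup window; otherwise the test unwraps the *exec.ExitError and compares the status. A minimal, Unix-only sketch of both, with `sleep` and `false` as stand-ins for the Prometheus test binary and a short timeout in place of startupTime; ExitCode() is used as the portable shorthand for the Sys().(syscall.WaitStatus).ExitStatus() chain in the patch.

package sizes

import (
	"errors"
	"os/exec"
	"testing"
	"time"
)

func TestChildProcessChecks(t *testing.T) {
	t.Parallel()

	t.Run("still running after startup window", func(t *testing.T) {
		t.Parallel()
		cmd := exec.Command("sleep", "10") // stand-in for a healthy server
		if err := cmd.Start(); err != nil {
			t.Fatal(err)
		}
		done := make(chan error, 1)
		go func() { done <- cmd.Wait() }()
		select {
		case err := <-done:
			t.Fatalf("process exited early: %v", err)
		case <-time.After(100 * time.Millisecond): // startupTime in the real tests
			cmd.Process.Kill()
			<-done // reap the child before returning
		}
	})

	t.Run("non-zero exit code", func(t *testing.T) {
		t.Parallel()
		err := exec.Command("false").Run() // stand-in for a failing start
		var exitErr *exec.ExitError
		if !errors.As(err, &exitErr) {
			t.Fatalf("expected *exec.ExitError, got %v", err)
		}
		if got := exitErr.ExitCode(); got != 1 {
			t.Errorf("exit code = %d, want 1", got)
		}
	})
}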