From c25f7c600b7bae4fee845e0574eb2b2eed5cf597 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Mon, 16 Oct 2017 14:35:47 +0100 Subject: [PATCH 01/11] Apply external labels to remote read endpoint (#3263) Fixes #3261 --- web/api/v1/api.go | 17 ++++++++ web/api/v1/api_test.go | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 131ed70ea4..b8de72ebc5 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -496,6 +496,23 @@ func (api *API) remoteRead(w http.ResponseWriter, r *http.Request) { OldestInclusive: from, NewestInclusive: through, })) + externalLabels := api.config().GlobalConfig.ExternalLabels.Clone() + for _, ts := range resp.Results[i].Timeseries { + globalUsed := map[string]struct{}{} + for _, l := range ts.Labels { + if _, ok := externalLabels[model.LabelName(l.Name)]; ok { + globalUsed[l.Name] = struct{}{} + } + } + for ln, lv := range externalLabels { + if _, ok := globalUsed[string(ln)]; !ok { + ts.Labels = append(ts.Labels, &remote.LabelPair{ + Name: string(ln), + Value: string(lv), + }) + } + } + } } if err := remote.EncodeReadResponse(&resp, w); err != nil { diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 463fb1d40d..6b5e9b033f 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -14,6 +14,7 @@ package v1 import ( + "bytes" "encoding/json" "errors" "fmt" @@ -25,6 +26,8 @@ import ( "testing" "time" + "github.com/golang/protobuf/proto" + "github.com/golang/snappy" "github.com/prometheus/common/model" "github.com/prometheus/common/route" "golang.org/x/net/context" @@ -32,6 +35,8 @@ import ( "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/retrieval" + "github.com/prometheus/prometheus/storage/metric" + "github.com/prometheus/prometheus/storage/remote" ) type targetRetrieverFunc func() []*retrieval.Target @@ -532,6 +537,89 @@ func TestEndpoints(t *testing.T) { } } +func TestReadEndpoint(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 1m + test_metric1{foo="bar",baz="qux"} 1 + `) + if err != nil { + t.Fatal(err) + } + defer suite.Close() + + if err := suite.Run(); err != nil { + t.Fatal(err) + } + + api := &API{ + Storage: suite.Storage(), + QueryEngine: suite.QueryEngine(), + config: func() config.Config { + return config.Config{ + GlobalConfig: config.GlobalConfig{ + ExternalLabels: model.LabelSet{ + "baz": "a", + "b": "c", + }, + }, + } + }, + } + + // Encode the request. + matcher, err := metric.NewLabelMatcher(metric.Equal, "__name__", "test_metric1") + if err != nil { + t.Fatal(err) + } + query, err := remote.ToQuery(0, 1, metric.LabelMatchers{matcher}) + if err != nil { + t.Fatal(err) + } + req := &remote.ReadRequest{Queries: []*remote.Query{query}} + data, err := proto.Marshal(req) + if err != nil { + t.Fatal(err) + } + compressed := snappy.Encode(nil, data) + request, err := http.NewRequest("POST", "", bytes.NewBuffer(compressed)) + if err != nil { + t.Fatal(err) + } + recorder := httptest.NewRecorder() + api.remoteRead(recorder, request) + + // Decode the response. 
+ compressed, err = ioutil.ReadAll(recorder.Result().Body) + if err != nil { + t.Fatal(err) + } + uncompressed, err := snappy.Decode(nil, compressed) + if err != nil { + t.Fatal(err) + } + + var resp remote.ReadResponse + err = proto.Unmarshal(uncompressed, &resp) + if err != nil { + t.Fatal(err) + } + + if len(resp.Results) != 1 { + t.Fatalf("Expected 1 result, got %d", len(resp.Results)) + } + + result := remote.FromQueryResult(resp.Results[0]) + expected := &model.Matrix{ + &model.SampleStream{ + Metric: model.Metric{"__name__": "test_metric1", "b": "c", "baz": "qux", "foo": "bar"}, + Values: []model.SamplePair{model.SamplePair{Value: 1, Timestamp: 0}}, + }, + } + if !reflect.DeepEqual(&result, expected) { + t.Fatalf("Expected response \n%v\n but got \n%v\n", result, expected) + } +} + func TestRespondSuccess(t *testing.T) { s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { respond(w, "test") From ced935e2d2bcb253f11e233862fde40c1b0f64a1 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Thu, 19 Oct 2017 15:05:25 +0100 Subject: [PATCH 02/11] Release 1.8.1 (#3318) --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebd71eade9..483cf9792b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.8.1 / 2017-10-19 + +* [BUGFIX] Apply external labels to remote read endpoint + ## 1.8.0 / 2017-10-06 * [CHANGE] Rule links link to the _Console_ tab rather than the _Graph_ tab to From f6df3b7d5757fb358dd10194caefc0e2ecaa2c96 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Thu, 19 Oct 2017 16:45:27 +0100 Subject: [PATCH 03/11] Bump version for 1.8.1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 27f9cd322b..a8fdfda1c7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.8.0 +1.8.1 From 3a7c51ab70fc7615cd318204d3aa7c078b7c5b20 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Mon, 23 Oct 2017 15:12:22 +0100 Subject: [PATCH 04/11] Remote read endpoint should handle matchers for external labels. (#3325) If the other Prometheus has an external label that matches that of the Prometheus being read from, then we need to remove that matcher from the request as it's not actually stored in the database - it's only added for alerts, federation and on the output of the remote read endpoint. Instead we check for that label being empty, in case there is a time series with a different label value for that external label. --- CHANGELOG.md | 2 +- web/api/v1/api.go | 22 ++++++++++++++++++++-- web/api/v1/api_test.go | 11 ++++++++--- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 483cf9792b..47fd50286c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## 1.8.1 / 2017-10-19 -* [BUGFIX] Apply external labels to remote read endpoint +* [BUGFIX] Correctly handle external labels on remote read endpoint ## 1.8.0 / 2017-10-06 diff --git a/web/api/v1/api.go b/web/api/v1/api.go index b8de72ebc5..649c08f338 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -486,7 +486,26 @@ func (api *API) remoteRead(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusBadRequest) return } - iters, err := querier.QueryRange(r.Context(), from, through, matchers...) + // Change equality matchers which match external labels + // to a matcher that looks for an empty label, + // as that label should not be present in the storage. 
+ externalLabels := api.config().GlobalConfig.ExternalLabels.Clone() + filteredMatchers := make([]*metric.LabelMatcher, 0, len(matchers)) + for _, m := range matchers { + value := externalLabels[m.Name] + if m.Type == metric.Equal && value == m.Value { + matcher, err := metric.NewLabelMatcher(metric.Equal, m.Name, "") + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + filteredMatchers = append(filteredMatchers, matcher) + } else { + filteredMatchers = append(filteredMatchers, m) + } + } + + iters, err := querier.QueryRange(r.Context(), from, through, filteredMatchers...) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -496,7 +515,6 @@ func (api *API) remoteRead(w http.ResponseWriter, r *http.Request) { OldestInclusive: from, NewestInclusive: through, })) - externalLabels := api.config().GlobalConfig.ExternalLabels.Clone() for _, ts := range resp.Results[i].Timeseries { globalUsed := map[string]struct{}{} for _, l := range ts.Labels { diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 6b5e9b033f..8be5e84960 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -560,6 +560,7 @@ func TestReadEndpoint(t *testing.T) { ExternalLabels: model.LabelSet{ "baz": "a", "b": "c", + "d": "e", }, }, } @@ -567,11 +568,15 @@ func TestReadEndpoint(t *testing.T) { } // Encode the request. - matcher, err := metric.NewLabelMatcher(metric.Equal, "__name__", "test_metric1") + matcher1, err := metric.NewLabelMatcher(metric.Equal, "__name__", "test_metric1") if err != nil { t.Fatal(err) } - query, err := remote.ToQuery(0, 1, metric.LabelMatchers{matcher}) + matcher2, err := metric.NewLabelMatcher(metric.Equal, "d", "e") + if err != nil { + t.Fatal(err) + } + query, err := remote.ToQuery(0, 1, metric.LabelMatchers{matcher1, matcher2}) if err != nil { t.Fatal(err) } @@ -611,7 +616,7 @@ func TestReadEndpoint(t *testing.T) { result := remote.FromQueryResult(resp.Results[0]) expected := &model.Matrix{ &model.SampleStream{ - Metric: model.Metric{"__name__": "test_metric1", "b": "c", "baz": "qux", "foo": "bar"}, + Metric: model.Metric{"__name__": "test_metric1", "b": "c", "d": "e", "baz": "qux", "foo": "bar"}, Values: []model.SamplePair{model.SamplePair{Value: 1, Timestamp: 0}}, }, } From 53a5f522244ead65eefc2abadd1f07555f281d1c Mon Sep 17 00:00:00 2001 From: Tobias Schmidt Date: Tue, 10 Oct 2017 14:58:52 +0200 Subject: [PATCH 05/11] Import first batch of Prometheus documentation In order to provide documentation for each individual version, this commit starts moving Prometheus server specific documentation into the repository itself. --- docs/configuration.md | 1141 +++++++++++++++++++++++++++++++++++++++ docs/getting_started.md | 275 ++++++++++ docs/index.md | 16 + docs/installation.md | 96 ++++ 4 files changed, 1528 insertions(+) create mode 100644 docs/configuration.md create mode 100644 docs/getting_started.md create mode 100644 docs/index.md create mode 100644 docs/installation.md diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000000..67f214a02c --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,1141 @@ +--- +title: Configuration +sort_rank: 20 +--- + +# Configuration + +Prometheus is configured via command-line flags and a configuration file. 
While +the command-line flags configure immutable system parameters (such as storage +locations, amount of data to keep on disk and in memory, etc.), the +configuration file defines everything related to scraping [jobs and their +instances](https://prometheus.io/docs/concepts/jobs_instances/), as well as +which [rule files to load](querying/rules.md#configuring-rules). + +To view all available command-line flags, run `prometheus -h`. + +Prometheus can reload its configuration at runtime. If the new configuration +is not well-formed, the changes will not be applied. +A configuration reload is triggered by sending a `SIGHUP` to the Prometheus process or +sending a HTTP POST request to the `/-/reload` endpoint. +This will also reload any configured rule files. + +## Configuration file + +To specify which configuration file to load, use the `-config.file` flag. + +The file is written in [YAML format](http://en.wikipedia.org/wiki/YAML), +defined by the scheme described below. +Brackets indicate that a parameter is optional. For non-list parameters the +value is set to the specified default. + +Generic placeholders are defined as follows: + +* ``: a boolean that can take the values `true` or `false` +* ``: a duration matching the regular expression `[0-9]+(ms|[smhdwy])` +* ``: a string matching the regular expression `[a-zA-Z_][a-zA-Z0-9_]*` +* ``: a string of unicode characters +* ``: a valid path in the current working directory +* ``: a valid string consisting of a hostname or IP followed by an optional port number +* ``: a valid URL path +* ``: a string that can take the values `http` or `https` +* ``: a regular string +* ``: a regular string that is a secret, such as a password + +The other placeholders are specified separately. + +A valid example file can be found [here](/config/testdata/conf.good.yml). + +The global configuration specifies parameters that are valid in all other configuration +contexts. They also serve as defaults for other configuration sections. + +```yaml +global: + # How frequently to scrape targets by default. + [ scrape_interval: | default = 1m ] + + # How long until a scrape request times out. + [ scrape_timeout: | default = 10s ] + + # How frequently to evaluate rules. + [ evaluation_interval: | default = 1m ] + + # The labels to add to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + [ : ... ] + +# Rule files specifies a list of globs. Rules and alerts are read from +# all matching files. +rule_files: + [ - ... ] + +# A list of scrape configurations. +scrape_configs: + [ - ... ] + +# Alerting specifies settings related to the Alertmanager. +alerting: + alert_relabel_configs: + [ - ... ] + alertmanagers: + [ - ... ] + +# Settings related to the experimental remote write feature. +remote_write: + [ - ... ] + +# Settings related to the experimental remote read feature. +remote_read: + [ - ... ] +``` + +### `` + +A `scrape_config` section specifies a set of targets and parameters describing how +to scrape them. In the general case, one scrape configuration specifies a single +job. In advanced configurations, this may change. + +Targets may be statically configured via the `static_configs` parameter or +dynamically discovered using one of the supported service-discovery mechanisms. + +Additionally, `relabel_configs` allow advanced modifications to any +target and its labels before scraping. + +```yaml +# The job name assigned to scraped metrics by default. 
+job_name: + +# How frequently to scrape targets from this job. +[ scrape_interval: | default = ] + +# Per-scrape timeout when scraping this job. +[ scrape_timeout: | default = ] + +# The HTTP resource path on which to fetch metrics from targets. +[ metrics_path: | default = /metrics ] + +# honor_labels controls how Prometheus handles conflicts between labels that are +# already present in scraped data and labels that Prometheus would attach +# server-side ("job" and "instance" labels, manually configured target +# labels, and labels generated by service discovery implementations). +# +# If honor_labels is set to "true", label conflicts are resolved by keeping label +# values from the scraped data and ignoring the conflicting server-side labels. +# +# If honor_labels is set to "false", label conflicts are resolved by renaming +# conflicting labels in the scraped data to "exported_" (for +# example "exported_instance", "exported_job") and then attaching server-side +# labels. This is useful for use cases such as federation, where all labels +# specified in the target should be preserved. +# +# Note that any globally configured "external_labels" are unaffected by this +# setting. In communication with external systems, they are always applied only +# when a time series does not have a given label yet and are ignored otherwise. +[ honor_labels: | default = false ] + +# Configures the protocol scheme used for requests. +[ scheme: | default = http ] + +# Optional HTTP URL parameters. +params: + [ : [, ...] ] + +# Sets the `Authorization` header on every scrape request with the +# configured username and password. +basic_auth: + [ username: ] + [ password: ] + +# Sets the `Authorization` header on every scrape request with +# the configured bearer token. It is mutually exclusive with `bearer_token_file`. +[ bearer_token: ] + +# Sets the `Authorization` header on every scrape request with the bearer token +# read from the configured file. It is mutually exclusive with `bearer_token`. +[ bearer_token_file: /path/to/bearer/token/file ] + +# Configures the scrape request's TLS settings. +tls_config: + [ ] + +# Optional proxy URL. +[ proxy_url: ] + +# List of Azure service discovery configurations. +azure_sd_configs: + [ - ... ] + +# List of Consul service discovery configurations. +consul_sd_configs: + [ - ... ] + +# List of DNS service discovery configurations. +dns_sd_configs: + [ - ... ] + +# List of EC2 service discovery configurations. +ec2_sd_configs: + [ - ... ] + +# List of OpenStack service discovery configurations. +openstack_sd_configs: + [ - ... ] + +# List of file service discovery configurations. +file_sd_configs: + [ - ... ] + +# List of GCE service discovery configurations. +gce_sd_configs: + [ - ... ] + +# List of Kubernetes service discovery configurations. +kubernetes_sd_configs: + [ - ... ] + +# List of Marathon service discovery configurations. +marathon_sd_configs: + [ - ... ] + +# List of AirBnB's Nerve service discovery configurations. +nerve_sd_configs: + [ - ... ] + +# List of Zookeeper Serverset service discovery configurations. +serverset_sd_configs: + [ - ... ] + +# List of Triton service discovery configurations. +triton_sd_configs: + [ - ... ] + +# List of labeled statically configured targets for this job. +static_configs: + [ - ... ] + +# List of target relabel configurations. +relabel_configs: + [ - ... ] + +# List of metric relabel configurations. +metric_relabel_configs: + [ - ... ] + +# Per-scrape limit on number of scraped samples that will be accepted. 
+# If more than this number of samples are present after metric relabelling +# the entire scrape will be treated as failed. 0 means no limit. +[ sample_limit: | default = 0 ] +``` + +Where `` must be unique across all scrape configurations. + +### `` + +A `tls_config` allows configuring TLS connections. + +```yaml +# CA certificate to validate API server certificate with. +[ ca_file: ] + +# Certificate and key files for client cert authentication to the server. +[ cert_file: ] +[ key_file: ] + +# ServerName extension to indicate the name of the server. +# http://tools.ietf.org/html/rfc4366#section-3.1 +[ server_name: ] + +# Disable validation of the server certificate. +[ insecure_skip_verify: ] +``` + +### `` + +CAUTION: Azure SD is in beta: breaking changes to configuration are still +likely in future releases. + +Azure SD configurations allow retrieving scrape targets from Azure VMs. + +The following meta labels are available on targets during relabeling: + +* `__meta_azure_machine_id`: the machine ID +* `__meta_azure_machine_location`: the location the machine runs in +* `__meta_azure_machine_name`: the machine name +* `__meta_azure_machine_private_ip`: the machine's private IP +* `__meta_azure_machine_resource_group`: the machine's resource group +* `__meta_azure_machine_tag_`: each tag value of the machine + +See below for the configuration options for Azure discovery: + +```yaml +# The information to access the Azure API. +# The subscription ID. +subscription_id: +# The tenant ID. +tenant_id: +# The client ID. +client_id: +# The client secret. +client_secret: + +# Refresh interval to re-read the instance list. +[ refresh_interval: | default = 300s ] + +# The port to scrape metrics from. If using the public IP address, this must +# instead be specified in the relabeling rule. +[ port: | default = 80 ] +``` + +### `` + +Consul SD configurations allow retrieving scrape targets from [Consul's](https://www.consul.io) +Catalog API. + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_consul_address`: the address of the target +* `__meta_consul_dc`: the datacenter name for the target +* `__meta_consul_node`: the node name defined for the target +* `__meta_consul_service_address`: the service address of the target +* `__meta_consul_service_id`: the service ID of the target +* `__meta_consul_service_port`: the service port of the target +* `__meta_consul_service`: the name of the service the target belongs to +* `__meta_consul_tags`: the list of tags of the target joined by the tag separator + +```yaml +# The information to access the Consul API. It is to be defined +# as the Consul documentation requires. +server: +[ token: ] +[ datacenter: ] +[ scheme: ] +[ username: ] +[ password: ] + +# A list of services for which targets are retrieved. If omitted, all services +# are scraped. +services: + [ - ] + +# The string by which Consul tags are joined into the tag label. +[ tag_separator: | default = , ] +``` + +Note that the IP number and port used to scrape the targets is assembled as +`<__meta_consul_address>:<__meta_consul_service_port>`. However, in some +Consul setups, the relevant address is in `__meta_consul_service_address`. +In those cases, you can use the [relabel](#relabel_config) +feature to replace the special `__address__` label. + +### `` + +A DNS-based service discovery configuration allows specifying a set of DNS +domain names which are periodically queried to discover a list of targets. 
The +DNS servers to be contacted are read from `/etc/resolv.conf`. + +This service discovery method only supports basic DNS A, AAAA and SRV record +queries, but not the advanced DNS-SD approach specified in +[RFC6763](https://tools.ietf.org/html/rfc6763). + +During the [relabeling phase](#relabel_config), the meta label +`__meta_dns_name` is available on each target and is set to the +record name that produced the discovered target. + +```yaml +# A list of DNS domain names to be queried. +names: + [ - ] + +# The type of DNS query to perform. +[ type: | default = 'SRV' ] + +# The port number used if the query type is not SRV. +[ port: ] + +# The time after which the provided names are refreshed. +[ refresh_interval: | default = 30s ] +``` + +Where `` is a valid DNS domain name. +Where `` is `SRV`, `A`, or `AAAA`. + +### `` + +EC2 SD configurations allow retrieving scrape targets from AWS EC2 +instances. The private IP address is used by default, but may be changed to +the public IP address with relabeling. + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_ec2_availability_zone`: the availability zone in which the instance is running +* `__meta_ec2_instance_id`: the EC2 instance ID +* `__meta_ec2_instance_state`: the state of the EC2 instance +* `__meta_ec2_instance_type`: the type of the EC2 instance +* `__meta_ec2_private_ip`: the private IP address of the instance, if present +* `__meta_ec2_public_dns_name`: the public DNS name of the instance, if available +* `__meta_ec2_public_ip`: the public IP address of the instance, if available +* `__meta_ec2_subnet_id`: comma separated list of subnets IDs in which the instance is running, if available +* `__meta_ec2_tag_`: each tag value of the instance +* `__meta_ec2_vpc_id`: the ID of the VPC in which the instance is running, if available + +See below for the configuration options for EC2 discovery: + +```yaml +# The information to access the EC2 API. + +# The AWS Region. +region: + +# The AWS API keys. If blank, the environment variables `AWS_ACCESS_KEY_ID` +# and `AWS_SECRET_ACCESS_KEY` are used. +[ access_key: ] +[ secret_key: ] +# Named AWS profile used to connect to the API. +[ profile: ] + +# Refresh interval to re-read the instance list. +[ refresh_interval: | default = 60s ] + +# The port to scrape metrics from. If using the public IP address, this must +# instead be specified in the relabeling rule. +[ port: | default = 80 ] +``` + +### `` + +CAUTION: OpenStack SD is in beta: breaking changes to configuration are still +likely in future releases. + +OpenStack SD configurations allow retrieving scrape targets from OpenStack Nova +instances. + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_openstack_instance_id`: the OpenStack instance ID +* `__meta_openstack_instance_name`: the OpenStack instance name +* `__meta_openstack_instance_status`: the status of the OpenStack instance +* `__meta_openstack_instance_flavor`: the flavor of the OpenStack instance +* `__meta_openstack_public_ip`: the public IP of the OpenStack instance +* `__meta_openstack_private_ip`: the private IP of the OpenStack instance +* `__meta_openstack_tag_`: each tag value of the instance + +See below for the configuration options for OpenStack discovery: + +```yaml +# The information to access the OpenStack API. + +# The OpenStack Region. 
+region: + +# identity_endpoint specifies the HTTP endpoint that is required to work with +# the Identity API of the appropriate version. While it's ultimately needed by +# all of the identity services, it will often be populated by a provider-level +# function. +[ identity_endpoint: ] + +# username is required if using Identity V2 API. Consult with your provider's +# control panel to discover your account's username. In Identity V3, either +# userid or a combination of username and domain_id or domain_name are needed. +[ username: ] +[ userid: ] + +# password for the Identity V2 and V3 APIs. Consult with your provider's +# control panel to discover your account's preferred method of authentication. +[ password: ] + +# At most one of domain_id and domain_name must be provided if using username +# with Identity V3. Otherwise, either are optional. +[ domain_name: ] +[ domain_id: ] + +# The project_id and project_name fields are optional for the Identity V2 API. +# Some providers allow you to specify a project_name instead of the project_id. +# Some require both. Your provider's authentication policies will determine +# how these fields influence authentication. +[ project_name: ] +[ project_id: ] + +# Refresh interval to re-read the instance list. +[ refresh_interval: | default = 60s ] + +# The port to scrape metrics from. If using the public IP address, this must +# instead be specified in the relabeling rule. +[ port: | default = 80 ] +``` + +### `` + +File-based service discovery provides a more generic way to configure static targets +and serves as an interface to plug in custom service discovery mechanisms. + +It reads a set of files containing a list of zero or more +``s. Changes to all defined files are detected via disk watches +and applied immediately. Files may be provided in YAML or JSON format. Only +changes resulting in well-formed target groups are applied. + +The JSON file must contain a list of static configs, using this format: + +```yaml +[ + { + "targets": [ "", ... ], + "labels": { + "": "", ... + } + }, + ... +] +``` + +As a fallback, the file contents are also re-read periodically at the specified +refresh interval. + +Each target has a meta label `__meta_filepath` during the +[relabeling phase](#relabel_config). Its value is set to the +filepath from which the target was extracted. + +There is a list of +[integrations](/docs/operating/configuration/#) with this +discovery mechanism. + +```yaml +# Patterns for files from which target groups are extracted. +files: + [ - ... ] + +# Refresh interval to re-read the files. +[ refresh_interval: | default = 5m ] +``` + +Where `` may be a path ending in `.json`, `.yml` or `.yaml`. The last path segment +may contain a single `*` that matches any character sequence, e.g. `my/path/tg_*.json`. + +### `` + +CAUTION: GCE SD is in beta: breaking changes to configuration are still +likely in future releases. + +[GCE](https://cloud.google.com/compute/) SD configurations allow retrieving scrape targets from GCP GCE instances. +The private IP address is used by default, but may be changed to the public IP +address with relabeling. 
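For instance, a relabel rule along the following lines (illustrative only; the scrape port `9100` is an arbitrary example) rewrites the target address to the instance's public IP whenever one is present:

```yaml
relabel_configs:
  # If the instance has a public IP, scrape that address instead of the
  # default private IP. The regex (.+) ensures the rule only fires when
  # __meta_gce_public_ip is non-empty.
  - source_labels: [__meta_gce_public_ip]
    regex: (.+)
    target_label: __address__
    replacement: '${1}:9100'
```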
+ +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_gce_instance_name`: the name of the instance +* `__meta_gce_metadata_`: each metadata item of the instance +* `__meta_gce_network`: the network URL of the instance +* `__meta_gce_private_ip`: the private IP address of the instance +* `__meta_gce_project`: the GCP project in which the instance is running +* `__meta_gce_public_ip`: the public IP address of the instance, if present +* `__meta_gce_subnetwork`: the subnetwork URL of the instance +* `__meta_gce_tags`: comma separated list of instance tags +* `__meta_gce_zone`: the GCE zone URL in which the instance is running + +See below for the configuration options for GCE discovery: + +```yaml +# The information to access the GCE API. + +# The GCP Project +project: + +# The zone of the scrape targets. If you need multiple zones use multiple +# gce_sd_configs. +zone: + +# Filter can be used optionally to filter the instance list by other criteria +[ filter: ] + +# Refresh interval to re-read the instance list +[ refresh_interval: | default = 60s ] + +# The port to scrape metrics from. If using the public IP address, this must +# instead be specified in the relabeling rule. +[ port: | default = 80 ] + +# The tag separator is used to separate the tags on concatenation +[ tag_separator: | default = , ] +``` + +Credentials are discovered by the Google Cloud SDK default client by looking +in the following places, preferring the first location found: + +1. a JSON file specified by the `GOOGLE_APPLICATION_CREDENTIALS` environment variable +2. a JSON file in the well-known path `$HOME/.config/gcloud/application_default_credentials.json` +3. fetched from the GCE metadata server + +If Prometheus is running within GCE, the service account associated with the +instance it is running on should have at least read-only permissions to the +compute resources. If running outside of GCE make sure to create an appropriate +service account and place the credential file in one of the expected locations. + +### `` + +CAUTION: Kubernetes SD is in beta: breaking changes to configuration are still +likely in future releases. + +Kubernetes SD configurations allow retrieving scrape targets from +[Kubernetes'](http://kubernetes.io/) REST API and always staying synchronized with +the cluster state. + +One of the following `role` types can be configured to discover targets: + +#### `node` + +The `node` role discovers one target per cluster node with the address defaulting +to the Kubelet's HTTP port. +The target address defaults to the first existing address of the Kubernetes +node object in the address type order of `NodeInternalIP`, `NodeExternalIP`, +`NodeLegacyHostIP`, and `NodeHostName`. + +Available meta labels: + +* `__meta_kubernetes_node_name`: The name of the node object. +* `__meta_kubernetes_node_label_`: Each label from the node object. +* `__meta_kubernetes_node_annotation_`: Each annotation from the node object. +* `__meta_kubernetes_node_address_`: The first address for each node address type, if it exists. + +In addition, the `instance` label for the node will be set to the node name +as retrieved from the API server. + +#### `service` + +The `service` role discovers a target for each service port for each service. +This is generally useful for blackbox monitoring of a service. +The address will be set to the Kubernetes DNS name of the service and respective +service port. 
+ +Available meta labels: + +* `__meta_kubernetes_namespace`: The namespace of the service object. +* `__meta_kubernetes_service_name`: The name of the service object. +* `__meta_kubernetes_service_label_`: The label of the service object. +* `__meta_kubernetes_service_annotation_`: The annotation of the service object. +* `__meta_kubernetes_service_port_name`: Name of the service port for the target. +* `__meta_kubernetes_service_port_number`: Number of the service port for the target. +* `__meta_kubernetes_service_port_protocol`: Protocol of the service port for the target. + +#### `pod` + +The `pod` role discovers all pods and exposes their containers as targets. For each declared +port of a container, a single target is generated. If a container has no specified ports, +a port-free target per container is created for manually adding a port via relabeling. + +Available meta labels: + +* `__meta_kubernetes_namespace`: The namespace of the pod object. +* `__meta_kubernetes_pod_name`: The name of the pod object. +* `__meta_kubernetes_pod_ip`: The pod IP of the pod object. +* `__meta_kubernetes_pod_label_`: The label of the pod object. +* `__meta_kubernetes_pod_annotation_`: The annotation of the pod object. +* `__meta_kubernetes_pod_container_name`: Name of the container the target address points to. +* `__meta_kubernetes_pod_container_port_name`: Name of the container port. +* `__meta_kubernetes_pod_container_port_number`: Number of the container port. +* `__meta_kubernetes_pod_container_port_protocol`: Protocol of the container port. +* `__meta_kubernetes_pod_ready`: Set to `true` or `false` for the pod's ready state. +* `__meta_kubernetes_pod_node_name`: The name of the node the pod is scheduled onto. +* `__meta_kubernetes_pod_host_ip`: The current host IP of the pod object. + +#### `endpoints` + +The `endpoints` role discovers targets from listed endpoints of a service. For each endpoint +address one target is discovered per port. If the endpoint is backed by a pod, all +additional container ports of the pod, not bound to an endpoint port, are discovered as targets as well. + +Available meta labels: + +* `__meta_kubernetes_namespace`: The namespace of the endpoints object. +* `__meta_kubernetes_endpoints_name`: The names of the endpoints object. +* For all targets discovered directly from the endpoints list (those not additionally inferred + from underlying pods), the following labels are attached: + * `__meta_kubernetes_endpoint_ready`: Set to `true` or `false` for the endpoint's ready state. + * `__meta_kubernetes_endpoint_port_name`: Name of the endpoint port. + * `__meta_kubernetes_endpoint_port_protocol`: Protocol of the endpoint port. +* If the endpoints belong to a service, all labels of the `role: service` discovery are attached. +* For all targets backed by a pod, all labels of the `role: pod` discovery are attached. + +See below for the configuration options for Kubernetes discovery: + +```yaml +# The information to access the Kubernetes API. + +# The API server addresses. If left empty, Prometheus is assumed to run inside +# of the cluster and will discover API servers automatically and use the pod's +# CA certificate and bearer token file at /var/run/secrets/kubernetes.io/serviceaccount/. +[ api_server: ] + +# The Kubernetes role of entities that should be discovered. +role: + +# Optional authentication information used to authenticate to the API server. +# Note that `basic_auth`, `bearer_token` and `bearer_token_file` options are +# mutually exclusive. 
+ +# Optional HTTP basic authentication information. +basic_auth: + [ username: ] + [ password: ] + +# Optional bearer token authentication information. +[ bearer_token: ] + +# Optional bearer token file authentication information. +[ bearer_token_file: ] + +# TLS configuration. +tls_config: + [ ] + +# Optional namespace discovery. If omitted, all namespaces are used. +namespaces: + names: + [ - ] +``` + +Where `` must be `endpoints`, `service`, `pod`, or `node`. + +See [this example Prometheus configuration file](/documentation/examples/prometheus-kubernetes.yml) +for a detailed example of configuring Prometheus for Kubernetes. + +You may wish to check out the 3rd party [Prometheus Operator](https://github.com/coreos/prometheus-operator), +which automates the Prometheus setup on top of Kubernetes. + +### `` + +CAUTION: Marathon SD is in beta: breaking changes to configuration are still +likely in future releases. + +Marathon SD configurations allow retrieving scrape targets using the +[Marathon](https://mesosphere.github.io/marathon/) REST API. Prometheus +will periodically check the REST endpoint for currently running tasks and +create a target group for every app that has at least one healthy task. + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_marathon_app`: the name of the app (with slashes replaced by dashes) +* `__meta_marathon_image`: the name of the Docker image used (if available) +* `__meta_marathon_task`: the ID of the Mesos task +* `__meta_marathon_app_label_`: any Marathon labels attached to the app +* `__meta_marathon_port_definition_label_`: the port definition labels +* `__meta_marathon_port_mapping_label_`: the port mapping labels + +See below for the configuration options for Marathon discovery: + +```yaml +# List of URLs to be used to contact Marathon servers. +# You need to provide at least one server URL, but should provide URLs for +# all masters you have running. +servers: + - + +# Optional bearer token authentication information. +# It is mutually exclusive with `bearer_token_file`. +[ bearer_token: ] + +# Optional bearer token file authentication information. +# It is mutually exclusive with `bearer_token`. +[ bearer_token_file: ] + +# Polling interval +[ refresh_interval: | default = 30s ] +``` + +By default every app listed in Marathon will be scraped by Prometheus. If not all +of your services provide Prometheus metrics, you can use a Marathon label and +Prometheus relabeling to control which instances will actually be scraped. Also +by default all apps will show up as a single job in Prometheus (the one specified +in the configuration file), which can also be changed using relabeling. + +### `` + +Nerve SD configurations allow retrieving scrape targets from [AirBnB's +Nerve](https://github.com/airbnb/nerve) which are stored in +[Zookeeper](https://zookeeper.apache.org/). + +The following meta labels are available on targets during [relabeling](#relabel_config): + +* `__meta_nerve_path`: the full path to the endpoint node in Zookeeper +* `__meta_nerve_endpoint_host`: the host of the endpoint +* `__meta_nerve_endpoint_port`: the port of the endpoint +* `__meta_nerve_endpoint_name`: the name of the endpoint + +```yaml +# The Zookeeper servers. +servers: + - +# Paths can point to a single service, or the root of a tree of services. 
+paths: + - +[ timeout: | default = 10s ] +``` + +### `` + +Serverset SD configurations allow retrieving scrape targets from +[Serversets](https://github.com/twitter/finagle/tree/master/finagle-serversets) +which are stored in [Zookeeper](https://zookeeper.apache.org/). Serversets are +commonly used by [Finagle](https://twitter.github.io/finagle/) and +[Aurora](http://aurora.apache.org/). + +The following meta labels are available on targets during relabeling: + +* `__meta_serverset_path`: the full path to the serverset member node in Zookeeper +* `__meta_serverset_endpoint_host`: the host of the default endpoint +* `__meta_serverset_endpoint_port`: the port of the default endpoint +* `__meta_serverset_endpoint_host_`: the host of the given endpoint +* `__meta_serverset_endpoint_port_`: the port of the given endpoint +* `__meta_serverset_shard`: the shard number of the member +* `__meta_serverset_status`: the status of the member + +```yaml +# The Zookeeper servers. +servers: + - +# Paths can point to a single serverset, or the root of a tree of serversets. +paths: + - +[ timeout: | default = 10s ] +``` + +Serverset data must be in the JSON format, the Thrift format is not currently supported. + +### `` + +CAUTION: Triton SD is in beta: breaking changes to configuration are still +likely in future releases. + +[Triton](https://github.com/joyent/triton) SD configurations allow retrieving +scrape targets from [Container Monitor](https://github.com/joyent/rfd/blob/master/rfd/0027/README.md) +discovery endpoints. + +The following meta labels are available on targets during relabeling: + +* `__meta_triton_machine_id`: the UUID of the target container +* `__meta_triton_machine_alias`: the alias of the target container +* `__meta_triton_machine_image`: the target containers image type +* `__meta_triton_machine_server_id`: the server UUID for the target container + +```yaml +# The information to access the Triton discovery API. + +# The account to use for discovering new target containers. +account: + +# The DNS suffix which should be applied to target containers. +dns_suffix: + +# The Triton discovery endpoint (e.g. 'cmon.us-east-3b.triton.zone'). This is +# often the same value as dns_suffix. +endpoint: + +# The port to use for discovery and metric scraping. +[ port: | default = 9163 ] + +# The interval which should should be used for refreshing target containers. +[ refresh_interval: | default = 60s ] + +# The Triton discovery API version. +[ version: | default = 1 ] + +# TLS configuration. +tls_config: + [ ] +``` + +### `` + +A `static_config` allows specifying a list of targets and a common label set +for them. It is the canonical way to specify static targets in a scrape +configuration. + +```yaml +# The targets specified by the static config. +targets: + [ - '' ] + +# Labels assigned to all metrics scraped from the targets. +labels: + [ : ... ] +``` + +### `` + +Relabeling is a powerful tool to dynamically rewrite the label set of a target before +it gets scraped. Multiple relabeling steps can be configured per scrape configuration. +They are applied to the label set of each target in order of their appearance +in the configuration file. + +Initially, aside from the configured per-target labels, a target's `job` +label is set to the `job_name` value of the respective scrape configuration. +The `__address__` label is set to the `:` address of the target. +After relabeling, the `instance` label is set to the value of `__address__` by default if +it was not set during relabeling. 
The `__scheme__` and `__metrics_path__` labels +are set to the scheme and metrics path of the target respectively. The `__param_` +label is set to the value of the first passed URL parameter called ``. + +Additional labels prefixed with `__meta_` may be available during the +relabeling phase. They are set by the service discovery mechanism that provided +the target and vary between mechanisms. + +Labels starting with `__` will be removed from the label set after relabeling is completed. + +If a relabeling step needs to store a label value only temporarily (as the +input to a subsequent relabeling step), use the `__tmp` label name prefix. This +prefix is guaranteed to never be used by Prometheus itself. + +```yaml +# The source labels select values from existing labels. Their content is concatenated +# using the configured separator and matched against the configured regular expression +# for the replace, keep, and drop actions. +[ source_labels: '[' [, ...] ']' ] + +# Separator placed between concatenated source label values. +[ separator: | default = ; ] + +# Label to which the resulting value is written in a replace action. +# It is mandatory for replace actions. Regex capture groups are available. +[ target_label: ] + +# Regular expression against which the extracted value is matched. +[ regex: | default = (.*) ] + +# Modulus to take of the hash of the source label values. +[ modulus: ] + +# Replacement value against which a regex replace is performed if the +# regular expression matches. Regex capture groups are available. +[ replacement: | default = $1 ] + +# Action to perform based on regex matching. +[ action: | default = replace ] +``` + +`` is any valid +[RE2 regular expression](https://github.com/google/re2/wiki/Syntax). It is +required for the `replace`, `keep`, `drop`, `labelmap`,`labeldrop` and `labelkeep` actions. The regex is +anchored on both ends. To un-anchor the regex, use `.*.*`. + +`` determines the relabeling action to take: + +* `replace`: Match `regex` against the concatenated `source_labels`. Then, set + `target_label` to `replacement`, with match group references + (`${1}`, `${2}`, ...) in `replacement` substituted by their value. If `regex` + does not match, no replacement takes place. +* `keep`: Drop targets for which `regex` does not match the concatenated `source_labels`. +* `drop`: Drop targets for which `regex` matches the concatenated `source_labels`. +* `hashmod`: Set `target_label` to the `modulus` of a hash of the concatenated `source_labels`. +* `labelmap`: Match `regex` against all label names. Then copy the values of the matching labels + to label names given by `replacement` with match group references + (`${1}`, `${2}`, ...) in `replacement` substituted by their value. +* `labeldrop`: Match `regex` against all label names. Any label that matches will be + removed from the set of labels. +* `labelkeep`: Match `regex` against all label names. Any label that does not match will be + removed from the set of labels. + +Care must be taken with `labeldrop` and `labelkeep` to ensure that metrics are still uniquely labeled +once the labels are removed. + +### `` + +Metric relabeling is applied to samples as the last step before ingestion. It +has the same configuration format and actions as target relabeling. Metric +relabeling does not apply to automatically generated timeseries such as `up`. + +One use for this is to blacklist time series that are too expensive to ingest. 
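For example, a rule along these lines (the metric name pattern is purely illustrative) drops every sample whose metric name matches the given regex before it is ingested:

```yaml
metric_relabel_configs:
  # Drop all series whose metric name matches the expensive pattern.
  - source_labels: [__name__]
    regex: 'expensive_histogram_.*'
    action: drop
```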
+ +### `` + +Alert relabeling is applied to alerts before they are sent to the Alertmanager. +It has the same configuration format and actions as target relabeling. Alert +relabeling is applied after external labels. + +One use for this is ensuring a HA pair of Prometheus servers with different +external labels send identical alerts. + +### `` + +CAUTION: Dynamic discovery of Alertmanager instances is in alpha state. Breaking configuration +changes may happen in future releases. Use static configuration via the `-alertmanager.url` flag +as a stable alternative. + +An `alertmanager_config` section specifies Alertmanager instances the Prometheus server sends +alerts to. It also provides parameters to configure how to communicate with these Alertmanagers. + +Alertmanagers may be statically configured via the `static_configs` parameter or +dynamically discovered using one of the supported service-discovery mechanisms. + +Additionally, `relabel_configs` allow selecting Alertmanagers from discovered +entities and provide advanced modifications to the used API path, which is exposed +through the `__alerts_path__` label. + +```yaml +# Per-target Alertmanager timeout when pushing alerts. +[ timeout: | default = 10s ] + +# Prefix for the HTTP path alerts are pushed to. +[ path_prefix: | default = / ] + +# Configures the protocol scheme used for requests. +[ scheme: | default = http ] + +# Sets the `Authorization` header on every request with the +# configured username and password. +basic_auth: + [ username: ] + [ password: ] + +# Sets the `Authorization` header on every request with +# the configured bearer token. It is mutually exclusive with `bearer_token_file`. +[ bearer_token: ] + +# Sets the `Authorization` header on every request with the bearer token +# read from the configured file. It is mutually exclusive with `bearer_token`. +[ bearer_token_file: /path/to/bearer/token/file ] + +# Configures the scrape request's TLS settings. +tls_config: + [ ] + +# Optional proxy URL. +[ proxy_url: ] + +# List of Azure service discovery configurations. +azure_sd_configs: + [ - ... ] + +# List of Consul service discovery configurations. +consul_sd_configs: + [ - ... ] + +# List of DNS service discovery configurations. +dns_sd_configs: + [ - ... ] + +# List of EC2 service discovery configurations. +ec2_sd_configs: + [ - ... ] + +# List of file service discovery configurations. +file_sd_configs: + [ - ... ] + +# List of GCE service discovery configurations. +gce_sd_configs: + [ - ... ] + +# List of Kubernetes service discovery configurations. +kubernetes_sd_configs: + [ - ... ] + +# List of Marathon service discovery configurations. +marathon_sd_configs: + [ - ... ] + +# List of AirBnB's Nerve service discovery configurations. +nerve_sd_configs: + [ - ... ] + +# List of Zookeeper Serverset service discovery configurations. +serverset_sd_configs: + [ - ... ] + +# List of Triton service discovery configurations. +triton_sd_configs: + [ - ... ] + +# List of labeled statically configured Alertmanagers. +static_configs: + [ - ... ] + +# List of Alertmanager relabel configurations. +relabel_configs: + [ - ... ] +``` + +### `` + +CAUTION: Remote write is experimental: breaking changes to configuration are +likely in future releases. + +`write_relabel_configs` is relabeling applied to samples before sending them +to the remote endpoint. Write relabeling is applied after external labels. This +could be used to limit which samples are sent. 
+ +There is a [small demo](/documentation/examples/remote_storage) of how to use +this functionality. + +```yaml +# The URL of the endpoint to send samples to. +url: + +# Timeout for requests to the remote write endpoint. +[ remote_timeout: | default = 30s ] + +# List of remote write relabel configurations. +write_relabel_configs: + [ - ... ] + +# Sets the `Authorization` header on every remote write request with the +# configured username and password. +basic_auth: + [ username: ] + [ password: ] + +# Sets the `Authorization` header on every remote write request with +# the configured bearer token. It is mutually exclusive with `bearer_token_file`. +[ bearer_token: ] + +# Sets the `Authorization` header on every remote write request with the bearer token +# read from the configured file. It is mutually exclusive with `bearer_token`. +[ bearer_token_file: /path/to/bearer/token/file ] + +# Configures the remote write request's TLS settings. +tls_config: + [ ] + +# Optional proxy URL. +[ proxy_url: ] +``` + +There is a list of +[integrations](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) +with this feature. + +### `` + +CAUTION: Remote read is experimental: breaking changes to configuration are +likely in future releases. + +```yaml +# The URL of the endpoint to query from. +url: + +# Timeout for requests to the remote read endpoint. +[ remote_timeout: | default = 30s ] + +# Sets the `Authorization` header on every remote read request with the +# configured username and password. +basic_auth: + [ username: ] + [ password: ] + +# Sets the `Authorization` header on every remote read request with +# the configured bearer token. It is mutually exclusive with `bearer_token_file`. +[ bearer_token: ] + +# Sets the `Authorization` header on every remote read request with the bearer token +# read from the configured file. It is mutually exclusive with `bearer_token`. +[ bearer_token_file: /path/to/bearer/token/file ] + +# Configures the remote read request's TLS settings. +tls_config: + [ ] + +# Optional proxy URL. +[ proxy_url: ] +``` + +There is a list of +[integrations](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) +with this feature. diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 0000000000..8585e25664 --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,275 @@ +--- +title: Getting started +sort_rank: 10 +--- + +# Getting started + +This guide is a "Hello World"-style tutorial which shows how to install, +configure, and use Prometheus in a simple example setup. You will download and run +Prometheus locally, configure it to scrape itself and an example application, +and then work with queries, rules, and graphs to make use of the collected time +series data. + +## Downloading and running Prometheus + +[Download the latest release](https://prometheus.io/download) of Prometheus for +your platform, then extract and run it: + +```bash +tar xvfz prometheus-*.tar.gz +cd prometheus-* +``` + +Before starting Prometheus, let's configure it. + +## Configuring Prometheus to monitor itself + +Prometheus collects metrics from monitored targets by scraping metrics HTTP +endpoints on these targets. Since Prometheus also exposes data in the same +manner about itself, it can also scrape and monitor its own health. + +While a Prometheus server that collects only data about itself is not very +useful in practice, it is a good starting example. 
Save the following basic +Prometheus configuration as a file named `prometheus.yml`: + +```yaml +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'codelab-monitor' + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['localhost:9090'] +``` + +For a complete specification of configuration options, see the +[configuration documentation](configuration.md). + +## Starting Prometheus + +To start Prometheus with your newly created configuration file, change to your +Prometheus build directory and run: + +```bash +# Start Prometheus. +# By default, Prometheus stores its database in ./data (flag -storage.local.path). +./prometheus -config.file=prometheus.yml +``` + +Prometheus should start up and it should show a status page about itself at +[localhost:9090](http://localhost:9090). Give it a couple of seconds to collect +data about itself from its own HTTP metrics endpoint. + +You can also verify that Prometheus is serving metrics about itself by +navigating to its metrics endpoint: +[localhost:9090/metrics](http://localhost:9090/metrics) + +The number of OS threads executed by Prometheus is controlled by the +`GOMAXPROCS` environment variable. As of Go 1.5 the default value is +the number of cores available. + +Blindly setting `GOMAXPROCS` to a high value can be +counterproductive. See the relevant [Go +FAQs](http://golang.org/doc/faq#Why_no_multi_CPU). + +Note that Prometheus by default uses around 3GB in memory. If you have a +smaller machine, you can tune Prometheus to use less memory. For details, +see the [memory usage documentation](storage.md#memory-usage). + +## Using the expression browser + +Let us try looking at some data that Prometheus has collected about itself. To +use Prometheus's built-in expression browser, navigate to +http://localhost:9090/graph and choose the "Console" view within the "Graph" +tab. + +As you can gather from http://localhost:9090/metrics, one metric that +Prometheus exports about itself is called +`prometheus_target_interval_length_seconds` (the actual amount of time between +target scrapes). Go ahead and enter this into the expression console: + +``` +prometheus_target_interval_length_seconds +``` + +This should return a lot of different time series (along with the latest value +recorded for each), all with the metric name +`prometheus_target_interval_length_seconds`, but with different labels. These +labels designate different latency percentiles and target group intervals. + +If we were only interested in the 99th percentile latencies, we could use this +query to retrieve that information: + +``` +prometheus_target_interval_length_seconds{quantile="0.99"} +``` + +To count the number of returned time series, you could write: + +``` +count(prometheus_target_interval_length_seconds) +``` + +For more about the expression language, see the +[expression language documentation](querying/basics.md). + +## Using the graphing interface + +To graph expressions, navigate to http://localhost:9090/graph and use the "Graph" +tab. 
+ +For example, enter the following expression to graph the per-second rate of all +storage chunk operations happening in the self-scraped Prometheus: + +``` +rate(prometheus_local_storage_chunk_ops_total[1m]) +``` + +Experiment with the graph range parameters and other settings. + +## Starting up some sample targets + +Let us make this more interesting and start some example targets for Prometheus +to scrape. + +The Go client library includes an example which exports fictional RPC latencies +for three services with different latency distributions. + +Ensure you have the [Go compiler installed](https://golang.org/doc/install) and +have a [working Go build environment](https://golang.org/doc/code.html) (with +correct `GOPATH`) set up. + +Download the Go client library for Prometheus and run three of these example +processes: + +```bash +# Fetch the client library code and compile example. +git clone https://github.com/prometheus/client_golang.git +cd client_golang/examples/random +go get -d +go build + +# Start 3 example targets in separate terminals: +./random -listen-address=:8080 +./random -listen-address=:8081 +./random -listen-address=:8082 +``` + +You should now have example targets listening on http://localhost:8080/metrics, +http://localhost:8081/metrics, and http://localhost:8082/metrics. + +## Configuring Prometheus to monitor the sample targets + +Now we will configure Prometheus to scrape these new targets. Let's group all +three endpoints into one job called `example-random`. However, imagine that the +first two endpoints are production targets, while the third one represents a +canary instance. To model this in Prometheus, we can add several groups of +endpoints to a single job, adding extra labels to each group of targets. In +this example, we will add the `group="production"` label to the first group of +targets, while adding `group="canary"` to the second. + +To achieve this, add the following job definition to the `scrape_configs` +section in your `prometheus.yml` and restart your Prometheus instance: + +```yaml +scrape_configs: + - job_name: 'example-random' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['localhost:8080', 'localhost:8081'] + labels: + group: 'production' + + - targets: ['localhost:8082'] + labels: + group: 'canary' +``` + +Go to the expression browser and verify that Prometheus now has information +about time series that these example endpoints expose, such as the +`rpc_durations_seconds` metric. + +## Configure rules for aggregating scraped data into new time series + +Though not a problem in our example, queries that aggregate over thousands of +time series can get slow when computed ad-hoc. To make this more efficient, +Prometheus allows you to prerecord expressions into completely new persisted +time series via configured recording rules. Let's say we are interested in +recording the per-second rate of example RPCs +(`rpc_durations_seconds_count`) averaged over all instances (but +preserving the `job` and `service` dimensions) as measured over a window of 5 +minutes. We could write this as: + +``` +avg(rate(rpc_durations_seconds_count[5m])) by (job, service) +``` + +Try graphing this expression. 
+ +To record the time series resulting from this expression into a new metric +called `job_service:rpc_durations_seconds_count:avg_rate5m`, create a file +with the following recording rule and save it as `prometheus.rules`: + +``` +job_service:rpc_durations_seconds_count:avg_rate5m = avg(rate(rpc_durations_seconds_count[5m])) by (job, service) +``` + +To make Prometheus pick up this new rule, add a `rule_files` statement to the +global configuration section in your `prometheus.yml`. The config should now +look like this: + +```yaml +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + evaluation_interval: 15s # Evaluate rules every 15 seconds. + + # Attach these extra labels to all timeseries collected by this Prometheus instance. + external_labels: + monitor: 'codelab-monitor' + +rule_files: + - 'prometheus.rules' + +scrape_configs: + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'example-random' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['localhost:8080', 'localhost:8081'] + labels: + group: 'production' + + - targets: ['localhost:8082'] + labels: + group: 'canary' +``` + +Restart Prometheus with the new configuration and verify that a new time series +with the metric name `job_service:rpc_durations_seconds_count:avg_rate5m` +is now available by querying it through the expression browser or graphing it. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000..8f4e3aabc6 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,16 @@ +--- +# todo: internal +--- + +# Prometheus 1.8 + +Welcome to the documentation of the Prometheus server. + +The documentation is available alongside all the project documentation at +[prometheus.io](https://prometheus.io/docs/prometheus/1.8/). + +## Content + +- [Installing](install.md) +- [Getting started](getting_started.md) +- [Configuration](configuration.md) diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000000..186de8aaf9 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,96 @@ +--- +title: Installing +--- + +# Installing + +## Using pre-compiled binaries + +We provide precompiled binaries for most official Prometheus components. Check +out the [download section](https://prometheus.io/download) for a list of all +available versions. + +## From source + +For building Prometheus components from source, see the `Makefile` targets in +the respective repository. + +## Using Docker + +All Prometheus services are available as Docker images under the +[prom](https://hub.docker.com/u/prom/) organization. + +Running Prometheus on Docker is as simple as `docker run -p 9090:9090 +prom/prometheus`. This starts Prometheus with a sample configuration and +exposes it on port 9090. + +The Prometheus image uses a volume to store the actual metrics. For +production deployments it is highly recommended to use the +[Data Volume Container](https://docs.docker.com/engine/userguide/containers/dockervolumes/#creating-and-mounting-a-data-volume-container) +pattern to ease managing the data on Prometheus upgrades. + +To provide your own configuration, there are several options. Here are +two examples. 
+ +### Volumes & bind-mount + +Bind-mount your prometheus.yml from the host by running: + +``` +docker run -p 9090:9090 -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml \ + prom/prometheus +``` + +Or use an additional volume for the config: + +``` +docker run -p 9090:9090 -v /prometheus-data \ + prom/prometheus -config.file=/prometheus-data/prometheus.yml +``` + +### Custom image + +To avoid managing a file on the host and bind-mount it, the +configuration can be baked into the image. This works well if the +configuration itself is rather static and the same across all +environments. + +For this, create a new directory with a Prometheus configuration and a +Dockerfile like this: + +``` +FROM prom/prometheus +ADD prometheus.yml /etc/prometheus/ +``` + +Now build and run it: + +``` +docker build -t my-prometheus . +docker run -p 9090:9090 my-prometheus +``` + +A more advanced option is to render the config dynamically on start +with some tooling or even have a daemon update it periodically. + +## Using configuration management systems + +If you prefer using configuration management systems you might be interested in +the following third-party contributions: + +Ansible: + +* [griggheo/ansible-prometheus](https://github.com/griggheo/ansible-prometheus) +* [William-Yeh/ansible-prometheus](https://github.com/William-Yeh/ansible-prometheus) + +Chef: + +* [rayrod2030/chef-prometheus](https://github.com/rayrod2030/chef-prometheus) + +Puppet: + +* [puppet/prometheus](https://forge.puppet.com/puppet/prometheus) + +SaltStack: + +* [bechtoldt/saltstack-prometheus-formula](https://github.com/bechtoldt/saltstack-prometheus-formula) From 41281aff8133e2e6338c1903e1437e7fbb0ee265 Mon Sep 17 00:00:00 2001 From: Tobias Schmidt Date: Tue, 10 Oct 2017 19:55:29 +0200 Subject: [PATCH 06/11] Include 1.8 changes in configuration docs --- docs/configuration.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 67f214a02c..b0bf8ef8b3 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -296,6 +296,7 @@ The following meta labels are available on targets during [relabeling](#relabel_ * `__meta_consul_address`: the address of the target * `__meta_consul_dc`: the datacenter name for the target +* `__meta_consul_metadata_`: each metadata key value of the target * `__meta_consul_node`: the node name defined for the target * `__meta_consul_service_address`: the service address of the target * `__meta_consul_service_id`: the service ID of the target @@ -394,6 +395,9 @@ region: # Named AWS profile used to connect to the API. [ profile: ] +# AWS Role ARN, an alternative to using AWS API keys. +[ role_arn: ] + # Refresh interval to re-read the instance list. [ refresh_interval: | default = 60s ] @@ -655,6 +659,22 @@ Available meta labels: * If the endpoints belong to a service, all labels of the `role: service` discovery are attached. * For all targets backed by a pod, all labels of the `role: pod` discovery are attached. +#### `ingress` + +The `ingress` role discovers a target for each path of each ingress. +This is generally useful for blackbox monitoring of an ingress. +The address will be set to the host specified in the ingress spec. + +Available meta labels: + +* `__meta_kubernetes_namespace`: The namespace of the ingress object. +* `__meta_kubernetes_ingress_name`: The name of the ingress object. +* `__meta_kubernetes_ingress_label_`: The label of the ingress object. 
+* `__meta_kubernetes_ingress_annotation_`: The annotation of the ingress object. +* `__meta_kubernetes_ingress_scheme`: Protocol scheme of ingress, `https` if TLS + config is set. Defaults to `http`. +* `__meta_kubernetes_ingress_path`: Path from ingress spec. Defaults to `/`. + See below for the configuration options for Kubernetes discovery: ```yaml @@ -719,6 +739,7 @@ The following meta labels are available on targets during [relabeling](#relabel_ * `__meta_marathon_app_label_`: any Marathon labels attached to the app * `__meta_marathon_port_definition_label_`: the port definition labels * `__meta_marathon_port_mapping_label_`: the port mapping labels +* `__meta_marathon_port_index`: the port index number (e.g. `1` for `PORT1`) See below for the configuration options for Marathon discovery: From 299802dfd03a9a2b28275cb9029c3b1b762e1179 Mon Sep 17 00:00:00 2001 From: Tobias Schmidt Date: Thu, 26 Oct 2017 15:42:07 +0200 Subject: [PATCH 07/11] Integrate changes from prometheus/docs --- docs/configuration.md | 15 ++++++++------- docs/getting_started.md | 27 +++++++++++++-------------- docs/installation.md | 29 +++++++++++++++++------------ 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index b0bf8ef8b3..0fcca8578a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1,6 +1,5 @@ --- title: Configuration -sort_rank: 20 --- # Configuration @@ -549,6 +548,8 @@ project: zone: # Filter can be used optionally to filter the instance list by other criteria +# Syntax of this filter string is described here in the filter query parameter section: +# https://cloud.google.com/compute/docs/reference/latest/instances/list [ filter: ] # Refresh interval to re-read the instance list @@ -770,8 +771,8 @@ in the configuration file), which can also be changed using relabeling. ### `` -Nerve SD configurations allow retrieving scrape targets from [AirBnB's -Nerve](https://github.com/airbnb/nerve) which are stored in +Nerve SD configurations allow retrieving scrape targets from [AirBnB's Nerve] +(https://github.com/airbnb/nerve) which are stored in [Zookeeper](https://zookeeper.apache.org/). The following meta labels are available on targets during [relabeling](#relabel_config): @@ -793,10 +794,10 @@ paths: ### `` -Serverset SD configurations allow retrieving scrape targets from -[Serversets](https://github.com/twitter/finagle/tree/master/finagle-serversets) -which are stored in [Zookeeper](https://zookeeper.apache.org/). Serversets are -commonly used by [Finagle](https://twitter.github.io/finagle/) and +Serverset SD configurations allow retrieving scrape targets from [Serversets] +(https://github.com/twitter/finagle/tree/master/finagle-serversets) which are +stored in [Zookeeper](https://zookeeper.apache.org/). Serversets are commonly +used by [Finagle](https://twitter.github.io/finagle/) and [Aurora](http://aurora.apache.org/). The following meta labels are available on targets during relabeling: diff --git a/docs/getting_started.md b/docs/getting_started.md index 8585e25664..112b4b1b7a 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -60,8 +60,8 @@ For a complete specification of configuration options, see the ## Starting Prometheus -To start Prometheus with your newly created configuration file, change to your -Prometheus build directory and run: +To start Prometheus with your newly created configuration file, change to the +directory containing the Prometheus binary and run: ```bash # Start Prometheus. 
@@ -69,9 +69,9 @@ Prometheus build directory and run: ./prometheus -config.file=prometheus.yml ``` -Prometheus should start up and it should show a status page about itself at -[localhost:9090](http://localhost:9090). Give it a couple of seconds to collect -data about itself from its own HTTP metrics endpoint. +Prometheus should start up. You should also be able to browse to a status page +about itself at [localhost:9090](http://localhost:9090). Give it a couple of +seconds to collect data about itself from its own HTTP metrics endpoint. You can also verify that Prometheus is serving metrics about itself by navigating to its metrics endpoint: @@ -81,11 +81,10 @@ The number of OS threads executed by Prometheus is controlled by the `GOMAXPROCS` environment variable. As of Go 1.5 the default value is the number of cores available. -Blindly setting `GOMAXPROCS` to a high value can be -counterproductive. See the relevant [Go -FAQs](http://golang.org/doc/faq#Why_no_multi_CPU). +Blindly setting `GOMAXPROCS` to a high value can be counterproductive. See the +relevant [Go FAQs](http://golang.org/doc/faq#Why_no_multi_CPU). -Note that Prometheus by default uses around 3GB in memory. If you have a +Prometheus by default uses around 3GB in memory. If you have a smaller machine, you can tune Prometheus to use less memory. For details, see the [memory usage documentation](storage.md#memory-usage). @@ -96,8 +95,8 @@ use Prometheus's built-in expression browser, navigate to http://localhost:9090/graph and choose the "Console" view within the "Graph" tab. -As you can gather from http://localhost:9090/metrics, one metric that -Prometheus exports about itself is called +As you can gather from [localhost:9090/metrics](http://localhost:9090/metrics), +one metric that Prometheus exports about itself is called `prometheus_target_interval_length_seconds` (the actual amount of time between target scrapes). Go ahead and enter this into the expression console: @@ -105,7 +104,7 @@ target scrapes). Go ahead and enter this into the expression console: prometheus_target_interval_length_seconds ``` -This should return a lot of different time series (along with the latest value +This should return a number of different time series (along with the latest value recorded for each), all with the metric name `prometheus_target_interval_length_seconds`, but with different labels. These labels designate different latency percentiles and target group intervals. @@ -186,7 +185,7 @@ section in your `prometheus.yml` and restart your Prometheus instance: ```yaml scrape_configs: - - job_name: 'example-random' + - job_name: 'example-random' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s @@ -231,7 +230,7 @@ job_service:rpc_durations_seconds_count:avg_rate5m = avg(rate(rpc_durations_seco ``` To make Prometheus pick up this new rule, add a `rule_files` statement to the -global configuration section in your `prometheus.yml`. The config should now +`global` configuration section in your `prometheus.yml`. The config should now look like this: ```yaml diff --git a/docs/installation.md b/docs/installation.md index 186de8aaf9..1f7648cf97 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -15,6 +15,11 @@ available versions. For building Prometheus components from source, see the `Makefile` targets in the respective repository. +NOTE: **Note:** The documentation on this website refers to the latest stable +release (excluding pre-releases). 
The branch +[next-release](https://github.com/prometheus/docs/compare/next-release) refers +to unreleased changes that are in master branches of source repos. + ## Using Docker All Prometheus services are available as Docker images under the @@ -26,7 +31,7 @@ exposes it on port 9090. The Prometheus image uses a volume to store the actual metrics. For production deployments it is highly recommended to use the -[Data Volume Container](https://docs.docker.com/engine/userguide/containers/dockervolumes/#creating-and-mounting-a-data-volume-container) +[Data Volume Container](https://docs.docker.com/engine/admin/volumes/volumes/) pattern to ease managing the data on Prometheus upgrades. To provide your own configuration, there are several options. Here are @@ -34,16 +39,16 @@ two examples. ### Volumes & bind-mount -Bind-mount your prometheus.yml from the host by running: +Bind-mount your `prometheus.yml` from the host by running: -``` +```bash docker run -p 9090:9090 -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml \ prom/prometheus ``` Or use an additional volume for the config: -``` +```bash docker run -p 9090:9090 -v /prometheus-data \ prom/prometheus -config.file=/prometheus-data/prometheus.yml ``` @@ -56,21 +61,21 @@ configuration itself is rather static and the same across all environments. For this, create a new directory with a Prometheus configuration and a -Dockerfile like this: +`Dockerfile` like this: -``` +```Dockerfile FROM prom/prometheus ADD prometheus.yml /etc/prometheus/ ``` Now build and run it: -``` +```bash docker build -t my-prometheus . docker run -p 9090:9090 my-prometheus ``` -A more advanced option is to render the config dynamically on start +A more advanced option is to render the configuration dynamically on start with some tooling or even have a daemon update it periodically. ## Using configuration management systems @@ -78,19 +83,19 @@ with some tooling or even have a daemon update it periodically. 
If you prefer using configuration management systems you might be interested in the following third-party contributions: -Ansible: +### Ansible * [griggheo/ansible-prometheus](https://github.com/griggheo/ansible-prometheus) * [William-Yeh/ansible-prometheus](https://github.com/William-Yeh/ansible-prometheus) -Chef: +### Chef * [rayrod2030/chef-prometheus](https://github.com/rayrod2030/chef-prometheus) -Puppet: +### Puppet * [puppet/prometheus](https://forge.puppet.com/puppet/prometheus) -SaltStack: +### SaltStack * [bechtoldt/saltstack-prometheus-formula](https://github.com/bechtoldt/saltstack-prometheus-formula) From e6cdc2d35570a0890efe026b2cad2c0d99a335cc Mon Sep 17 00:00:00 2001 From: Tobias Schmidt Date: Thu, 26 Oct 2017 15:53:27 +0200 Subject: [PATCH 08/11] Import querying documentation from prometheus/docs --- docs/configuration.md | 1 + docs/getting_started.md | 2 +- docs/index.md | 1 + docs/installation.md | 5 +- docs/querying/api.md | 417 +++++++++++++++++++++++++++++++++++++ docs/querying/basics.md | 215 +++++++++++++++++++ docs/querying/examples.md | 83 ++++++++ docs/querying/functions.md | 408 ++++++++++++++++++++++++++++++++++++ docs/querying/index.md | 4 + docs/querying/operators.md | 250 ++++++++++++++++++++++ docs/querying/rules.md | 66 ++++++ 11 files changed, 1449 insertions(+), 3 deletions(-) create mode 100644 docs/querying/api.md create mode 100644 docs/querying/basics.md create mode 100644 docs/querying/examples.md create mode 100644 docs/querying/functions.md create mode 100644 docs/querying/index.md create mode 100644 docs/querying/operators.md create mode 100644 docs/querying/rules.md diff --git a/docs/configuration.md b/docs/configuration.md index 0fcca8578a..4efd392c77 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1,5 +1,6 @@ --- title: Configuration +sort_rank: 3 --- # Configuration diff --git a/docs/getting_started.md b/docs/getting_started.md index 112b4b1b7a..a2518bd43e 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,6 +1,6 @@ --- title: Getting started -sort_rank: 10 +sort_rank: 1 --- # Getting started diff --git a/docs/index.md b/docs/index.md index 8f4e3aabc6..8641cd1b07 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,3 +14,4 @@ The documentation is available alongside all the project documentation at - [Installing](install.md) - [Getting started](getting_started.md) - [Configuration](configuration.md) +- [Querying](querying/basics.md) diff --git a/docs/installation.md b/docs/installation.md index 1f7648cf97..4d00edea6b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,8 +1,9 @@ --- -title: Installing +title: Installation +sort_rank: 2 --- -# Installing +# Installation ## Using pre-compiled binaries diff --git a/docs/querying/api.md b/docs/querying/api.md new file mode 100644 index 0000000000..c23677a5a9 --- /dev/null +++ b/docs/querying/api.md @@ -0,0 +1,417 @@ +--- +title: HTTP API +sort_rank: 7 +--- + +# HTTP API + +The current stable HTTP API is reachable under `/api/v1` on a Prometheus +server. Any non-breaking additions will be added under that endpoint. + +## Format overview + +The API response format is JSON. Every successful API request returns a `2xx` +status code. + +Invalid requests that reach the API handlers return a JSON error object +and one of the following HTTP response codes: + +- `400 Bad Request` when parameters are missing or incorrect. +- `422 Unprocessable Entity` when an expression can't be executed + ([RFC4918](http://tools.ietf.org/html/rfc4918#page-78)). 
+- `503 Service Unavailable` when queries time out or abort. + +Other non-`2xx` codes may be returned for errors occurring before the API +endpoint is reached. + +The JSON response envelope format is as follows: + +``` +{ + "status": "success" | "error", + "data": , + + // Only set if status is "error". The data field may still hold + // additional data. + "errorType": "", + "error": "" +} +``` + +Input timestamps may be provided either in +[RFC3339](https://www.ietf.org/rfc/rfc3339.txt) format or as a Unix timestamp +in seconds, with optional decimal places for sub-second precision. Output +timestamps are always represented as Unix timestamps in seconds. + +Names of query parameters that may be repeated end with `[]`. + +`` placeholders refer to Prometheus [time series +selectors](basics.md#time-series-selectors) like `http_requests_total` or +`http_requests_total{method=~"^GET|POST$"}` and need to be URL-encoded. + +`` placeholders refer to Prometheus duration strings of the form +`[0-9]+[smhdwy]`. For example, `5m` refers to a duration of 5 minutes. + +## Expression queries + +Query language expressions may be evaluated at a single instant or over a range +of time. The sections below describe the API endpoints for each type of +expression query. + +### Instant queries + +The following endpoint evaluates an instant query at a single point in time: + +``` +GET /api/v1/query +``` + +URL query parameters: + +- `query=`: Prometheus expression query string. +- `time=`: Evaluation timestamp. Optional. +- `timeout=`: Evaluation timeout. Optional. Defaults to and + is capped by the value of the `-query.timeout` flag. + +The current server time is used if the `time` parameter is omitted. + +The `data` section of the query result has the following format: + +``` +{ + "resultType": "matrix" | "vector" | "scalar" | "string", + "result": +} +``` + +`` refers to the query result data, which has varying formats +depending on the `resultType`. See the [expression query result +formats](#expression-query-result-formats). + +The following example evaluates the expression `up` at the time +`2015-07-01T20:10:51.781Z`: + +```json +$ curl 'http://localhost:9090/api/v1/query?query=up&time=2015-07-01T20:10:51.781Z' +{ + "status" : "success", + "data" : { + "resultType" : "vector", + "result" : [ + { + "metric" : { + "__name__" : "up", + "job" : "prometheus", + "instance" : "localhost:9090" + }, + "value": [ 1435781451.781, "1" ] + }, + { + "metric" : { + "__name__" : "up", + "job" : "node", + "instance" : "localhost:9100" + }, + "value" : [ 1435781451.781, "0" ] + } + ] + } +} +``` + +### Range queries + +The following endpoint evaluates an expression query over a range of time: + +``` +GET /api/v1/query_range +``` + +URL query parameters: + +- `query=`: Prometheus expression query string. +- `start=`: Start timestamp. +- `end=`: End timestamp. +- `step=`: Query resolution step width. +- `timeout=`: Evaluation timeout. Optional. Defaults to and + is capped by the value of the `-query.timeout` flag. + +The `data` section of the query result has the following format: + +``` +{ + "resultType": "matrix", + "result": +} +``` + +For the format of the `` placeholder, see the [range-vector result +format](#range-vectors). + +The following example evaluates the expression `up` over a 30-second range with +a query resolution of 15 seconds. 
+ +```json +$ curl 'http://localhost:9090/api/v1/query_range?query=up&start=2015-07-01T20:10:30.781Z&end=2015-07-01T20:11:00.781Z&step=15s' +{ + "status" : "success", + "data" : { + "resultType" : "matrix", + "result" : [ + { + "metric" : { + "__name__" : "up", + "job" : "prometheus", + "instance" : "localhost:9090" + }, + "values" : [ + [ 1435781430.781, "1" ], + [ 1435781445.781, "1" ], + [ 1435781460.781, "1" ] + ] + }, + { + "metric" : { + "__name__" : "up", + "job" : "node", + "instance" : "localhost:9091" + }, + "values" : [ + [ 1435781430.781, "0" ], + [ 1435781445.781, "0" ], + [ 1435781460.781, "1" ] + ] + } + ] + } +} +``` + +## Querying metadata + +### Finding series by label matchers + +The following endpoint returns the list of time series that match a certain label set. + +``` +GET /api/v1/series +``` + +URL query parameters: + +- `match[]=`: Repeated series selector argument that selects the + series to return. At least one `match[]` argument must be provided. +- `start=`: Start timestamp. +- `end=`: End timestamp. + +The `data` section of the query result consists of a list of objects that +contain the label name/value pairs which identify each series. + +The following example returns all series that match either of the selectors +`up` or `process_start_time_seconds{job="prometheus"}`: + +```json +$ curl -g 'http://localhost:9090/api/v1/series?match[]=up&match[]=process_start_time_seconds{job="prometheus"}' +{ + "status" : "success", + "data" : [ + { + "__name__" : "up", + "job" : "prometheus", + "instance" : "localhost:9090" + }, + { + "__name__" : "up", + "job" : "node", + "instance" : "localhost:9091" + }, + { + "__name__" : "process_start_time_seconds", + "job" : "prometheus", + "instance" : "localhost:9090" + } + ] +} +``` + +### Querying label values + +The following endpoint returns a list of label values for a provided label name: + +``` +GET /api/v1/label//values +``` + +The `data` section of the JSON response is a list of string label names. + +This example queries for all label values for the `job` label: + +```json +$ curl http://localhost:9090/api/v1/label/job/values +{ + "status" : "success", + "data" : [ + "node", + "prometheus" + ] +} +``` + +## Deleting series + +The following endpoint deletes matched series entirely from a Prometheus server: + +``` +DELETE /api/v1/series +``` + +URL query parameters: + +- `match[]=`: Repeated label matcher argument that selects the + series to delete. At least one `match[]` argument must be provided. + +The `data` section of the JSON response has the following format: + +``` +{ + "numDeleted": +} +``` + +The following example deletes all series that match either of the selectors +`up` or `process_start_time_seconds{job="prometheus"}`: + +```json +$ curl -XDELETE -g 'http://localhost:9090/api/v1/series?match[]=up&match[]=process_start_time_seconds{job="prometheus"}' +{ + "status" : "success", + "data" : { + "numDeleted" : 3 + } +} +``` + +## Expression query result formats + +Expression queries may return the following response values in the `result` +property of the `data` section. `` placeholders are numeric +sample values. JSON does not support special float values such as `NaN`, `Inf`, +and `-Inf`, so sample values are transferred as quoted JSON strings rather than +raw numbers. + +### Range vectors + +Range vectors are returned as result type `matrix`. The corresponding +`result` property has the following format: + +``` +[ + { + "metric": { "": "", ... }, + "values": [ [ , "" ], ... ] + }, + ... 
+]
+```
+
+### Instant vectors
+
+Instant vectors are returned as result type `vector`. The corresponding
+`result` property has the following format:
+
+```
+[
+  {
+    "metric": { "<label_name>": "<label_value>", ... },
+    "value": [ <unix_time>, "<sample_value>" ]
+  },
+  ...
+]
+```
+
+### Scalars
+
+Scalar results are returned as result type `scalar`. The corresponding
+`result` property has the following format:
+
+```
+[ <unix_time>, "<scalar_value>" ]
+```
+
+### Strings
+
+String results are returned as result type `string`. The corresponding
+`result` property has the following format:
+
+```
+[ <unix_time>, "<string_value>" ]
+```
+
+## Targets
+
+> This API is experimental as it is intended to be extended with targets
+> dropped due to relabelling in the future.
+
+The following endpoint returns an overview of the current state of the
+Prometheus target discovery:
+
+```
+GET /api/v1/targets
+```
+
+Currently only the active targets are part of the response.
+
+```json
+$ curl http://localhost:9090/api/v1/targets
+{
+  "status": "success",
+  "data": {
+    "activeTargets": [
+      {
+        "discoveredLabels": {
+          "__address__": "127.0.0.1:9090",
+          "__metrics_path__": "/metrics",
+          "__scheme__": "http",
+          "job": "prometheus"
+        },
+        "labels": {
+          "instance": "127.0.0.1:9090",
+          "job": "prometheus"
+        },
+        "scrapeUrl": "http://127.0.0.1:9090/metrics",
+        "lastError": "",
+        "lastScrape": "2017-01-17T15:07:44.723715405+01:00",
+        "health": "up"
+      }
+    ]
+  }
+}
+```
+
+## Alertmanagers
+
+> This API is experimental as it is intended to be extended with Alertmanagers
+> dropped due to relabelling in the future.
+
+The following endpoint returns an overview of the current state of the
+Prometheus alertmanager discovery:
+
+```
+GET /api/v1/alertmanagers
+```
+
+Currently only the active Alertmanagers are part of the response.
+
+```json
+$ curl http://localhost:9090/api/v1/alertmanagers
+{
+  "status": "success",
+  "data": {
+    "activeAlertmanagers": [
+      {
+        "url": "http://127.0.0.1:9090/api/v1/alerts"
+      }
+    ]
+  }
+}
+```
diff --git a/docs/querying/basics.md b/docs/querying/basics.md
new file mode 100644
index 0000000000..f001c6d0d1
--- /dev/null
+++ b/docs/querying/basics.md
@@ -0,0 +1,215 @@
+---
+title: Querying basics
+nav_title: Basics
+sort_rank: 1
+---
+
+# Querying Prometheus
+
+Prometheus provides a functional expression language that lets the user select
+and aggregate time series data in real time. The result of an expression can
+either be shown as a graph, viewed as tabular data in Prometheus's expression
+browser, or consumed by external systems via the [HTTP API](api.md).
+
+## Examples
+
+This document is meant as a reference. For learning, it might be easier to
+start with a couple of [examples](examples.md).
+
+## Expression language data types
+
+In Prometheus's expression language, an expression or sub-expression can
+evaluate to one of four types:
+
+* **Instant vector** - a set of time series containing a single sample for each time series, all sharing the same timestamp
+* **Range vector** - a set of time series containing a range of data points over time for each time series
+* **Scalar** - a simple numeric floating point value
+* **String** - a simple string value; currently unused
+
+Depending on the use-case (e.g. when graphing vs. displaying the output of an
+expression), only some of these types are legal as the result from a
+user-specified expression. For example, an expression that returns an instant
+vector is the only type that can be directly graphed.
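+
+To make the distinction concrete, here is a small, purely illustrative set of
+expressions (reusing the `http_requests_total` example metric that appears
+later in this document) together with the type each one evaluates to:
+
+    http_requests_total          # instant vector: one sample per matching series
+    http_requests_total[5m]      # range vector: the last 5 minutes of samples per series
+    count(http_requests_total)   # instant vector produced by an aggregation
+    42.5                         # scalar
+
+A range vector cannot be graphed directly; it is usually passed to a function
+such as `rate()` first.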
+ +## Literals + +### String literals + +Strings may be specified as literals in single quotes, double quotes or +backticks. + +PromQL follows the same [escaping rules as +Go](https://golang.org/ref/spec#String_literals). In single or double quotes a +backslash begins an escape sequence, which may be followed by `a`, `b`, `f`, +`n`, `r`, `t`, `v` or `\`. Specific characters can be provided using octal +(`\nnn`) or hexadecimal (`\xnn`, `\unnnn` and `\Unnnnnnnn`). + +No escaping is processed inside backticks. Unlike Go, Prometheus does not discard newlines inside backticks. + +Example: + + "this is a string" + 'these are unescaped: \n \\ \t' + `these are not unescaped: \n ' " \t` + +### Float literals + +Scalar float values can be literally written as numbers of the form +`[-](digits)[.(digits)]`. + + -2.43 + +## Time series Selectors + +### Instant vector selectors + +Instant vector selectors allow the selection of a set of time series and a +single sample value for each at a given timestamp (instant): in the simplest +form, only a metric name is specified. This results in an instant vector +containing elements for all time series that have this metric name. + +This example selects all time series that have the `http_requests_total` metric +name: + + http_requests_total + +It is possible to filter these time series further by appending a set of labels +to match in curly braces (`{}`). + +This example selects only those time series with the `http_requests_total` +metric name that also have the `job` label set to `prometheus` and their +`group` label set to `canary`: + + http_requests_total{job="prometheus",group="canary"} + +It is also possible to negatively match a label value, or to match label values +against regular expressions. The following label matching operators exist: + +* `=`: Select labels that are exactly equal to the provided string. +* `!=`: Select labels that are not equal to the provided string. +* `=~`: Select labels that regex-match the provided string (or substring). +* `!~`: Select labels that do not regex-match the provided string (or substring). + +For example, this selects all `http_requests_total` time series for `staging`, +`testing`, and `development` environments and HTTP methods other than `GET`. + + http_requests_total{environment=~"staging|testing|development",method!="GET"} + +Label matchers that match empty label values also select all time series that do +not have the specific label set at all. Regex-matches are fully anchored. + +Vector selectors must either specify a name or at least one label matcher +that does not match the empty string. The following expression is illegal: + + {job=~".*"} # Bad! + +In contrast, these expressions are valid as they both have a selector that does not +match empty label values. + + {job=~".+"} # Good! + {job=~".*",method="get"} # Good! + +Label matchers can also be applied to metric names by matching against the internal +`__name__` label. For example, the expression `http_requests_total` is equivalent to +`{__name__="http_requests_total"}`. Matchers other than `=` (`!=`, `=~`, `!~`) may also be used. +The following expression selects all metrics that have a name starting with `job:`: + + {__name__=~"^job:.*"} + +### Range Vector Selectors + +Range vector literals work like instant vector literals, except that they +select a range of samples back from the current instant. 
Syntactically, a range +duration is appended in square brackets (`[]`) at the end of a vector selector +to specify how far back in time values should be fetched for each resulting +range vector element. + +Time durations are specified as a number, followed immediately by one of the +following units: + +* `s` - seconds +* `m` - minutes +* `h` - hours +* `d` - days +* `w` - weeks +* `y` - years + +In this example, we select all the values we have recorded within the last 5 +minutes for all time series that have the metric name `http_requests_total` and +a `job` label set to `prometheus`: + + http_requests_total{job="prometheus"}[5m] + +### Offset modifier + +The `offset` modifier allows changing the time offset for individual +instant and range vectors in a query. + +For example, the following expression returns the value of +`http_requests_total` 5 minutes in the past relative to the current +query evaluation time: + + http_requests_total offset 5m + +Note that the `offset` modifier always needs to follow the selector +immediately, i.e. the following would be correct: + + sum(http_requests_total{method="GET"} offset 5m) // GOOD. + +While the following would be *incorrect*: + + sum(http_requests_total{method="GET"}) offset 5m // INVALID. + +The same works for range vectors. This returns the 5-minutes rate that +`http_requests_total` had a week ago: + + rate(http_requests_total[5m] offset 1w) + +## Operators + +Prometheus supports many binary and aggregation operators. These are described +in detail in the [expression language operators](operators.md) page. + +## Functions + +Prometheus supports several functions to operate on data. These are described +in detail in the [expression language functions](functions.md) page. + +## Gotchas + +### Interpolation and staleness + +When queries are run, timestamps at which to sample data are selected +independently of the actual present time series data. This is mainly to support +cases like aggregation (`sum`, `avg`, and so on), where multiple aggregated +time series do not exactly align in time. Because of their independence, +Prometheus needs to assign a value at those timestamps for each relevant time +series. It does so by simply taking the newest sample before this timestamp. + +If no stored sample is found (by default) 5 minutes before a sampling timestamp, +no value is assigned for this time series at this point in time. This +effectively means that time series "disappear" from graphs at times where their +latest collected sample is older than 5 minutes. + +NOTE: NOTE: Staleness and interpolation handling might change. See +https://github.com/prometheus/prometheus/issues/398 and +https://github.com/prometheus/prometheus/issues/581. + +### Avoiding slow queries and overloads + +If a query needs to operate on a very large amount of data, graphing it might +time out or overload the server or browser. Thus, when constructing queries +over unknown data, always start building the query in the tabular view of +Prometheus's expression browser until the result set seems reasonable +(hundreds, not thousands, of time series at most). Only when you have filtered +or aggregated your data sufficiently, switch to graph mode. If the expression +still takes too long to graph ad-hoc, pre-record it via a [recording +rule](rules.md#recording-rules). + +This is especially relevant for Prometheus's query language, where a bare +metric name selector like `api_http_requests_total` could expand to thousands +of time series with different labels. 
Also keep in mind that expressions which +aggregate over many time series will generate load on the server even if the +output is only a small number of time series. This is similar to how it would +be slow to sum all values of a column in a relational database, even if the +output value is only a single number. diff --git a/docs/querying/examples.md b/docs/querying/examples.md new file mode 100644 index 0000000000..4e522ab85d --- /dev/null +++ b/docs/querying/examples.md @@ -0,0 +1,83 @@ +--- +title: Querying examples +nav_title: Examples +sort_rank: 4 +--- + +# Query examples + +## Simple time series selection + +Return all time series with the metric `http_requests_total`: + + http_requests_total + +Return all time series with the metric `http_requests_total` and the given +`job` and `handler` labels: + + http_requests_total{job="apiserver", handler="/api/comments"} + +Return a whole range of time (in this case 5 minutes) for the same vector, +making it a range vector: + + http_requests_total{job="apiserver", handler="/api/comments"}[5m] + +Note that an expression resulting in a range vector cannot be graphed directly, +but viewed in the tabular ("Console") view of the expression browser. + +Using regular expressions, you could select time series only for jobs whose +name match a certain pattern, in this case, all jobs that end with `server`. +Note that this does a substring match, not a full string match: + + http_requests_total{job=~"server$"} + +To select all HTTP status codes except 4xx ones, you could run: + + http_requests_total{status!~"^4..$"} + +## Using functions, operators, etc. + +Return the per-second rate for all time series with the `http_requests_total` +metric name, as measured over the last 5 minutes: + + rate(http_requests_total[5m]) + +Assuming that the `http_requests_total` time series all have the labels `job` +(fanout by job name) and `instance` (fanout by instance of the job), we might +want to sum over the rate of all instances, so we get fewer output time series, +but still preserve the `job` dimension: + + sum(rate(http_requests_total[5m])) by (job) + +If we have two different metrics with the same dimensional labels, we can apply +binary operators to them and elements on both sides with the same label set +will get matched and propagated to the output. For example, this expression +returns the unused memory in MiB for every instance (on a fictional cluster +scheduler exposing these metrics about the instances it runs): + + (instance_memory_limit_bytes - instance_memory_usage_bytes) / 1024 / 1024 + +The same expression, but summed by application, could be written like this: + + sum( + instance_memory_limit_bytes - instance_memory_usage_bytes + ) by (app, proc) / 1024 / 1024 + +If the same fictional cluster scheduler exposed CPU usage metrics like the +following for every instance: + + instance_cpu_time_ns{app="lion", proc="web", rev="34d0f99", env="prod", job="cluster-manager"} + instance_cpu_time_ns{app="elephant", proc="worker", rev="34d0f99", env="prod", job="cluster-manager"} + instance_cpu_time_ns{app="turtle", proc="api", rev="4d3a513", env="prod", job="cluster-manager"} + instance_cpu_time_ns{app="fox", proc="widget", rev="4d3a513", env="prod", job="cluster-manager"} + ... 
+ +...we could get the top 3 CPU users grouped by application (`app`) and process +type (`proc`) like this: + + topk(3, sum(rate(instance_cpu_time_ns[5m])) by (app, proc)) + +Assuming this metric contains one time series per running instance, you could +count the number of running instances per application like this: + + count(instance_cpu_time_ns) by (app) diff --git a/docs/querying/functions.md b/docs/querying/functions.md new file mode 100644 index 0000000000..74e6740285 --- /dev/null +++ b/docs/querying/functions.md @@ -0,0 +1,408 @@ +--- +title: Query functions +nav_title: Functions +sort_rank: 3 +--- + +# Functions + +Some functions have default arguments, e.g. `year(v=vector(time()) +instant-vector)`. This means that there is one argument `v` which is an instant +vector, which if not provided it will default to the value of the expression +`vector(time())`. + +## `abs()` + +`abs(v instant-vector)` returns the input vector with all sample values converted to +their absolute value. + +## `absent()` + +`absent(v instant-vector)` returns an empty vector if the vector passed to it +has any elements and a 1-element vector with the value 1 if the vector passed to +it has no elements. + +This is useful for alerting on when no time series exist for a given metric name +and label combination. + +``` +absent(nonexistent{job="myjob"}) +# => {job="myjob"} + +absent(nonexistent{job="myjob",instance=~".*"}) +# => {job="myjob"} + +absent(sum(nonexistent{job="myjob"})) +# => {} +``` + +In the second example, `absent()` tries to be smart about deriving labels of the +1-element output vector from the input vector. + +## `ceil()` + +`ceil(v instant-vector)` rounds the sample values of all elements in `v` up to +the nearest integer. + +## `changes()` + +For each input time series, `changes(v range-vector)` returns the number of +times its value has changed within the provided time range as an instant +vector. + +## `clamp_max()` + +`clamp_max(v instant-vector, max scalar)` clamps the sample values of all +elements in `v` to have an upper limit of `max`. + +## `clamp_min()` + +`clamp_min(v instant-vector, min scalar)` clamps the sample values of all +elements in `v` to have a lower limit of `min`. + +## `count_scalar()` + +`count_scalar(v instant-vector)` returns the number of elements in a time series +vector as a scalar. This is in contrast to the `count()` +[aggregation operator](operators.md#aggregation-operators), which +always returns a vector (an empty one if the input vector is empty) and allows +grouping by labels via a `by` clause. + +## `day_of_month()` + +`day_of_month(v=vector(time()) instant-vector)` returns the day of the month +for each of the given times in UTC. Returned values are from 1 to 31. + +## `day_of_week()` + +`day_of_week(v=vector(time()) instant-vector)` returns the day of the week for +each of the given times in UTC. Returned values are from 0 to 6, where 0 means +Sunday etc. + +## `days_in_month()` + +`days_in_month(v=vector(time()) instant-vector)` returns number of days in the +month for each of the given times in UTC. Returned values are from 28 to 31. + +## `delta()` + +`delta(v range-vector)` calculates the difference between the +first and last value of each time series element in a range vector `v`, +returning an instant vector with the given deltas and equivalent labels. +The delta is extrapolated to cover the full time range as specified in +the range vector selector, so that it is possible to get a non-integer +result even if the sample values are all integers. 
+ +The following example expression returns the difference in CPU temperature +between now and 2 hours ago: + +``` +delta(cpu_temp_celsius{host="zeus"}[2h]) +``` + +`delta` should only be used with gauges. + +## `deriv()` + +`deriv(v range-vector)` calculates the per-second derivative of the time series in a range +vector `v`, using [simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression). + +`deriv` should only be used with gauges. + +## `drop_common_labels()` + +`drop_common_labels(instant-vector)` drops all labels that have the same name +and value across all series in the input vector. + +## `exp()` + +`exp(v instant-vector)` calculates the exponential function for all elements in `v`. +Special cases are: + +* `Exp(+Inf) = +Inf` +* `Exp(NaN) = NaN` + +## `floor()` + +`floor(v instant-vector)` rounds the sample values of all elements in `v` down +to the nearest integer. + +## `histogram_quantile()` + +`histogram_quantile(φ float, b instant-vector)` calculates the φ-quantile (0 ≤ φ +≤ 1) from the buckets `b` of a +[histogram](https://prometheus.io/docs/concepts/metric_types/#histogram). (See +[histograms and summaries](https://prometheus.io/docs/practices/histograms) for +a detailed explanation of φ-quantiles and the usage of the histogram metric type +in general.) The samples in `b` are the counts of observations in each bucket. +Each sample must have a label `le` where the label value denotes the inclusive +upper bound of the bucket. (Samples without such a label are silently ignored.) +The [histogram metric type](https://prometheus.io/docs/concepts/metric_types/#histogram) +automatically provides time series with the `_bucket` suffix and the appropriate +labels. + +Use the `rate()` function to specify the time window for the quantile +calculation. + +Example: A histogram metric is called `http_request_duration_seconds`. To +calculate the 90th percentile of request durations over the last 10m, use the +following expression: + + histogram_quantile(0.9, rate(http_request_duration_seconds_bucket[10m])) + +The quantile is calculated for each label combination in +`http_request_duration_seconds`. To aggregate, use the `sum()` aggregator +around the `rate()` function. Since the `le` label is required by +`histogram_quantile()`, it has to be included in the `by` clause. The following +expression aggregates the 90th percentile by `job`: + + histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[10m])) by (job, le)) + +To aggregate everything, specify only the `le` label: + + histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[10m])) by (le)) + +The `histogram_quantile()` function interpolates quantile values by +assuming a linear distribution within a bucket. The highest bucket +must have an upper bound of `+Inf`. (Otherwise, `NaN` is returned.) If +a quantile is located in the highest bucket, the upper bound of the +second highest bucket is returned. A lower limit of the lowest bucket +is assumed to be 0 if the upper bound of that bucket is greater than +0. In that case, the usual linear interpolation is applied within that +bucket. Otherwise, the upper bound of the lowest bucket is returned +for quantiles located in the lowest bucket. + +If `b` contains fewer than two buckets, `NaN` is returned. For φ < 0, `-Inf` is +returned. For φ > 1, `+Inf` is returned. + +## `holt_winters()` + +`holt_winters(v range-vector, sf scalar, tf scalar)` produces a smoothed value +for time series based on the range in `v`. 
The lower the smoothing factor `sf`, +the more importance is given to old data. The higher the trend factor `tf`, the +more trends in the data is considered. Both `sf` and `tf` must be between 0 and +1. + +`holt_winters` should only be used with gauges. + +## `hour()` + +`hour(v=vector(time()) instant-vector)` returns the hour of the day +for each of the given times in UTC. Returned values are from 0 to 23. + +## `idelta()` + +`idelta(v range-vector)` + +`idelta(v range-vector)` calculates the difference between the last two samples +in the range vector `v`, returning an instant vector with the given deltas and +equivalent labels. + +`idelta` should only be used with gauges. + +## `increase()` + +`increase(v range-vector)` calculates the increase in the +time series in the range vector. Breaks in monotonicity (such as counter +resets due to target restarts) are automatically adjusted for. The +increase is extrapolated to cover the full time range as specified +in the range vector selector, so that it is possible to get a +non-integer result even if a counter increases only by integer +increments. + +The following example expression returns the number of HTTP requests as measured +over the last 5 minutes, per time series in the range vector: + +``` +increase(http_requests_total{job="api-server"}[5m]) +``` + +`increase` should only be used with counters. It is syntactic sugar +for `rate(v)` multiplied by the number of seconds under the specified +time range window, and should be used primarily for human readability. +Use `rate` in recording rules so that increases are tracked consistently +on a per-second basis. + +## `irate()` + +`irate(v range-vector)` calculates the per-second instant rate of increase of +the time series in the range vector. This is based on the last two data points. +Breaks in monotonicity (such as counter resets due to target restarts) are +automatically adjusted for. + +The following example expression returns the per-second rate of HTTP requests +looking up to 5 minutes back for the two most recent data points, per time +series in the range vector: + +``` +irate(http_requests_total{job="api-server"}[5m]) +``` + +`irate` should only be used when graphing volatile, fast-moving counters. +Use `rate` for alerts and slow-moving counters, as brief changes +in the rate can reset the `FOR` clause and graphs consisting entirely of rare +spikes are hard to read. + +Note that when combining `irate()` with an +[aggregation operator](operators.md#aggregation-operators) (e.g. `sum()`) +or a function aggregating over time (any function ending in `_over_time`), +always take a `irate()` first, then aggregate. Otherwise `irate()` cannot detect +counter resets when your target restarts. + +## `label_join()` + +For each timeseries in `v`, `label_join(v instant-vector, dst_label string, separator string, src_label_1 string, src_label_2 string, ...)` joins all the values of all the `src_labels` +using `separator` and returns the timeseries with the label `dst_label` containing the joined value. +There can be any number of `src_labels` in this function. 
+ +This example will return a vector with each time series having a `foo` label with the value `a,b,c` added to it: + +``` +label_join(up{job="api-server",src1="a",src2="b",src3="c"}, "foo", ",", "src1", "src2", "src3") +``` + +## `label_replace()` + +For each timeseries in `v`, `label_replace(v instant-vector, dst_label string, +replacement string, src_label string, regex string)` matches the regular +expression `regex` against the label `src_label`. If it matches, then the +timeseries is returned with the label `dst_label` replaced by the expansion of +`replacement`. `$1` is replaced with the first matching subgroup, `$2` with the +second etc. If the regular expression doesn't match then the timeseries is +returned unchanged. + +This example will return a vector with each time series having a `foo` +label with the value `a` added to it: + +``` +label_replace(up{job="api-server",service="a:c"}, "foo", "$1", "service", "(.*):.*") +``` + +## `ln()` + +`ln(v instant-vector)` calculates the natural logarithm for all elements in `v`. +Special cases are: + +* `ln(+Inf) = +Inf` +* `ln(0) = -Inf` +* `ln(x < 0) = NaN` +* `ln(NaN) = NaN` + +## `log2()` + +`log2(v instant-vector)` calculates the binary logarithm for all elements in `v`. +The special cases are equivalent to those in `ln`. + +## `log10()` + +`log10(v instant-vector)` calculates the decimal logarithm for all elements in `v`. +The special cases are equivalent to those in `ln`. + +## `minute()` + +`minute(v=vector(time()) instant-vector)` returns the minute of the hour for each +of the given times in UTC. Returned values are from 0 to 59. + +## `month()` + +`month(v=vector(time()) instant-vector)` returns the month of the year for each +of the given times in UTC. Returned values are from 1 to 12, where 1 means +January etc. + +## `predict_linear()` + +`predict_linear(v range-vector, t scalar)` predicts the value of time series +`t` seconds from now, based on the range vector `v`, using [simple linear +regression](http://en.wikipedia.org/wiki/Simple_linear_regression). + +`predict_linear` should only be used with gauges. + +## `rate()` + +`rate(v range-vector)` calculates the per-second average rate of increase of the +time series in the range vector. Breaks in monotonicity (such as counter +resets due to target restarts) are automatically adjusted for. Also, the +calculation extrapolates to the ends of the time range, allowing for missed +scrapes or imperfect alignment of scrape cycles with the range's time period. + +The following example expression returns the per-second rate of HTTP requests as measured +over the last 5 minutes, per time series in the range vector: + +``` +rate(http_requests_total{job="api-server"}[5m]) +``` + +`rate` should only be used with counters. It is best suited for alerting, +and for graphing of slow-moving counters. + +Note that when combining `rate()` with an aggregation operator (e.g. `sum()`) +or a function aggregating over time (any function ending in `_over_time`), +always take a `rate()` first, then aggregate. Otherwise `rate()` cannot detect +counter resets when your target restarts. + +## `resets()` + +For each input time series, `resets(v range-vector)` returns the number of +counter resets within the provided time range as an instant vector. Any +decrease in the value between two consecutive samples is interpreted as a +counter reset. + +`resets` should only be used with counters. 
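+
+As a purely illustrative sketch (reusing the `http_requests_total{job="api-server"}`
+series from the `rate()` and `increase()` examples above; the one-hour window is
+arbitrary), the following returns, per series, how many times the counter was
+reset within the last hour:
+
+```
+resets(http_requests_total{job="api-server"}[1h])
+```
+
+A persistently non-zero result can hint at frequently restarting targets.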
+ +## `round()` + +`round(v instant-vector, to_nearest=1 scalar)` rounds the sample values of all +elements in `v` to the nearest integer. Ties are resolved by rounding up. The +optional `to_nearest` argument allows specifying the nearest multiple to which +the sample values should be rounded. This multiple may also be a fraction. + +## `scalar()` + +Given a single-element input vector, `scalar(v instant-vector)` returns the +sample value of that single element as a scalar. If the input vector does not +have exactly one element, `scalar` will return `NaN`. + +## `sort()` + +`sort(v instant-vector)` returns vector elements sorted by their sample values, +in ascending order. + +## `sort_desc()` + +Same as `sort`, but sorts in descending order. + +## `sqrt()` + +`sqrt(v instant-vector)` calculates the square root of all elements in `v`. + +## `time()` + +`time()` returns the number of seconds since January 1, 1970 UTC. Note that +this does not actually return the current time, but the time at which the +expression is to be evaluated. + +## `vector()` + +`vector(s scalar)` returns the scalar `s` as a vector with no labels. + +## `year()` + +`year(v=vector(time()) instant-vector)` returns the year +for each of the given times in UTC. + +## `_over_time()` + +The following functions allow aggregating each series of a given range vector +over time and return an instant vector with per-series aggregation results: + +* `avg_over_time(range-vector)`: the average value of all points in the specified interval. +* `min_over_time(range-vector)`: the minimum value of all points in the specified interval. +* `max_over_time(range-vector)`: the maximum value of all points in the specified interval. +* `sum_over_time(range-vector)`: the sum of all values in the specified interval. +* `count_over_time(range-vector)`: the count of all values in the specified interval. +* `quantile_over_time(scalar, range-vector)`: the φ-quantile (0 ≤ φ ≤ 1) of the values in the specified interval. +* `stddev_over_time(range-vector)`: the population standard deviation of the values in the specified interval. +* `stdvar_over_time(range-vector)`: the population standard variance of the values in the specified interval. + +Note that all values in the specified interval have the same weight in the +aggregation even if the values are not equally spaced throughout the interval. diff --git a/docs/querying/index.md b/docs/querying/index.md new file mode 100644 index 0000000000..1566750e89 --- /dev/null +++ b/docs/querying/index.md @@ -0,0 +1,4 @@ +--- +title: Querying +sort_rank: 4 +--- diff --git a/docs/querying/operators.md b/docs/querying/operators.md new file mode 100644 index 0000000000..7aa7a6b79b --- /dev/null +++ b/docs/querying/operators.md @@ -0,0 +1,250 @@ +--- +title: Operators +sort_rank: 2 +--- + +# Operators + +## Binary operators + +Prometheus's query language supports basic logical and arithmetic operators. +For operations between two instant vectors, the [matching behavior](#vector-matching) +can be modified. + +### Arithmetic binary operators + +The following binary arithmetic operators exist in Prometheus: + +* `+` (addition) +* `-` (subtraction) +* `*` (multiplication) +* `/` (division) +* `%` (modulo) +* `^` (power/exponentiation) + +Binary arithmetic operators are defined between scalar/scalar, vector/scalar, +and vector/vector value pairs. + +**Between two scalars**, the behavior is obvious: they evaluate to another +scalar that is the result of the operator applied to both scalar operands. 
+ +**Between an instant vector and a scalar**, the operator is applied to the +value of every data sample in the vector. E.g. if a time series instant vector +is multiplied by 2, the result is another vector in which every sample value of +the original vector is multiplied by 2. + +**Between two instant vectors**, a binary arithmetic operator is applied to +each entry in the left-hand-side vector and its [matching element](#vector-matching) +in the right hand vector. The result is propagated into the result vector and the metric +name is dropped. Entries for which no matching entry in the right-hand vector can be +found are not part of the result. + +### Comparison binary operators + +The following binary comparison operators exist in Prometheus: + +* `==` (equal) +* `!=` (not-equal) +* `>` (greater-than) +* `<` (less-than) +* `>=` (greater-or-equal) +* `<=` (less-or-equal) + +Comparison operators are defined between scalar/scalar, vector/scalar, +and vector/vector value pairs. By default they filter. Their behaviour can be +modified by providing `bool` after the operator, which will return `0` or `1` +for the value rather than filtering. + +**Between two scalars**, the `bool` modifier must be provided and these +operators result in another scalar that is either `0` (`false`) or `1` +(`true`), depending on the comparison result. + +**Between an instant vector and a scalar**, these operators are applied to the +value of every data sample in the vector, and vector elements between which the +comparison result is `false` get dropped from the result vector. If the `bool` +modifier is provided, vector elements that would be dropped instead have the value +`0` and vector elements that would be kept have the value `1`. + +**Between two instant vectors**, these operators behave as a filter by default, +applied to matching entries. Vector elements for which the expression is not +true or which do not find a match on the other side of the expression get +dropped from the result, while the others are propagated into a result vector +with their original (left-hand-side) metric names and label values. +If the `bool` modifier is provided, vector elements that would have been +dropped instead have the value `0` and vector elements that would be kept have +the value `1` with the left-hand-side metric names and label values. + +### Logical/set binary operators + +These logical/set binary operators are only defined between instant vectors: + +* `and` (intersection) +* `or` (union) +* `unless` (complement) + +`vector1 and vector2` results in a vector consisting of the elements of +`vector1` for which there are elements in `vector2` with exactly matching +label sets. Other elements are dropped. The metric name and values are carried +over from the left-hand-side vector. + +`vector1 or vector2` results in a vector that contains all original elements +(label sets + values) of `vector1` and additionally all elements of `vector2` +which do not have matching label sets in `vector1`. + +`vector1 unless vector2` results in a vector consisting of the elements of +`vector1` for which there are no elements in `vector2` with exactly matching +label sets. All matching elements in both vectors are dropped. + +## Vector matching + +Operations between vectors attempt to find a matching element in the right-hand-side +vector for each entry in the left-hand side. There are two basic types of +matching behavior: + +**One-to-one** finds a unique pair of entries from each side of the operation. 
+
+In the default case, that is an operation following the format `vector1 <operator> vector2`.
+Two entries match if they have the exact same set of labels and corresponding values.
+The `ignoring` keyword allows ignoring certain labels when matching, while the
+`on` keyword allows reducing the set of considered labels to a provided list:
+
+    <vector expr> <bin-op> ignoring(