From 1e8e5e06c5e6bf23ec0600e1ef5c5a87ed6889cc Mon Sep 17 00:00:00 2001 From: Ivan Ka <5395690+ivankatliarchuk@users.noreply.github.com> Date: Tue, 4 Mar 2025 13:05:44 +0000 Subject: [PATCH] chore(docs): generate docs/monitoring/metrics.md file (#5117) * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Signed-off-by: ivan katliarchuk * chore(docs): generate docs/monitoring/metrics.md file Co-authored-by: Michel Loiseleur <97035654+mloiseleur@users.noreply.github.com> --------- Signed-off-by: 
ivan katliarchuk Co-authored-by: Michel Loiseleur <97035654+mloiseleur@users.noreply.github.com> --- Makefile | 8 +- controller/controller.go | 95 ++++++------- controller/controller_test.go | 28 ++-- docs/contributing/dev-guide.md | 3 +- docs/faq.md | 42 ------ docs/monitoring/index.md | 50 +++++++ docs/monitoring/metrics.md | 89 ++++++++++++ internal/gen/docs/flags/main.go | 42 ++---- internal/gen/docs/flags/main_test.go | 2 +- internal/gen/docs/flags/templates/flags.gotpl | 11 ++ internal/gen/docs/metrics/main.go | 122 ++++++++++++++++ internal/gen/docs/metrics/main_test.go | 133 ++++++++++++++++++ .../gen/docs/metrics/templates/metrics.gotpl | 37 +++++ internal/gen/docs/utils/utils.go | 50 +++++++ internal/gen/docs/utils/utils_test.go | 73 ++++++++++ main.go | 7 +- mkdocs.yml | 1 + pkg/metrics/metrics.go | 77 ++++++++++ pkg/metrics/metrics_test.go | 103 ++++++++++++++ pkg/metrics/models.go | 111 +++++++++++++++ pkg/metrics/models_test.go | 83 +++++++++++ provider/cached_provider.go | 23 +-- provider/webhook/webhook.go | 61 ++++---- 23 files changed, 1072 insertions(+), 179 deletions(-) create mode 100644 docs/monitoring/index.md create mode 100644 docs/monitoring/metrics.md create mode 100644 internal/gen/docs/flags/templates/flags.gotpl create mode 100644 internal/gen/docs/metrics/main.go create mode 100644 internal/gen/docs/metrics/main_test.go create mode 100644 internal/gen/docs/metrics/templates/metrics.gotpl create mode 100644 internal/gen/docs/utils/utils.go create mode 100644 internal/gen/docs/utils/utils_test.go create mode 100644 pkg/metrics/metrics.go create mode 100644 pkg/metrics/metrics_test.go create mode 100644 pkg/metrics/models.go create mode 100644 pkg/metrics/models_test.go diff --git a/Makefile b/Makefile index e3e1a995f..b28a79fbd 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ golangci-lint-verify: golangci-lint .PHONY: go-lint go-lint: golangci-lint gofmt -l -s -w . - golangci-lint run --timeout=30m ./... 
+ golangci-lint run --timeout=30m --fix ./... #? licensecheck: Run the to check for license headers .PHONY: licensecheck @@ -149,7 +149,6 @@ clean: @rm -rf build @go clean -cache - .PHONY: release.staging #? release.staging: Builds and push container images to the staging bucket. release.staging: test @@ -167,6 +166,11 @@ ko: generate-flags-documentation: go run internal/gen/docs/flags/main.go +.PHONY: generate-metrics-documentation +#? generate-metrics-documentation: Generate documentation (docs/monitoring/metrics.md) +generate-metrics-documentation: + go run internal/gen/docs/metrics/main.go + #? pre-commit-install: Install pre-commit hooks pre-commit-install: @pre-commit install diff --git a/controller/controller.go b/controller/controller.go index 2521b4436..d98fb8104 100644 --- a/controller/controller.go +++ b/controller/controller.go @@ -27,6 +27,7 @@ import ( log "github.com/sirupsen/logrus" "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/pkg/metrics" "sigs.k8s.io/external-dns/plan" "sigs.k8s.io/external-dns/provider" "sigs.k8s.io/external-dns/registry" @@ -34,7 +35,7 @@ import ( ) var ( - registryErrorsTotal = prometheus.NewCounter( + registryErrorsTotal = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Namespace: "external_dns", Subsystem: "registry", @@ -42,7 +43,7 @@ var ( Help: "Number of Registry errors.", }, ) - sourceErrorsTotal = prometheus.NewCounter( + sourceErrorsTotal = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Namespace: "external_dns", Subsystem: "source", @@ -50,7 +51,7 @@ var ( Help: "Number of Source errors.", }, ) - sourceEndpointsTotal = prometheus.NewGauge( + sourceEndpointsTotal = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "source", @@ -58,7 +59,7 @@ var ( Help: "Number of Endpoints in all sources", }, ) - registryEndpointsTotal = prometheus.NewGauge( + registryEndpointsTotal = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: 
"registry", @@ -66,7 +67,7 @@ var ( Help: "Number of Endpoints in the registry", }, ) - lastSyncTimestamp = prometheus.NewGauge( + lastSyncTimestamp = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "controller", @@ -74,7 +75,7 @@ var ( Help: "Timestamp of last successful sync with the DNS provider", }, ) - lastReconcileTimestamp = prometheus.NewGauge( + lastReconcileTimestamp = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "controller", @@ -82,7 +83,7 @@ var ( Help: "Timestamp of last attempted sync with the DNS provider", }, ) - controllerNoChangesTotal = prometheus.NewCounter( + controllerNoChangesTotal = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Namespace: "external_dns", Subsystem: "controller", @@ -90,21 +91,21 @@ var ( Help: "Number of reconcile loops ending up with no changes on the DNS provider side.", }, ) - deprecatedRegistryErrors = prometheus.NewCounter( + deprecatedRegistryErrors = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Subsystem: "registry", Name: "errors_total", Help: "Number of Registry errors.", }, ) - deprecatedSourceErrors = prometheus.NewCounter( + deprecatedSourceErrors = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Subsystem: "source", Name: "errors_total", Help: "Number of Source errors.", }, ) - registryARecords = prometheus.NewGauge( + registryARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "registry", @@ -112,7 +113,7 @@ var ( Help: "Number of Registry A records.", }, ) - registryAAAARecords = prometheus.NewGauge( + registryAAAARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "registry", @@ -120,7 +121,7 @@ var ( Help: "Number of Registry AAAA records.", }, ) - sourceARecords = prometheus.NewGauge( + sourceARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "source", @@ -128,7 +129,7 @@ var ( 
Help: "Number of Source A records.", }, ) - sourceAAAARecords = prometheus.NewGauge( + sourceAAAARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "source", @@ -136,7 +137,7 @@ var ( Help: "Number of Source AAAA records.", }, ) - verifiedARecords = prometheus.NewGauge( + verifiedARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "controller", @@ -144,7 +145,7 @@ var ( Help: "Number of DNS A-records that exists both in source and registry.", }, ) - verifiedAAAARecords = prometheus.NewGauge( + verifiedAAAARecords = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "controller", @@ -155,21 +156,21 @@ var ( ) func init() { - prometheus.MustRegister(registryErrorsTotal) - prometheus.MustRegister(sourceErrorsTotal) - prometheus.MustRegister(sourceEndpointsTotal) - prometheus.MustRegister(registryEndpointsTotal) - prometheus.MustRegister(lastSyncTimestamp) - prometheus.MustRegister(lastReconcileTimestamp) - prometheus.MustRegister(deprecatedRegistryErrors) - prometheus.MustRegister(deprecatedSourceErrors) - prometheus.MustRegister(controllerNoChangesTotal) - prometheus.MustRegister(registryARecords) - prometheus.MustRegister(registryAAAARecords) - prometheus.MustRegister(sourceARecords) - prometheus.MustRegister(sourceAAAARecords) - prometheus.MustRegister(verifiedARecords) - prometheus.MustRegister(verifiedAAAARecords) + metrics.RegisterMetric.MustRegister(registryErrorsTotal) + metrics.RegisterMetric.MustRegister(sourceErrorsTotal) + metrics.RegisterMetric.MustRegister(sourceEndpointsTotal) + metrics.RegisterMetric.MustRegister(registryEndpointsTotal) + metrics.RegisterMetric.MustRegister(lastSyncTimestamp) + metrics.RegisterMetric.MustRegister(lastReconcileTimestamp) + metrics.RegisterMetric.MustRegister(deprecatedRegistryErrors) + metrics.RegisterMetric.MustRegister(deprecatedSourceErrors) + 
metrics.RegisterMetric.MustRegister(controllerNoChangesTotal) + metrics.RegisterMetric.MustRegister(registryARecords) + metrics.RegisterMetric.MustRegister(registryAAAARecords) + metrics.RegisterMetric.MustRegister(sourceARecords) + metrics.RegisterMetric.MustRegister(sourceAAAARecords) + metrics.RegisterMetric.MustRegister(verifiedARecords) + metrics.RegisterMetric.MustRegister(verifiedAAAARecords) } // Controller is responsible for orchestrating the different components. @@ -203,7 +204,7 @@ type Controller struct { // RunOnce runs a single iteration of a reconciliation loop. func (c *Controller) RunOnce(ctx context.Context) error { - lastReconcileTimestamp.SetToCurrentTime() + lastReconcileTimestamp.Gauge.SetToCurrentTime() c.runAtMutex.Lock() c.lastRunAt = time.Now() @@ -211,30 +212,30 @@ func (c *Controller) RunOnce(ctx context.Context) error { records, err := c.Registry.Records(ctx) if err != nil { - registryErrorsTotal.Inc() - deprecatedRegistryErrors.Inc() + registryErrorsTotal.Counter.Inc() + deprecatedRegistryErrors.Counter.Inc() return err } - registryEndpointsTotal.Set(float64(len(records))) + registryEndpointsTotal.Gauge.Set(float64(len(records))) regARecords, regAAAARecords := countAddressRecords(records) - registryARecords.Set(float64(regARecords)) - registryAAAARecords.Set(float64(regAAAARecords)) + registryARecords.Gauge.Set(float64(regARecords)) + registryAAAARecords.Gauge.Set(float64(regAAAARecords)) ctx = context.WithValue(ctx, provider.RecordsContextKey, records) endpoints, err := c.Source.Endpoints(ctx) if err != nil { - sourceErrorsTotal.Inc() - deprecatedSourceErrors.Inc() + sourceErrorsTotal.Counter.Inc() + deprecatedSourceErrors.Counter.Inc() return err } - sourceEndpointsTotal.Set(float64(len(endpoints))) + sourceEndpointsTotal.Gauge.Set(float64(len(endpoints))) srcARecords, srcAAAARecords := countAddressRecords(endpoints) - sourceARecords.Set(float64(srcARecords)) - sourceAAAARecords.Set(float64(srcAAAARecords)) + 
sourceARecords.Gauge.Set(float64(srcARecords)) + sourceAAAARecords.Gauge.Set(float64(srcAAAARecords)) vARecords, vAAAARecords := countMatchingAddressRecords(endpoints, records) - verifiedARecords.Set(float64(vARecords)) - verifiedAAAARecords.Set(float64(vAAAARecords)) + verifiedARecords.Gauge.Set(float64(vARecords)) + verifiedAAAARecords.Gauge.Set(float64(vAAAARecords)) endpoints, err = c.Registry.AdjustEndpoints(endpoints) if err != nil { return fmt.Errorf("adjusting endpoints: %w", err) @@ -256,16 +257,16 @@ func (c *Controller) RunOnce(ctx context.Context) error { if plan.Changes.HasChanges() { err = c.Registry.ApplyChanges(ctx, plan.Changes) if err != nil { - registryErrorsTotal.Inc() - deprecatedRegistryErrors.Inc() + registryErrorsTotal.Counter.Inc() + deprecatedRegistryErrors.Counter.Inc() return err } } else { - controllerNoChangesTotal.Inc() + controllerNoChangesTotal.Counter.Inc() log.Info("All records are already up to date") } - lastSyncTimestamp.SetToCurrentTime() + lastSyncTimestamp.Gauge.SetToCurrentTime() return nil } diff --git a/controller/controller_test.go b/controller/controller_test.go index c6074f833..a5fca2c51 100644 --- a/controller/controller_test.go +++ b/controller/controller_test.go @@ -234,8 +234,8 @@ func TestRunOnce(t *testing.T) { // Validate that the mock source was called. source.AssertExpectations(t) // check the verified records - assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedARecords)) - assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedAAAARecords)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedARecords.Gauge)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedAAAARecords.Gauge)) } // TestRun tests that Run correctly starts and stops @@ -268,8 +268,8 @@ func TestRun(t *testing.T) { // Validate that the mock source was called. 
source.AssertExpectations(t) // check the verified records - assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedARecords)) - assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedAAAARecords)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedARecords.Gauge)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(verifiedAAAARecords.Gauge)) } func valueFromMetric(metric prometheus.Gauge) uint64 { @@ -520,7 +520,7 @@ func TestVerifyARecords(t *testing.T) { }, []*plan.Changes{}, ) - assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedARecords)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedARecords.Gauge)) testControllerFiltersDomains( t, @@ -564,8 +564,8 @@ func TestVerifyARecords(t *testing.T) { }, }}, ) - assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedARecords)) - assert.Equal(t, math.Float64bits(0), valueFromMetric(verifiedAAAARecords)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedARecords.Gauge)) + assert.Equal(t, math.Float64bits(0), valueFromMetric(verifiedAAAARecords.Gauge)) } func TestVerifyAAAARecords(t *testing.T) { @@ -598,7 +598,7 @@ func TestVerifyAAAARecords(t *testing.T) { }, []*plan.Changes{}, ) - assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedAAAARecords)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedAAAARecords.Gauge)) testControllerFiltersDomains( t, @@ -642,8 +642,8 @@ func TestVerifyAAAARecords(t *testing.T) { }, }}, ) - assert.Equal(t, math.Float64bits(0), valueFromMetric(verifiedARecords)) - assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedAAAARecords)) + assert.Equal(t, math.Float64bits(0), valueFromMetric(verifiedARecords.Gauge)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(verifiedAAAARecords.Gauge)) } func TestARecords(t *testing.T) { @@ -689,8 +689,8 @@ func TestARecords(t *testing.T) { }, }}, ) - assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceARecords)) - assert.Equal(t, 
math.Float64bits(1), valueFromMetric(registryARecords)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceARecords.Gauge)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(registryARecords.Gauge)) } func TestAAAARecords(t *testing.T) { @@ -736,6 +736,6 @@ func TestAAAARecords(t *testing.T) { }, }}, ) - assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords)) - assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords)) + assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge)) + assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge)) } diff --git a/docs/contributing/dev-guide.md b/docs/contributing/dev-guide.md index 8f220ce35..6c474f144 100644 --- a/docs/contributing/dev-guide.md +++ b/docs/contributing/dev-guide.md @@ -38,10 +38,11 @@ make test make cover-html ``` -If added any flags, re-generate flags documentation +If added any flags or metrics, re-generate documentation ```shell make generate-flags-documentation +make generate-metrics-documentation ``` We require all changes to be covered by acceptance tests and/or unit tests, depending on the situation. diff --git a/docs/faq.md b/docs/faq.md index c3d956e5e..4bdb4bbdb 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -182,48 +182,6 @@ or You need to add either https://www.googleapis.com/auth/ndev.clouddns.readwrite or https://www.googleapis.com/auth/cloud-platform on your instance group's scope. -## What metrics can I get from ExternalDNS and what do they mean? - -ExternalDNS exposes 2 types of metrics: Sources and Registry errors. - -`Source`s are mostly Kubernetes API objects. Examples of `source` errors may be connection errors to the Kubernetes API server itself or missing RBAC permissions. -It can also stem from incompatible configuration in the objects itself like invalid characters, processing a broken fqdnTemplate, etc. 
- -`Registry` errors are mostly Provider errors, unless there's some coding flaw in the registry package. Provider errors often arise due to accessing their APIs due to network or missing cloud-provider permissions when reading records. -When applying a changeset, errors will arise if the changeset applied is incompatible with the current state. - -In case of an increased error count, you could correlate them with the `http_request_duration_seconds{handler="instrumented_http"}` metric which should show increased numbers for status codes 4xx (permissions, configuration, invalid changeset) or 5xx (apiserver down). - -You can use the host label in the metric to figure out if the request was against the Kubernetes API server (Source errors) or the DNS provider API (Registry/Provider errors). - -Here is the full list of available metrics provided by ExternalDNS: - -| Name | Description | Type | -| -------------------------------------------------------- | ------------------------------------------------------------------ | ------- | -| external_dns_controller_last_sync_timestamp_seconds | Timestamp of last successful sync with the DNS provider | Gauge | -| external_dns_controller_last_reconcile_timestamp_seconds | Timestamp of last attempted sync with the DNS provider | Gauge | -| external_dns_registry_endpoints_total | Number of Endpoints in all sources | Gauge | -| external_dns_registry_errors_total | Number of Registry errors | Counter | -| external_dns_source_endpoints_total | Number of Endpoints in the registry | Gauge | -| external_dns_source_errors_total | Number of Source errors | Counter | -| external_dns_controller_verified_aaaa_records | Number of DNS AAAA-records that exists both in source and registry | Gauge | -| external_dns_controller_verified_a_records | Number of DNS A-records that exists both in source and registry | Gauge | -| external_dns_registry_aaaa_records | Number of AAAA records in registry | Gauge | -| external_dns_registry_a_records | Number 
of A records in registry | Gauge | -| external_dns_source_aaaa_records | Number of AAAA records in source | Gauge | -| external_dns_source_a_records | Number of A records in source | Gauge | - -If you're using the webhook provider, the following additional metrics will be provided: - -| Name | Description | Type | -| ------------------------------------------------------------ | ------------------------------------------------------ | ------- | -| external_dns_webhook_provider_records_errors_total | Number of errors with the /records method | Gauge | -| external_dns_webhook_provider_records_requests_total | Number of requests made to the /records method | Gauge | -| external_dns_webhook_provider_applychanges_errors_total | Number of errors with the /applychanges method | Gauge | -| external_dns_webhook_provider_applychanges_requests_total | Number of requests made to the /applychanges method | Gauge | -| external_dns_webhook_provider_adjustendpoints_errors_total | Number of errors with the /adjustendpoints method | Gauge | -| external_dns_webhook_provider_adjustendpoints_requests_total | Number of requests made to the /adjustendpoints method | Gauge | - ## How can I run ExternalDNS under a specific GCP Service Account, e.g. to access DNS records in other projects? Have a look at https://github.com/linki/mate/blob/v0.6.2/examples/google/README.md#permissions diff --git a/docs/monitoring/index.md b/docs/monitoring/index.md new file mode 100644 index 000000000..f796a939e --- /dev/null +++ b/docs/monitoring/index.md @@ -0,0 +1,50 @@ +# Monitoring & Observability + +Monitoring is a crucial aspect of maintaining the health and performance of your applications. +It involves collecting, analyzing, and using information to ensure that your system is running smoothly and efficiently. Effective monitoring helps in identifying issues early, understanding system behavior, and making informed decisions to improve performance and reliability. 
+ +For `external-dns`, all metrics available for scraping are exposed on the `/metrics` endpoint. The metrics are in the Prometheus exposition format, which is widely used for monitoring and alerting. + +To access the metrics: + +```sh +curl https://localhost:7979/metrics +``` + +In the metrics output, you'll see the help text, type information, and current value of the `external_dns_registry_endpoints_total` gauge: + +```yml +# HELP external_dns_registry_endpoints_total Number of Endpoints in the registry +# TYPE external_dns_registry_endpoints_total gauge +external_dns_registry_endpoints_total 11 +``` + +You can configure a locally running [Prometheus instance](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) to scrape metrics from the application. Here's an example prometheus.yml configuration: + +```yml +scrape_configs: +- job_name: external-dns + scrape_interval: 10s + static_configs: + - targets: + - localhost:7979 +``` + +For more detailed information on how to instrument an application with Prometheus, you can refer to the [Prometheus Go client library documentation](https://prometheus.io/docs/guides/go-application/). + +## What metrics can I get from ExternalDNS and what do they mean? + +- The project maintains a [metrics page](./metrics.md) with a list of supported custom metrics. +- [Go runtime](https://pkg.go.dev/runtime/metrics#hdr-Supported_metrics) metrics are also available for scraping. + +ExternalDNS exposes 3 types of metrics: Sources, Registry errors and Cache hits. + +`Source`s are mostly Kubernetes API objects. Examples of `source` errors may be connection errors to the Kubernetes API server itself or missing RBAC permissions. +It can also stem from incompatible configuration in the objects themselves like invalid characters, processing a broken fqdnTemplate, etc. + +`Registry` errors are mostly Provider errors, unless there's some coding flaw in the registry package. 
Provider errors often arise when accessing their APIs, due to network issues or missing cloud-provider permissions when reading records. +When applying a changeset, errors will arise if the changeset applied is incompatible with the current state. + +In case of an increased error count, you could correlate it with the `http_request_duration_seconds{handler="instrumented_http"}` metric which should show increased numbers for status codes 4xx (permissions, configuration, invalid changeset) or 5xx (apiserver down). + +You can use the host label in the metric to figure out if the request was against the Kubernetes API server (Source errors) or the DNS provider API (Registry/Provider errors). diff --git a/docs/monitoring/metrics.md b/docs/monitoring/metrics.md new file mode 100644 index 000000000..ae4752913 --- /dev/null +++ b/docs/monitoring/metrics.md @@ -0,0 +1,89 @@ +# Available Metrics + + + + + +All metrics available for scraping are exposed on the `/metrics` endpoint. +The metrics are in the Prometheus exposition format. + +To access the metrics: + +```sh +curl https://localhost:7979/metrics +``` + +## Supported Metrics + +> Full metric name is constructed as follows: +> `external_dns_<subsystem>_<name>` + +| Name | Metric Type | Subsystem | Help | +|:---------------------------------|:------------|:------------|:------------------------------------------------------| +| last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider | +| last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider | +| no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. | +| verified_a_records | Gauge | controller | Number of DNS A-records that exists both in source and registry. | +| verified_aaaa_records | Gauge | controller | Number of DNS AAAA-records that exists both in source and registry. 
| +| cache_apply_changes_calls | Counter | provider | Number of calls to the provider cache ApplyChanges. | +| cache_records_calls | Counter | provider | Number of calls to the provider cache Records list. | +| a_records | Gauge | registry | Number of Registry A records. | +| aaaa_records | Gauge | registry | Number of Registry AAAA records. | +| endpoints_total | Gauge | registry | Number of Endpoints in the registry | +| errors_total | Counter | registry | Number of Registry errors. | +| a_records | Gauge | source | Number of Source A records. | +| aaaa_records | Gauge | source | Number of Source AAAA records. | +| endpoints_total | Gauge | source | Number of Endpoints in all sources | +| errors_total | Counter | source | Number of Source errors. | +| adjustendpoints_errors_total | Gauge | webhook_provider | Errors with AdjustEndpoints method | +| adjustendpoints_requests_total | Gauge | webhook_provider | Requests with AdjustEndpoints method | +| applychanges_errors_total | Gauge | webhook_provider | Errors with ApplyChanges method | +| applychanges_requests_total | Gauge | webhook_provider | Requests with ApplyChanges method | +| records_errors_total | Gauge | webhook_provider | Errors with Records method | +| records_requests_total | Gauge | webhook_provider | Requests with Records method | + +## Available Go Runtime Metrics + +> The following Go runtime metrics are available for scraping. Please note that they may change over time and they are OS dependent. 
+ +| Name | +|:----------------------| +| go_gc_duration_seconds | +| go_gc_gogc_percent | +| go_gc_gomemlimit_bytes | +| go_goroutines | +| go_info | +| go_memstats_alloc_bytes | +| go_memstats_alloc_bytes_total | +| go_memstats_buck_hash_sys_bytes | +| go_memstats_frees_total | +| go_memstats_gc_sys_bytes | +| go_memstats_heap_alloc_bytes | +| go_memstats_heap_idle_bytes | +| go_memstats_heap_inuse_bytes | +| go_memstats_heap_objects | +| go_memstats_heap_released_bytes | +| go_memstats_heap_sys_bytes | +| go_memstats_last_gc_time_seconds | +| go_memstats_mallocs_total | +| go_memstats_mcache_inuse_bytes | +| go_memstats_mcache_sys_bytes | +| go_memstats_mspan_inuse_bytes | +| go_memstats_mspan_sys_bytes | +| go_memstats_next_gc_bytes | +| go_memstats_other_sys_bytes | +| go_memstats_stack_inuse_bytes | +| go_memstats_stack_sys_bytes | +| go_memstats_sys_bytes | +| go_sched_gomaxprocs_threads | +| go_threads | +| http_request_duration_seconds | +| process_cpu_seconds_total | +| process_max_fds | +| process_open_fds | +| process_resident_memory_bytes | +| process_start_time_seconds | +| process_virtual_memory_bytes | +| process_virtual_memory_max_bytes | +| process_network_receive_bytes_total | +| process_network_transmit_bytes_total | diff --git a/internal/gen/docs/flags/main.go b/internal/gen/docs/flags/main.go index c86b78608..dc8053d6b 100644 --- a/internal/gen/docs/flags/main.go +++ b/internal/gen/docs/flags/main.go @@ -1,5 +1,5 @@ /* -Copyright 2017 The Kubernetes Authors. +Copyright 2025 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,14 +18,21 @@ package main import ( "bytes" + "embed" "fmt" "os" "strings" "text/template" + "sigs.k8s.io/external-dns/internal/gen/docs/utils" cfg "sigs.k8s.io/external-dns/pkg/apis/externaldns" ) +var ( + //go:embed "templates/*" + templates embed.FS +) + type Flag struct { Name string Description string @@ -37,19 +44,6 @@ func (f *Flags) addFlag(name, description string) { *f = append(*f, Flag{Name: name, Description: description}) } -const markdownTemplate = `# Flags - - - - - -| Flag | Description | -| :------ | :----------- | -{{- range . }} -| {{ .Name }} | {{ .Description }} | -{{- end -}} -` - // It generates a markdown file // with the supported flags and writes it to the 'docs/flags.md' file. // to re-generate `docs/flags.md` execute 'go run internal/gen/main.go' @@ -64,7 +58,7 @@ func main() { _ = fmt.Errorf("failed to generate markdown file '%s': %v\n", path, err.Error()) } content = content + "\n" - _ = writeToFile(path, content) + _ = utils.WriteToFile(path, content) } func computeFlags() Flags { @@ -94,25 +88,13 @@ func computeFlags() Flags { } func (f *Flags) generateMarkdownTable() (string, error) { - tmpl := template.Must(template.New("flags.md.tpl").Parse(markdownTemplate)) + tmpl := template.New("").Funcs(utils.FuncMap()) + template.Must(tmpl.ParseFS(templates, "templates/*.gotpl")) var b bytes.Buffer - err := tmpl.Execute(&b, f) + err := tmpl.ExecuteTemplate(&b, "flags.gotpl", f) if err != nil { return "", err } return b.String(), nil } - -func writeToFile(filename string, content string) error { - file, fileErr := os.Create(filename) - if fileErr != nil { - _ = fmt.Errorf("failed to create file: %v", fileErr) - } - defer file.Close() - _, writeErr := file.WriteString(content) - if writeErr != nil { - _ = fmt.Errorf("failed to write to file: %s", filename) - } - return nil -} diff --git a/internal/gen/docs/flags/main_test.go b/internal/gen/docs/flags/main_test.go index 60fa2c119..83b1ca3f3 100644 --- 
a/internal/gen/docs/flags/main_test.go +++ b/internal/gen/docs/flags/main_test.go @@ -1,5 +1,5 @@ /* -Copyright 2017 The Kubernetes Authors. +Copyright 2025 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/internal/gen/docs/flags/templates/flags.gotpl b/internal/gen/docs/flags/templates/flags.gotpl new file mode 100644 index 000000000..bfc32d564 --- /dev/null +++ b/internal/gen/docs/flags/templates/flags.gotpl @@ -0,0 +1,11 @@ +# Flags + + + + + +| Flag | Description | +| :------ | :----------- | +{{- range . }} +| {{ .Name }} | {{ .Description }} | +{{- end -}} diff --git a/internal/gen/docs/metrics/main.go b/internal/gen/docs/metrics/main.go new file mode 100644 index 000000000..f529a60c6 --- /dev/null +++ b/internal/gen/docs/metrics/main.go @@ -0,0 +1,122 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "bytes" + "embed" + "fmt" + "os" + "reflect" + "sort" + "strings" + "text/template" + "unsafe" + + "github.com/prometheus/client_golang/prometheus" + + "sigs.k8s.io/external-dns/internal/gen/docs/utils" + "sigs.k8s.io/external-dns/pkg/metrics" + + // the imports is necessary for the code generation process. 
+ _ "sigs.k8s.io/external-dns/controller" + _ "sigs.k8s.io/external-dns/provider" + _ "sigs.k8s.io/external-dns/provider/webhook" +) + +var ( + //go:embed "templates/*" + templates embed.FS +) + +func main() { + testPath, _ := os.Getwd() + path := fmt.Sprintf("%s/docs/monitoring/metrics.md", testPath) + fmt.Printf("generate file '%s' with configured metrics\n", path) + + content, err := generateMarkdownTable(metrics.RegisterMetric, true) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to generate markdown file '%s': %v\n", path, err) + os.Exit(1) + } + content = content + "\n" + _ = utils.WriteToFile(path, content) +} + +func generateMarkdownTable(m *metrics.MetricRegistry, withRuntime bool) (string, error) { + tmpl := template.New("").Funcs(utils.FuncMap()) + template.Must(tmpl.ParseFS(templates, "templates/*.gotpl")) + + sortMetrics(m.Metrics) + var runtimeMetrics []string + if withRuntime { + runtimeMetrics = getRuntimeMetrics(prometheus.DefaultRegisterer) + // available when promhttp.Handler() is activated + runtimeMetrics = append(runtimeMetrics, []string{ + "process_network_receive_bytes_total", + "process_network_transmit_bytes_total", + }...) + } else { + runtimeMetrics = []string{} + } + + var b bytes.Buffer + err := tmpl.ExecuteTemplate(&b, "metrics.gotpl", struct { + Metrics []*metrics.Metric + RuntimeMetrics []string + }{ + Metrics: m.Metrics, + RuntimeMetrics: runtimeMetrics, + }) + + if err != nil { + return "", err + } + return b.String(), nil +} + +// sortMetrics sorts the given slice of metrics by their subsystem and name. +// Metrics are first sorted by their subsystem, and then by their name within each subsystem. 
+func sortMetrics(metrics []*metrics.Metric) { + sort.Slice(metrics, func(i, j int) bool { + if metrics[i].Subsystem == metrics[j].Subsystem { + return metrics[i].Name < metrics[j].Name + } + return metrics[i].Subsystem < metrics[j].Subsystem + }) +} + +// getRuntimeMetrics retrieves the list of runtime metrics from the Prometheus library. +func getRuntimeMetrics(reg prometheus.Registerer) []string { + var runtimeMetrics []string + + // hacks to get the runtime metrics from prometheus library + // safe to do because it's a just a documentation generator + values := reflect.ValueOf(reg).Elem().FieldByName("dimHashesByName") + values = reflect.NewAt(values.Type(), unsafe.Pointer(values.UnsafeAddr())).Elem() + + switch v := values.Interface().(type) { + case map[string]uint64: + for k := range v { + if !strings.HasPrefix(k, "external_dns") { + runtimeMetrics = append(runtimeMetrics, k) + } + } + } + sort.Strings(runtimeMetrics) + return runtimeMetrics +} diff --git a/internal/gen/docs/metrics/main_test.go b/internal/gen/docs/metrics/main_test.go new file mode 100644 index 000000000..260ffad93 --- /dev/null +++ b/internal/gen/docs/metrics/main_test.go @@ -0,0 +1,133 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "io/fs" + "math/rand/v2" + "os" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "sigs.k8s.io/external-dns/pkg/metrics" +) + +const pathToDocs = "%s/../../../../docs/monitoring" + +func TestComputeMetrics(t *testing.T) { + reg := metrics.RegisterMetric + + if len(reg.Metrics) == 0 { + t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics)) + } + + assert.Len(t, reg.Metrics, 21) +} + +func TestGenerateMarkdownTableRenderer(t *testing.T) { + reg := metrics.NewMetricsRegister() + + got, err := generateMarkdownTable(reg, false) + assert.NoError(t, err) + + assert.Contains(t, got, "# Available Metrics\n\n\n") + assert.Contains(t, got, "| Metric Type | Subsystem | Help") +} + +func TestGenerateMarkdownTableWithSingleMetric(t *testing.T) { + reg := metrics.NewMetricsRegister() + + reg.MustRegister(metrics.NewGaugeWithOpts( + prometheus.GaugeOpts{ + Namespace: "external_dns", + Subsystem: fmt.Sprintf("controller_%d", rand.IntN(100)), + Name: "verified_aaaa_records", + Help: "This is just a test.", + }, + )) + + got, err := generateMarkdownTable(reg, false) + assert.NoError(t, err) + + assert.Contains(t, got, "verified_aaaa_records") + assert.Contains(t, got, "This is just a test.") +} + +func TestMetricsMdUpToDate(t *testing.T) { + testPath, _ := os.Getwd() + fsys := os.DirFS(fmt.Sprintf(pathToDocs, testPath)) + fileName := "metrics.md" + expected, err := fs.ReadFile(fsys, fileName) + assert.NoError(t, err, "expected file %s to exist", fileName) + + reg := metrics.RegisterMetric + actual, err := generateMarkdownTable(reg, false) + assert.NoError(t, err) + assert.Contains(t, string(expected), actual) +} + +func TestMetricsMdExtraMetricAdded(t *testing.T) { + testPath, _ := os.Getwd() + fsys := os.DirFS(fmt.Sprintf(pathToDocs, testPath)) + fileName := "metrics.md" + expected, err := fs.ReadFile(fsys, fileName) + assert.NoError(t, err, "expected file %s to 
exist", fileName) + + reg := metrics.RegisterMetric + + reg.MustRegister(metrics.NewGaugeWithOpts( + prometheus.GaugeOpts{ + Namespace: "external_dns", + Subsystem: fmt.Sprintf("controller_%d", rand.IntN(100)), + Name: "verified_aaaa_records", + Help: "This is just a test.", + }, + )) + + actual, err := generateMarkdownTable(reg, false) + assert.NoError(t, err) + assert.NotEqual(t, string(expected), actual) +} + +func TestGetRuntimeMetricsForNewRegistry(t *testing.T) { + reg := prometheus.NewRegistry() + // Register some runtime metrics + reg.MustRegister(prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "go_goroutines", + Help: "Number of goroutines that currently exist.", + })) + reg.MustRegister(prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "go_memstats_alloc_bytes", + Help: "Number of bytes allocated and still in use.", + })) + runtimeMetrics := getRuntimeMetrics(reg) + + // Check that the runtime metrics are correctly retrieved + expectedMetrics := []string{"go_goroutines", "go_memstats_alloc_bytes"} + assert.ElementsMatch(t, expectedMetrics, runtimeMetrics) + assert.Len(t, runtimeMetrics, 2) +} + +func TestGetRuntimeMetricsForDefaultRegistry(t *testing.T) { + reg := prometheus.DefaultRegisterer + runtimeMetrics := getRuntimeMetrics(reg) + if len(runtimeMetrics) == 0 { + t.Errorf("Expected not empty runtime metrics, got %d", len(runtimeMetrics)) + } +} diff --git a/internal/gen/docs/metrics/templates/metrics.gotpl b/internal/gen/docs/metrics/templates/metrics.gotpl new file mode 100644 index 000000000..9ca9769ef --- /dev/null +++ b/internal/gen/docs/metrics/templates/metrics.gotpl @@ -0,0 +1,37 @@ +# Available Metrics + + + + + +All metrics available for scraping are exposed on the {{backtick 1}}/metrics{{backtick 1}} endpoint. +The metrics are in the Prometheus exposition format. 
+ +To access the metrics: + +{{backtick 3}}sh +curl https://localhost:7979/metrics +{{backtick 3}} + +## Supported Metrics + +> Full metric name is constructed as follows: +> {{backtick 1}}external_dns__{{backtick 1}} + +| Name | Metric Type | Subsystem | Help | +|:---------------------------------|:------------|:------------|:------------------------------------------------------| +{{- range .Metrics }} +| {{ .Name }} | {{ .Type | capitalize }} | {{ .Subsystem }} | {{ .Help }} | +{{- end }} + +## Available Go Runtime Metrics + +> The following Go runtime metrics are available for scraping. Please note that they may change over time and they are OS dependent. + +{{ if .RuntimeMetrics -}} +| Name | +|:----------------------| +{{- range .RuntimeMetrics }} +| {{ . }} | +{{- end -}} +{{- end -}} diff --git a/internal/gen/docs/utils/utils.go b/internal/gen/docs/utils/utils.go new file mode 100644 index 000000000..5aff64664 --- /dev/null +++ b/internal/gen/docs/utils/utils.go @@ -0,0 +1,50 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utils + +import ( + "fmt" + "os" + "strings" + "text/template" + + "golang.org/x/text/cases" + "golang.org/x/text/language" +) + +func WriteToFile(filename string, content string) error { + file, fileErr := os.Create(filename) + if fileErr != nil { + _ = fmt.Errorf("failed to create file: %v", fileErr) + } + defer file.Close() + _, writeErr := file.WriteString(content) + if writeErr != nil { + _ = fmt.Errorf("failed to write to file: %s", filename) + } + return nil +} + +// FuncMap returns a mapping of all of the functions that Engine has. +func FuncMap() template.FuncMap { + return template.FuncMap{ + "backtick": func(times int) string { + return strings.Repeat("`", times) + }, + "capitalize": cases.Title(language.English, cases.Compact).String, + } +} diff --git a/internal/gen/docs/utils/utils_test.go b/internal/gen/docs/utils/utils_test.go new file mode 100644 index 000000000..b330e9f9e --- /dev/null +++ b/internal/gen/docs/utils/utils_test.go @@ -0,0 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "fmt" + "os" + "strings" + "testing" + "text/template" + + "github.com/stretchr/testify/assert" +) + +func TestWriteToFile(t *testing.T) { + filename := fmt.Sprintf("%s/testfile", t.TempDir()) + content := "Hello, World!" 
+ + defer os.Remove(filename) + + err := WriteToFile(filename, content) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + data, err := os.ReadFile(filename) + if err != nil { + t.Fatalf("expected no error reading file, got %v", err) + } + + if string(data) != content { + t.Errorf("expected content %q, got %q", content, string(data)) + } +} + +func TestFuncs(t *testing.T) { + tests := []struct { + tpl, expect string + vars interface{} + }{ + { + tpl: `{{ backtick 3 }}`, + expect: "```", + vars: map[string]interface{}{}, + }, + { + tpl: `{{ capitalize .name }}`, + expect: "Capital", + vars: map[string]interface{}{"name": "capital"}, + }, + } + + for _, tt := range tests { + var b strings.Builder + err := template.Must(template.New("test").Funcs(FuncMap()).Parse(tt.tpl)).Execute(&b, tt.vars) + assert.NoError(t, err) + assert.Equal(t, tt.expect, b.String(), tt.tpl) + } +} diff --git a/main.go b/main.go index bc4790757..25f2b5c15 100644 --- a/main.go +++ b/main.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/external-dns/endpoint" "sigs.k8s.io/external-dns/pkg/apis/externaldns" "sigs.k8s.io/external-dns/pkg/apis/externaldns/validation" + "sigs.k8s.io/external-dns/pkg/metrics" "sigs.k8s.io/external-dns/plan" "sigs.k8s.io/external-dns/provider" "sigs.k8s.io/external-dns/provider/akamai" @@ -449,9 +450,13 @@ func handleSigterm(cancel func()) { func serveMetrics(address string) { http.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) - w.Write([]byte("OK")) + _, _ = w.Write([]byte("OK")) }) + log.Debugf("serving 'healthz' on 'localhost:%s/healthz'", address) + log.Debugf("serving 'metrics' on 'localhost:%s/metrics'", address) + log.Debugf("registered '%d' metrics", len(metrics.RegisterMetric.Metrics)) + http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(address, nil)) diff --git a/mkdocs.yml b/mkdocs.yml index c276dc74b..b491f7836 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 
@@ nav: - Advanced Topics: - Initial Design: docs/initial-design.md - Leader Election: docs/proposal/001-leader-election.md + - Monitoring: docs/monitoring/* - MultiTarget: docs/proposal/multi-target.md - NAT64: docs/nat64.md - Rate Limits: docs/rate-limits.md diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 000000000..028e78aae --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,77 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "runtime" + + "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" + + cfg "sigs.k8s.io/external-dns/pkg/apis/externaldns" +) + +var ( + RegisterMetric = NewMetricsRegister() +) + +func NewMetricsRegister() *MetricRegistry { + reg := prometheus.WrapRegistererWith( + prometheus.Labels{ + "version": cfg.Version, + "arch": runtime.GOARCH, + "go_version": runtime.Version(), + }, + prometheus.DefaultRegisterer) + return &MetricRegistry{ + Registerer: reg, + Metrics: []*Metric{}, + mName: make(map[string]bool), + } +} + +// MustRegister registers a metric if it hasn't been registered yet. +// +// Usage: MustRegister(...) 
+// Example: +// +// func init() { +// metrics.RegisterMetric.MustRegister(errorsTotal) +// } +func (m *MetricRegistry) MustRegister(cs IMetric) { + switch v := cs.(type) { + case CounterMetric, GaugeMetric, CounterVecMetric: + if _, exists := m.mName[cs.Get().FQDN]; exists { + return + } else { + m.mName[cs.Get().FQDN] = true + } + m.Metrics = append(m.Metrics, cs.Get()) + switch metric := v.(type) { + case CounterMetric: + m.Registerer.MustRegister(metric.Counter) + case GaugeMetric: + m.Registerer.MustRegister(metric.Gauge) + case CounterVecMetric: + m.Registerer.MustRegister(metric.CounterVec) + } + log.Debugf("Register metric: %s", cs.Get().FQDN) + default: + log.Warnf("Unsupported metric type: %T", v) + return + } +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go new file mode 100644 index 000000000..878ed784a --- /dev/null +++ b/pkg/metrics/metrics_test.go @@ -0,0 +1,103 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "sigs.k8s.io/external-dns/internal/testutils" +) + +type MockMetric struct { + FQDN string +} + +func (m *MockMetric) Get() *Metric { + return &Metric{FQDN: m.FQDN} +} + +func TestMustRegister(t *testing.T) { + tests := []struct { + name string + metrics []IMetric + expected int + }{ + { + name: "single metric", + metrics: []IMetric{ + NewCounterWithOpts(prometheus.CounterOpts{Name: "test_counter_1"}), + }, + expected: 1, + }, + { + name: "two metrics", + metrics: []IMetric{ + NewGaugeWithOpts(prometheus.GaugeOpts{Name: "test_gauge_2", Subsystem: "test"}), + NewCounterWithOpts(prometheus.CounterOpts{Name: "test_counter_2", Subsystem: "app"}), + }, + expected: 2, + }, + { + name: "mix of metrics", + metrics: []IMetric{ + NewGaugeWithOpts(prometheus.GaugeOpts{Name: "test_gauge_3"}), + NewCounterWithOpts(prometheus.CounterOpts{Name: "test_counter_3"}), + NewCounterVecWithOpts(prometheus.CounterOpts{Name: "test_counter_vec_3"}, []string{"label"}), + }, + expected: 3, + }, + { + name: "unsupported metric", + metrics: []IMetric{ + &MockMetric{FQDN: "unsupported_metric"}, + }, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + registry := NewMetricsRegister() + for _, m := range tt.metrics { + registry.MustRegister(m) + } + assert.Len(t, registry.Metrics, tt.expected) + }) + } +} + +func TestUnsupportedMetricWarning(t *testing.T) { + buf := testutils.LogsToBuffer(log.WarnLevel, t) + registry := NewMetricsRegister() + mockUnsupported := &MockMetric{FQDN: "unsupported_metric"} + registry.MustRegister(mockUnsupported) + assert.NotContains(t, registry.mName, "unsupported_metric") + + assert.Contains(t, buf.String(), "Unsupported metric type: *metrics.MockMetric") +} + +func TestNewMetricsRegister(t *testing.T) { + registry := NewMetricsRegister() + + 
assert.NotNil(t, registry) + assert.NotNil(t, registry.Registerer) + assert.Empty(t, registry.Metrics) + assert.Empty(t, registry.mName) +} diff --git a/pkg/metrics/models.go b/pkg/metrics/models.go new file mode 100644 index 000000000..8878a5887 --- /dev/null +++ b/pkg/metrics/models.go @@ -0,0 +1,111 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "fmt" + + "github.com/prometheus/client_golang/prometheus" +) + +type MetricRegistry struct { + Registerer prometheus.Registerer + Metrics []*Metric + mName map[string]bool +} + +type Metric struct { + Type string + Namespace string + Subsystem string + Name string + Help string + FQDN string +} + +type IMetric interface { + Get() *Metric +} + +type GaugeMetric struct { + Metric + Gauge prometheus.Gauge +} + +func (g GaugeMetric) Get() *Metric { + return &g.Metric +} + +type CounterMetric struct { + Metric + Counter prometheus.Counter +} + +func (g CounterMetric) Get() *Metric { + return &g.Metric +} + +type CounterVecMetric struct { + Metric + CounterVec *prometheus.CounterVec +} + +func (g CounterVecMetric) Get() *Metric { + return &g.Metric +} + +func NewGaugeWithOpts(opts prometheus.GaugeOpts) GaugeMetric { + return GaugeMetric{ + Metric: Metric{ + Type: "gauge", + Name: opts.Name, + FQDN: fmt.Sprintf("%s_%s", opts.Subsystem, opts.Name), + Namespace: opts.Namespace, + Subsystem: opts.Subsystem, + Help: opts.Help, + }, + Gauge: prometheus.NewGauge(opts), + } 
+} + +func NewCounterWithOpts(opts prometheus.CounterOpts) CounterMetric { + return CounterMetric{ + Metric: Metric{ + Type: "counter", + Name: opts.Name, + FQDN: fmt.Sprintf("%s_%s", opts.Subsystem, opts.Name), + Namespace: opts.Namespace, + Subsystem: opts.Subsystem, + Help: opts.Help, + }, + Counter: prometheus.NewCounter(opts), + } +} + +func NewCounterVecWithOpts(opts prometheus.CounterOpts, labelNames []string) CounterVecMetric { + return CounterVecMetric{ + Metric: Metric{ + Type: "counter", + Name: opts.Name, + FQDN: fmt.Sprintf("%s_%s", opts.Subsystem, opts.Name), + Namespace: opts.Namespace, + Subsystem: opts.Subsystem, + Help: opts.Help, + }, + CounterVec: prometheus.NewCounterVec(opts, labelNames), + } +} diff --git a/pkg/metrics/models_test.go b/pkg/metrics/models_test.go new file mode 100644 index 000000000..aad967c4a --- /dev/null +++ b/pkg/metrics/models_test.go @@ -0,0 +1,83 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" +) + +func TestNewGaugeWithOpts(t *testing.T) { + opts := prometheus.GaugeOpts{ + Name: "test_gauge", + Namespace: "test_namespace", + Subsystem: "test_subsystem", + Help: "This is a test gauge", + } + + gaugeMetric := NewGaugeWithOpts(opts) + + assert.Equal(t, "gauge", gaugeMetric.Type) + assert.Equal(t, "test_gauge", gaugeMetric.Name) + assert.Equal(t, "test_namespace", gaugeMetric.Namespace) + assert.Equal(t, "test_subsystem", gaugeMetric.Subsystem) + assert.Equal(t, "This is a test gauge", gaugeMetric.Help) + assert.Equal(t, "test_subsystem_test_gauge", gaugeMetric.FQDN) + assert.NotNil(t, gaugeMetric.Gauge) +} + +func TestNewCounterWithOpts(t *testing.T) { + opts := prometheus.CounterOpts{ + Name: "test_counter", + Namespace: "test_namespace", + Subsystem: "test_subsystem", + Help: "This is a test counter", + } + + counterMetric := NewCounterWithOpts(opts) + + assert.Equal(t, "counter", counterMetric.Type) + assert.Equal(t, "test_counter", counterMetric.Name) + assert.Equal(t, "test_namespace", counterMetric.Namespace) + assert.Equal(t, "test_subsystem", counterMetric.Subsystem) + assert.Equal(t, "This is a test counter", counterMetric.Help) + assert.Equal(t, "test_subsystem_test_counter", counterMetric.FQDN) + assert.NotNil(t, counterMetric.Counter) +} + +func TestNewCounterVecWithOpts(t *testing.T) { + opts := prometheus.CounterOpts{ + Name: "test_counter_vec", + Namespace: "test_namespace", + Subsystem: "test_subsystem", + Help: "This is a test counter vector", + } + + labelNames := []string{"label1", "label2"} + + counterVecMetric := NewCounterVecWithOpts(opts, labelNames) + + assert.Equal(t, "counter", counterVecMetric.Type) + assert.Equal(t, "test_counter_vec", counterVecMetric.Name) + assert.Equal(t, "test_namespace", counterVecMetric.Namespace) + assert.Equal(t, "test_subsystem", counterVecMetric.Subsystem) + 
assert.Equal(t, "This is a test counter vector", counterVecMetric.Help) + assert.Equal(t, "test_subsystem_test_counter_vec", counterVecMetric.FQDN) + assert.NotNil(t, counterVecMetric.CounterVec) +} diff --git a/provider/cached_provider.go b/provider/cached_provider.go index 6009c77d7..8f86d704c 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -13,22 +13,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package provider import ( "context" - "sync" "time" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/pkg/metrics" "sigs.k8s.io/external-dns/plan" ) var ( - cachedRecordsCallsTotal = prometheus.NewCounterVec( + cachedRecordsCallsTotal = metrics.NewCounterVecWithOpts( prometheus.CounterOpts{ Namespace: "external_dns", Subsystem: "provider", @@ -39,7 +40,7 @@ var ( "from_cache", }, ) - cachedApplyChangesCallsTotal = prometheus.NewCounter( + cachedApplyChangesCallsTotal = metrics.NewCounterWithOpts( prometheus.CounterOpts{ Namespace: "external_dns", Subsystem: "provider", @@ -47,10 +48,13 @@ var ( Help: "Number of calls to the provider cache ApplyChanges.", }, ) - - registerCacheProviderMetrics = sync.Once{} ) +func init() { + metrics.RegisterMetric.MustRegister(cachedRecordsCallsTotal) + metrics.RegisterMetric.MustRegister(cachedApplyChangesCallsTotal) +} + type CachedProvider struct { Provider RefreshDelay time.Duration @@ -59,9 +63,6 @@ type CachedProvider struct { } func NewCachedProvider(provider Provider, refreshDelay time.Duration) *CachedProvider { - registerCacheProviderMetrics.Do(func() { - prometheus.MustRegister(cachedRecordsCallsTotal) - }) return &CachedProvider{ Provider: provider, RefreshDelay: refreshDelay, @@ -78,10 +79,10 @@ func (c *CachedProvider) Records(ctx context.Context) 
([]*endpoint.Endpoint, err } c.cache = records c.lastRead = time.Now() - cachedRecordsCallsTotal.WithLabelValues("false").Inc() + cachedRecordsCallsTotal.CounterVec.WithLabelValues("false").Inc() } else { log.Debug("Records cache provider: using records list from cache") - cachedRecordsCallsTotal.WithLabelValues("true").Inc() + cachedRecordsCallsTotal.CounterVec.WithLabelValues("true").Inc() } return c.cache, nil } @@ -91,7 +92,7 @@ func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes return nil } c.Reset() - cachedApplyChangesCallsTotal.Inc() + cachedApplyChangesCallsTotal.Counter.Inc() return c.Provider.ApplyChanges(ctx, changes) } diff --git a/provider/webhook/webhook.go b/provider/webhook/webhook.go index 8cae0850e..1a1558d4e 100644 --- a/provider/webhook/webhook.go +++ b/provider/webhook/webhook.go @@ -25,11 +25,12 @@ import ( "net/url" "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/pkg/metrics" "sigs.k8s.io/external-dns/plan" "sigs.k8s.io/external-dns/provider" webhookapi "sigs.k8s.io/external-dns/provider/webhook/api" - backoff "github.com/cenkalti/backoff/v4" + "github.com/cenkalti/backoff/v4" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -40,7 +41,7 @@ const ( ) var ( - recordsErrorsGauge = prometheus.NewGauge( + recordsErrorsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -48,7 +49,7 @@ var ( Help: "Errors with Records method", }, ) - recordsRequestsGauge = prometheus.NewGauge( + recordsRequestsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -56,7 +57,7 @@ var ( Help: "Requests with Records method", }, ) - applyChangesErrorsGauge = prometheus.NewGauge( + applyChangesErrorsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -64,7 +65,7 @@ var ( Help: "Errors with 
ApplyChanges method", }, ) - applyChangesRequestsGauge = prometheus.NewGauge( + applyChangesRequestsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -72,7 +73,7 @@ var ( Help: "Requests with ApplyChanges method", }, ) - adjustEndpointsErrorsGauge = prometheus.NewGauge( + adjustEndpointsErrorsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -80,7 +81,7 @@ var ( Help: "Errors with AdjustEndpoints method", }, ) - adjustEndpointsRequestsGauge = prometheus.NewGauge( + adjustEndpointsRequestsGauge = metrics.NewGaugeWithOpts( prometheus.GaugeOpts{ Namespace: "external_dns", Subsystem: "webhook_provider", @@ -97,12 +98,12 @@ type WebhookProvider struct { } func init() { - prometheus.MustRegister(recordsErrorsGauge) - prometheus.MustRegister(recordsRequestsGauge) - prometheus.MustRegister(applyChangesErrorsGauge) - prometheus.MustRegister(applyChangesRequestsGauge) - prometheus.MustRegister(adjustEndpointsErrorsGauge) - prometheus.MustRegister(adjustEndpointsRequestsGauge) + metrics.RegisterMetric.MustRegister(recordsErrorsGauge) + metrics.RegisterMetric.MustRegister(recordsRequestsGauge) + metrics.RegisterMetric.MustRegister(applyChangesErrorsGauge) + metrics.RegisterMetric.MustRegister(applyChangesRequestsGauge) + metrics.RegisterMetric.MustRegister(adjustEndpointsErrorsGauge) + metrics.RegisterMetric.MustRegister(adjustEndpointsRequestsGauge) } func NewWebhookProvider(u string) (*WebhookProvider, error) { @@ -160,26 +161,26 @@ func NewWebhookProvider(u string) (*WebhookProvider, error) { // Records will make a GET call to remoteServerURL/records and return the results func (p WebhookProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { - recordsRequestsGauge.Inc() + recordsRequestsGauge.Gauge.Inc() u := p.remoteServerURL.JoinPath("records").String() req, err := http.NewRequest("GET", u, nil) if err != nil { - 
recordsErrorsGauge.Inc() + recordsErrorsGauge.Gauge.Inc() log.Debugf("Failed to create request: %s", err.Error()) return nil, err } req.Header.Set(acceptHeader, webhookapi.MediaTypeFormatAndVersion) resp, err := p.client.Do(req) if err != nil { - recordsErrorsGauge.Inc() + recordsErrorsGauge.Gauge.Inc() log.Debugf("Failed to perform request: %s", err.Error()) return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - recordsErrorsGauge.Inc() + recordsErrorsGauge.Gauge.Inc() log.Debugf("Failed to get records with code %d", resp.StatusCode) err := fmt.Errorf("failed to get records with code %d", resp.StatusCode) if isRetryableError(resp.StatusCode) { @@ -190,7 +191,7 @@ func (p WebhookProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, err endpoints := []*endpoint.Endpoint{} if err := json.NewDecoder(resp.Body).Decode(&endpoints); err != nil { - recordsErrorsGauge.Inc() + recordsErrorsGauge.Gauge.Inc() log.Debugf("Failed to decode response body: %s", err.Error()) return nil, err } @@ -199,19 +200,19 @@ func (p WebhookProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, err // ApplyChanges will make a POST to remoteServerURL/records with the changes func (p WebhookProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error { - applyChangesRequestsGauge.Inc() + applyChangesRequestsGauge.Gauge.Inc() u := p.remoteServerURL.JoinPath("records").String() b := new(bytes.Buffer) if err := json.NewEncoder(b).Encode(changes); err != nil { - applyChangesErrorsGauge.Inc() + applyChangesErrorsGauge.Gauge.Inc() log.Debugf("Failed to encode changes: %s", err.Error()) return err } req, err := http.NewRequest("POST", u, b) if err != nil { - applyChangesErrorsGauge.Inc() + applyChangesErrorsGauge.Gauge.Inc() log.Debugf("Failed to create request: %s", err.Error()) return err } @@ -220,14 +221,14 @@ func (p WebhookProvider) ApplyChanges(ctx context.Context, changes *plan.Changes resp, err := p.client.Do(req) if err != nil { - 
applyChangesErrorsGauge.Inc() + applyChangesErrorsGauge.Gauge.Inc() log.Debugf("Failed to perform request: %s", err.Error()) return err } defer resp.Body.Close() if resp.StatusCode != http.StatusNoContent { - applyChangesErrorsGauge.Inc() + applyChangesErrorsGauge.Gauge.Inc() log.Debugf("Failed to apply changes with code %d", resp.StatusCode) err := fmt.Errorf("failed to apply changes with code %d", resp.StatusCode) if isRetryableError(resp.StatusCode) { @@ -242,25 +243,25 @@ func (p WebhookProvider) ApplyChanges(ctx context.Context, changes *plan.Changes // based on a provider specific requirement. // This method returns an empty slice in case there is a technical error on the provider's side so that no endpoints will be considered. func (p WebhookProvider) AdjustEndpoints(e []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) { - adjustEndpointsRequestsGauge.Inc() + adjustEndpointsRequestsGauge.Gauge.Inc() endpoints := []*endpoint.Endpoint{} u, err := url.JoinPath(p.remoteServerURL.String(), "adjustendpoints") if err != nil { - adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed to join path, %s", err) return nil, err } b := new(bytes.Buffer) if err := json.NewEncoder(b).Encode(e); err != nil { - adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed to encode endpoints, %s", err) return nil, err } req, err := http.NewRequest("POST", u, b) if err != nil { - adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed to create new HTTP request, %s", err) return nil, err } @@ -270,14 +271,14 @@ func (p WebhookProvider) AdjustEndpoints(e []*endpoint.Endpoint) ([]*endpoint.En resp, err := p.client.Do(req) if err != nil { - adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed executing http request, %s", err) return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - 
adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed to AdjustEndpoints with code %d", resp.StatusCode) err := fmt.Errorf("failed to AdjustEndpoints with code %d", resp.StatusCode) if isRetryableError(resp.StatusCode) { @@ -287,7 +288,7 @@ func (p WebhookProvider) AdjustEndpoints(e []*endpoint.Endpoint) ([]*endpoint.En } if err := json.NewDecoder(resp.Body).Decode(&endpoints); err != nil { - adjustEndpointsErrorsGauge.Inc() + adjustEndpointsErrorsGauge.Gauge.Inc() log.Debugf("Failed to decode response body: %s", err.Error()) return nil, err }