perf(controller): optimize metrics counting (#6124)

* perf(controller): optimize metrics counting

Signed-off-by: ivan katliarchuk <ivan.katliarchuk@gmail.com>

* perf(controller): optimize metrics counting

Signed-off-by: ivan katliarchuk <ivan.katliarchuk@gmail.com>

* perf(controller): optimize metrics counting

Signed-off-by: ivan katliarchuk <ivan.katliarchuk@gmail.com>

* perf(controller): optimize metrics counting

Signed-off-by: ivan katliarchuk <ivan.katliarchuk@gmail.com>

---------

Signed-off-by: ivan katliarchuk <ivan.katliarchuk@gmail.com>
This commit is contained in:
Ivan Ka 2026-01-20 17:21:25 +00:00 committed by GitHub
parent 0eadcb0ef9
commit 5799aefc73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 247 additions and 249 deletions

View File

@ -97,7 +97,7 @@ linters:
path: istio_virtualservice.go|fqdn.go|cloudflare.go
- linters:
- gochecknoinits
path: ^(.*/controller\.go|internal/.*/init\.go|pkg/.*/metrics\.go|.*/webhook\.go|.*/http\.go|apis/.*\.go|.*/cached_provider\.go)$
path: ^(internal/.*/init\.go|.*/metrics\.go|.*/webhook\.go|.*/http\.go|apis/.*\.go|.*/cached_provider\.go)$
- linters:
- modernize
path: ^(apis/.*\.go)$

View File

@ -23,137 +23,16 @@ import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"sigs.k8s.io/external-dns/endpoint"
"sigs.k8s.io/external-dns/pkg/events"
"sigs.k8s.io/external-dns/pkg/metrics"
"sigs.k8s.io/external-dns/plan"
"sigs.k8s.io/external-dns/provider"
"sigs.k8s.io/external-dns/registry"
"sigs.k8s.io/external-dns/source"
)
var (
registryErrorsTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "registry",
Name: "errors_total",
Help: "Number of Registry errors.",
},
)
sourceErrorsTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "source",
Name: "errors_total",
Help: "Number of Source errors.",
},
)
sourceEndpointsTotal = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "source",
Name: "endpoints_total",
Help: "Number of Endpoints in all sources",
},
)
registryEndpointsTotal = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "registry",
Name: "endpoints_total",
Help: "Number of Endpoints in the registry",
},
)
lastSyncTimestamp = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "last_sync_timestamp_seconds",
Help: "Timestamp of last successful sync with the DNS provider",
},
)
lastReconcileTimestamp = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "last_reconcile_timestamp_seconds",
Help: "Timestamp of last attempted sync with the DNS provider",
},
)
controllerNoChangesTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "controller",
Name: "no_op_runs_total",
Help: "Number of reconcile loops ending up with no changes on the DNS provider side.",
},
)
deprecatedRegistryErrors = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "registry",
Name: "errors_total",
Help: "Number of Registry errors.",
},
)
deprecatedSourceErrors = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "source",
Name: "errors_total",
Help: "Number of Source errors.",
},
)
registryRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "registry",
Name: "records",
Help: "Number of registry records partitioned by label name (vector).",
},
[]string{"record_type"},
)
sourceRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "source",
Name: "records",
Help: "Number of source records partitioned by label name (vector).",
},
[]string{"record_type"},
)
verifiedRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "verified_records",
Help: "Number of DNS records that exists both in source and registry (vector).",
},
[]string{"record_type"},
)
consecutiveSoftErrors = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "consecutive_soft_errors",
Help: "Number of consecutive soft errors in reconciliation loop.",
},
)
)
func init() {
metrics.RegisterMetric.MustRegister(registryErrorsTotal)
metrics.RegisterMetric.MustRegister(sourceErrorsTotal)
metrics.RegisterMetric.MustRegister(sourceEndpointsTotal)
metrics.RegisterMetric.MustRegister(registryEndpointsTotal)
metrics.RegisterMetric.MustRegister(lastSyncTimestamp)
metrics.RegisterMetric.MustRegister(lastReconcileTimestamp)
metrics.RegisterMetric.MustRegister(deprecatedRegistryErrors)
metrics.RegisterMetric.MustRegister(deprecatedSourceErrors)
metrics.RegisterMetric.MustRegister(controllerNoChangesTotal)
metrics.RegisterMetric.MustRegister(registryRecords)
metrics.RegisterMetric.MustRegister(sourceRecords)
metrics.RegisterMetric.MustRegister(verifiedRecords)
metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
}
// Controller is responsible for orchestrating the different components.
// It works in the following way:
// * Ask the DNS provider for the current list of endpoints.
@ -194,8 +73,6 @@ func (c *Controller) RunOnce(ctx context.Context) error {
c.lastRunAt = time.Now()
c.runAtMutex.Unlock()
regMetrics := newMetricsRecorder()
regRecords, err := c.Registry.Records(ctx)
if err != nil {
registryErrorsTotal.Counter.Inc()
@ -205,7 +82,7 @@ func (c *Controller) RunOnce(ctx context.Context) error {
registryEndpointsTotal.Gauge.Set(float64(len(regRecords)))
countAddressRecords(regMetrics, regRecords, registryRecords)
countAddressRecords(regRecords, registryRecords)
ctx = context.WithValue(ctx, provider.RecordsContextKey, regRecords)
@ -218,11 +95,8 @@ func (c *Controller) RunOnce(ctx context.Context) error {
sourceEndpointsTotal.Gauge.Set(float64(len(sourceEndpoints)))
sourceMetrics := newMetricsRecorder()
countAddressRecords(sourceMetrics, sourceEndpoints, sourceRecords)
vaMetrics := newMetricsRecorder()
countMatchingAddressRecords(vaMetrics, sourceEndpoints, regRecords, verifiedRecords)
countAddressRecords(sourceEndpoints, sourceRecords)
countMatchingAddressRecords(sourceEndpoints, regRecords, verifiedRecords)
endpoints, err := c.Registry.AdjustEndpoints(sourceEndpoints)
if err != nil {
@ -280,43 +154,6 @@ func latest(r time.Time, times ...time.Time) time.Time {
return r
}
// Counts the intersections of records in endpoint and registry.
func countMatchingAddressRecords(rec *metricsRecorder, endpoints []*endpoint.Endpoint, registryRecords []*endpoint.Endpoint, metric metrics.GaugeVecMetric) {
recordsMap := make(map[string]map[string]struct{})
for _, regRecord := range registryRecords {
if _, found := recordsMap[regRecord.DNSName]; !found {
recordsMap[regRecord.DNSName] = make(map[string]struct{})
}
recordsMap[regRecord.DNSName][regRecord.RecordType] = struct{}{}
}
for _, sourceRecord := range endpoints {
if _, found := recordsMap[sourceRecord.DNSName]; found {
if _, ok := recordsMap[sourceRecord.DNSName][sourceRecord.RecordType]; ok {
rec.recordEndpointType(sourceRecord.RecordType)
}
}
}
for _, rt := range endpoint.KnownRecordTypes {
metric.SetWithLabels(rec.loadFloat64(rt), rt)
}
}
// countAddressRecords updates the metricsRecorder with the count of each record type
// found in the provided endpoints slice, and sets the corresponding metrics for each
// known DNS record type using the sourceRecords metric.
func countAddressRecords(rec *metricsRecorder, endpoints []*endpoint.Endpoint, metric metrics.GaugeVecMetric) {
// compute the number of records per type
for _, endPoint := range endpoints {
rec.recordEndpointType(endPoint.RecordType)
}
// set metrics for each record type
for _, rt := range endpoint.KnownRecordTypes {
metric.SetWithLabels(rec.loadFloat64(rt), rt)
}
}
// ScheduleRunOnce makes sure execution happens at most once per interval.
func (c *Controller) ScheduleRunOnce(now time.Time) {
c.runAtMutex.Lock()

View File

@ -16,39 +16,168 @@ limitations under the License.
package controller
import "sigs.k8s.io/external-dns/endpoint"
import (
"github.com/prometheus/client_golang/prometheus"
type metricsRecorder struct {
counterPerEndpointType map[string]int
}
"sigs.k8s.io/external-dns/endpoint"
"sigs.k8s.io/external-dns/pkg/metrics"
)
func newMetricsRecorder() *metricsRecorder {
return &metricsRecorder{
counterPerEndpointType: map[string]int{
endpoint.RecordTypeA: 0,
endpoint.RecordTypeAAAA: 0,
endpoint.RecordTypeCNAME: 0,
endpoint.RecordTypeTXT: 0,
endpoint.RecordTypeSRV: 0,
endpoint.RecordTypeNS: 0,
endpoint.RecordTypePTR: 0,
endpoint.RecordTypeMX: 0,
endpoint.RecordTypeNAPTR: 0,
var (
registryErrorsTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "registry",
Name: "errors_total",
Help: "Number of Registry errors.",
},
)
sourceErrorsTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "source",
Name: "errors_total",
Help: "Number of Source errors.",
},
)
sourceEndpointsTotal = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "source",
Name: "endpoints_total",
Help: "Number of Endpoints in all sources",
},
)
registryEndpointsTotal = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "registry",
Name: "endpoints_total",
Help: "Number of Endpoints in the registry",
},
)
lastSyncTimestamp = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "last_sync_timestamp_seconds",
Help: "Timestamp of last successful sync with the DNS provider",
},
)
lastReconcileTimestamp = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "last_reconcile_timestamp_seconds",
Help: "Timestamp of last attempted sync with the DNS provider",
},
)
controllerNoChangesTotal = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "controller",
Name: "no_op_runs_total",
Help: "Number of reconcile loops ending up with no changes on the DNS provider side.",
},
)
deprecatedRegistryErrors = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "registry",
Name: "errors_total",
Help: "Number of Registry errors.",
},
)
deprecatedSourceErrors = metrics.NewCounterWithOpts(
prometheus.CounterOpts{
Subsystem: "source",
Name: "errors_total",
Help: "Number of Source errors.",
},
)
registryRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "registry",
Name: "records",
Help: "Number of registry records partitioned by label name (vector).",
},
[]string{"record_type"},
)
sourceRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "source",
Name: "records",
Help: "Number of source records partitioned by label name (vector).",
},
[]string{"record_type"},
)
verifiedRecords = metrics.NewGaugedVectorOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "verified_records",
Help: "Number of DNS records that exists both in source and registry (vector).",
},
[]string{"record_type"},
)
consecutiveSoftErrors = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Subsystem: "controller",
Name: "consecutive_soft_errors",
Help: "Number of consecutive soft errors in reconciliation loop.",
},
)
)
func init() {
metrics.RegisterMetric.MustRegister(registryErrorsTotal)
metrics.RegisterMetric.MustRegister(sourceErrorsTotal)
metrics.RegisterMetric.MustRegister(sourceEndpointsTotal)
metrics.RegisterMetric.MustRegister(registryEndpointsTotal)
metrics.RegisterMetric.MustRegister(lastSyncTimestamp)
metrics.RegisterMetric.MustRegister(lastReconcileTimestamp)
metrics.RegisterMetric.MustRegister(deprecatedRegistryErrors)
metrics.RegisterMetric.MustRegister(deprecatedSourceErrors)
metrics.RegisterMetric.MustRegister(controllerNoChangesTotal)
metrics.RegisterMetric.MustRegister(registryRecords)
metrics.RegisterMetric.MustRegister(sourceRecords)
metrics.RegisterMetric.MustRegister(verifiedRecords)
metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
}
type dnsKey struct {
name string
recordType string
}
// countMatchingAddressRecords counts records that exist in both endpoints and registry.
func countMatchingAddressRecords(endpoints []*endpoint.Endpoint, registryRecords []*endpoint.Endpoint, metric metrics.GaugeVecMetric) {
metric.Gauge.Reset()
registry := make(map[dnsKey]struct{}, len(registryRecords))
for _, r := range registryRecords {
registry[dnsKey{r.DNSName, r.RecordType}] = struct{}{}
}
counts := make(map[string]float64)
for _, ep := range endpoints {
if _, found := registry[dnsKey{ep.DNSName, ep.RecordType}]; found {
counts[ep.RecordType]++
}
}
for recordType, count := range counts {
metric.AddWithLabels(count, recordType)
}
}
func (m *metricsRecorder) recordEndpointType(endpointType string) {
m.counterPerEndpointType[endpointType]++
}
// countAddressRecords counts each record type in the provided endpoints slice.
func countAddressRecords(endpoints []*endpoint.Endpoint, metric metrics.GaugeVecMetric) {
metric.Gauge.Reset()
func (m *metricsRecorder) getEndpointTypeCount(endpointType string) int {
if count, ok := m.counterPerEndpointType[endpointType]; ok {
return count
counts := make(map[string]float64)
for _, ep := range endpoints {
counts[ep.RecordType]++
}
return 0
}
func (m *metricsRecorder) loadFloat64(endpointType string) float64 {
return float64(m.getEndpointTypeCount(endpointType))
for recordType, count := range counts {
metric.AddWithLabels(count, recordType)
}
}

View File

@ -20,7 +20,6 @@ import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"sigs.k8s.io/external-dns/endpoint"
"sigs.k8s.io/external-dns/internal/testutils"
"sigs.k8s.io/external-dns/pkg/apis/externaldns"
@ -28,41 +27,6 @@ import (
"sigs.k8s.io/external-dns/registry"
)
func TestRecordKnownEndpointType(t *testing.T) {
mr := newMetricsRecorder()
// Recording a built-in type should start at 1 and increment
mr.recordEndpointType(endpoint.RecordTypeA)
assert.Equal(t, 1, mr.getEndpointTypeCount(endpoint.RecordTypeA))
mr.recordEndpointType(endpoint.RecordTypeA)
assert.Equal(t, 2, mr.getEndpointTypeCount(endpoint.RecordTypeA))
}
func TestRecordUnknownEndpointType(t *testing.T) {
mr := newMetricsRecorder()
const customType = "CUSTOM"
// Unknown types start at zero
assert.Equal(t, 0, mr.getEndpointTypeCount(customType))
// First record sets to 1
mr.recordEndpointType(customType)
assert.Equal(t, 1, mr.getEndpointTypeCount(customType))
// Subsequent records increment
mr.recordEndpointType(customType)
assert.Equal(t, 2, mr.getEndpointTypeCount(customType))
}
func TestLoadFloat64(t *testing.T) {
mr := newMetricsRecorder()
// loadFloat64 should return the float64 representation of the count
mr.recordEndpointType(endpoint.RecordTypeAAAA)
assert.InDelta(t, float64(1), mr.loadFloat64(endpoint.RecordTypeAAAA), 0.0001)
}
func TestVerifyARecords(t *testing.T) {
testControllerFiltersDomains(
t,
@ -322,6 +286,26 @@ func TestAAAARecords(t *testing.T) {
}
func TestGaugeMetricsWithMixedRecords(t *testing.T) {
ctrl := newMixedRecordsFixture()
assert.NoError(t, ctrl.RunOnce(t.Context()))
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 534, sourceRecords.Gauge, map[string]string{"record_type": "a"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 324, sourceRecords.Gauge, map[string]string{"record_type": "aaaa"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 2, sourceRecords.Gauge, map[string]string{"record_type": "cname"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 11, sourceRecords.Gauge, map[string]string{"record_type": "srv"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 5334, registryRecords.Gauge, map[string]string{"record_type": "a"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 324, registryRecords.Gauge, map[string]string{"record_type": "aaaa"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 0, registryRecords.Gauge, map[string]string{"record_type": "mx"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 43, registryRecords.Gauge, map[string]string{"record_type": "ptr"})
}
type mixedRecordsFixture struct {
ctrl *Controller
}
func newMixedRecordsFixture() *Controller {
configuredEndpoints := testutils.GenerateTestEndpointsByType(map[string]int{
endpoint.RecordTypeA: 534,
endpoint.RecordTypeAAAA: 324,
@ -351,27 +335,23 @@ func TestGaugeMetricsWithMixedRecords(t *testing.T) {
provider := &filteredMockProvider{
RecordsStore: providerEndpoints,
}
r, err := registry.SelectRegistry(cfg, provider)
r, _ := registry.SelectRegistry(cfg, provider)
require.NoError(t, err)
ctrl := &Controller{
return &Controller{
Source: source,
Registry: r,
Policy: &plan.SyncPolicy{},
DomainFilter: endpoint.NewDomainFilter([]string{}),
ManagedRecordTypes: cfg.ManagedDNSRecordTypes,
}
assert.NoError(t, ctrl.RunOnce(t.Context()))
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 534, sourceRecords.Gauge, map[string]string{"record_type": "a"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 324, sourceRecords.Gauge, map[string]string{"record_type": "aaaa"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 2, sourceRecords.Gauge, map[string]string{"record_type": "cname"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 11, sourceRecords.Gauge, map[string]string{"record_type": "srv"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 5334, registryRecords.Gauge, map[string]string{"record_type": "a"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 324, registryRecords.Gauge, map[string]string{"record_type": "aaaa"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 0, registryRecords.Gauge, map[string]string{"record_type": "mx"})
testutils.TestHelperVerifyMetricsGaugeVectorWithLabels(t, 43, registryRecords.Gauge, map[string]string{"record_type": "ptr"})
}
func BenchmarkGaugeMetricsWithMixedRecords(b *testing.B) {
ctrl := newMixedRecordsFixture()
for b.Loop() {
if err := ctrl.RunOnce(b.Context()); err != nil {
b.Fatal(err)
}
}
}

View File

@ -81,10 +81,16 @@ func (g GaugeVecMetric) Get() *Metric {
// SetWithLabels sets the value of the Gauge metric for the specified label values.
// All label values are converted to lowercase before being applied.
func (g GaugeVecMetric) SetWithLabels(value float64, lvs ...string) {
for i, v := range lvs {
lvs[i] = strings.ToLower(v)
}
g.Gauge.WithLabelValues(lvs...).Set(value)
g.Gauge.WithLabelValues(toLower(lvs)...).Set(value)
}
// AddWithLabels adds the value to the Gauge metric for the specified label values.
// All label values are converted to lowercase before being applied.
//
// Without Reset(), values accumulate and reset only on process restart.
// Use Reset() + AddWithLabels() pattern for per-cycle counts.
func (g GaugeVecMetric) AddWithLabels(value float64, lvs ...string) {
g.Gauge.WithLabelValues(toLower(lvs)...).Add(value)
}
func NewGaugeWithOpts(opts prometheus.GaugeOpts) GaugeMetric {
@ -209,3 +215,13 @@ func PathProcessor(path string) string {
parts := strings.Split(path, "/")
return parts[len(parts)-1]
}
// toLower converts all label values to lowercase.
// The Prometheus maintainers have intentionally avoided magic transformations to keep label handling explicit and predictable.
// We expect consistent casing, normalizing at ingestion is the standard practice.
func toLower(lvs []string) []string {
for i := range lvs {
lvs[i] = strings.ToLower(lvs[i])
}
return lvs
}

View File

@ -241,3 +241,39 @@ func TestPathProcessor(t *testing.T) {
})
}
}
func TestGaugeV_AddWithLabels(t *testing.T) {
opts := prometheus.GaugeOpts{
Name: "test_gauge_add",
Namespace: "test_ns",
Subsystem: "test_sub",
Help: "help text",
}
gv := NewGaugedVectorOpts(opts, []string{"label1", "label2"})
// Add with mixed case labels - should be lowercased
gv.AddWithLabels(1.0, "Alpha", "BETA")
g, err := gv.Gauge.GetMetricWithLabelValues("alpha", "beta")
assert.NoError(t, err)
var m dto.Metric
err = g.Write(&m)
assert.NoError(t, err)
assert.NotNil(t, m.Gauge)
assert.InDelta(t, 1.0, *m.Gauge.Value, 0.01)
// Add again - should increment, not override
gv.AddWithLabels(2.0, "ALPHA", "beta")
err = g.Write(&m)
assert.NoError(t, err)
assert.InDelta(t, 3.0, *m.Gauge.Value, 0.01) // 1.0 + 2.0 = 3.0
// Add one more time
gv.AddWithLabels(0.5, "alpha", "Beta")
err = g.Write(&m)
assert.NoError(t, err)
assert.InDelta(t, 3.5, *m.Gauge.Value, 0.01) // 3.0 + 0.5 = 3.5
assert.Len(t, m.Label, 2)
}