diff --git a/controller/controller.go b/controller/controller.go
index d98fb8104..5ee193fa2 100644
--- a/controller/controller.go
+++ b/controller/controller.go
@@ -153,6 +153,14 @@ var (
 			Help:      "Number of DNS AAAA-records that exists both in source and registry.",
 		},
 	)
+	consecutiveSoftErrors = metrics.NewGaugeWithOpts(
+		prometheus.GaugeOpts{
+			Namespace: "external_dns",
+			Subsystem: "controller",
+			Name:      "consecutive_soft_errors",
+			Help:      "Number of consecutive soft errors in reconciliation loop.",
+		},
+	)
 )
 
 func init() {
@@ -171,6 +179,7 @@ func init() {
 	metrics.RegisterMetric.MustRegister(sourceAAAARecords)
 	metrics.RegisterMetric.MustRegister(verifiedARecords)
 	metrics.RegisterMetric.MustRegister(verifiedAAAARecords)
+	metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
 }
 
 // Controller is responsible for orchestrating the different components.
@@ -356,14 +365,25 @@ func (c *Controller) ShouldRunOnce(now time.Time) bool {
 func (c *Controller) Run(ctx context.Context) {
 	ticker := time.NewTicker(time.Second)
 	defer ticker.Stop()
+	// softErrorCount counts RunOnce calls in a row that failed with provider.SoftError.
+	var softErrorCount int
 	for {
 		if c.ShouldRunOnce(time.Now()) {
 			if err := c.RunOnce(ctx); err != nil {
 				if errors.Is(err, provider.SoftError) {
-					log.Errorf("Failed to do run once: %v", err)
+					softErrorCount++
+					consecutiveSoftErrors.Gauge.Set(float64(softErrorCount))
+					log.Errorf("Failed to do run once: %v (consecutive soft errors: %d)", err, softErrorCount)
 				} else {
 					log.Fatalf("Failed to do run once: %v", err)
 				}
+			} else {
+				if softErrorCount > 0 {
+					log.Infof("Reconciliation succeeded after %d consecutive soft errors", softErrorCount)
+				}
+				// A successful run ends the streak: clear the counter and the gauge.
+				softErrorCount = 0
+				consecutiveSoftErrors.Gauge.Set(0)
 			}
 		}
 		select {
diff --git a/controller/controller_test.go b/controller/controller_test.go
index a5fca2c51..e1f8a2b5a 100644
--- a/controller/controller_test.go
+++ b/controller/controller_test.go
@@ -22,6 +22,7 @@ import (
 	"math"
 	"reflect"
 	"sort"
+	"sync"
 	"testing"
 	"time"
 
@@ -739,3 +740,77 @@ func TestAAAARecords(t *testing.T) {
 	assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge))
 	assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge))
 }
+
+// toggleRegistry is a registry stub whose Records method returns
+// provider.SoftError for its first three calls and succeeds afterwards.
+type toggleRegistry struct {
+	registry.NoopRegistry
+	failCount   int
+	failCountMu sync.Mutex // protects failCount
+}
+
+// Records fails with provider.SoftError until failCount reaches 3, then
+// returns an empty endpoint list.
+func (r *toggleRegistry) Records(ctx context.Context) ([]*endpoint.Endpoint, error) {
+	r.failCountMu.Lock()
+	defer r.failCountMu.Unlock()
+	if r.failCount < 3 {
+		r.failCount++
+		return nil, provider.SoftError
+	}
+	return []*endpoint.Endpoint{}, nil
+}
+
+// ApplyChanges accepts any change set and reports success.
+func (r *toggleRegistry) ApplyChanges(ctx context.Context, changes *plan.Changes) error {
+	return nil
+}
+
+// TestToggleRegistry runs the controller loop against toggleRegistry and
+// checks that Records ends up being called at least three times.
+func TestToggleRegistry(t *testing.T) {
+	source := getTestSource()
+	cfg := getTestConfig()
+	r := &toggleRegistry{}
+
+	ctrl := &Controller{
+		Source:             source,
+		Registry:           r,
+		Policy:             &plan.SyncPolicy{},
+		ManagedRecordTypes: cfg.ManagedDNSRecordTypes,
+		Interval:           10 * time.Millisecond,
+	}
+	ctrl.nextRunAt = time.Now().Add(-time.Millisecond)
+	ctx, cancel := context.WithCancel(context.Background())
+	stopped := make(chan struct{})
+	go func() {
+		ctrl.Run(ctx)
+		close(stopped)
+	}()
+
+	// Run creates a one-second ticker, so three attempts are expected to take
+	// about two seconds. Wait up to 10 seconds so a slow or loaded machine does
+	// not fail the test; the loop below exits as soon as failCount reaches 3.
+	deadline := time.Now().Add(10 * time.Second)
+	for {
+		r.failCountMu.Lock()
+		count := r.failCount
+		r.failCountMu.Unlock()
+		if count >= 3 {
+			break
+		}
+		if time.Now().After(deadline) {
+			break
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	cancel()
+	<-stopped
+
+	r.failCountMu.Lock()
+	finalCount := r.failCount
+	r.failCountMu.Unlock()
+	if finalCount < 3 {
+		t.Fatalf("failCount should be at least 3 after waiting up to 10s, got %d", finalCount)
+	}
+}
diff --git a/docs/monitoring/metrics.md b/docs/monitoring/metrics.md
index 8d73b202a..aa38ae9ee 100644
--- a/docs/monitoring/metrics.md
+++ b/docs/monitoring/metrics.md
@@ -20,6 +20,7 @@ curl https://localhost:7979/metrics
 
 | Name | Metric Type | Subsystem | Help |
 |:---------------------------------|:------------|:------------|:------------------------------------------------------|
+| consecutive_soft_errors | Gauge | controller | Number of consecutive soft errors in reconciliation loop. |
 | last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider |
 | last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider |
 | no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. |
@@ -87,5 +88,3 @@ curl https://localhost:7979/metrics
 | process_start_time_seconds |
 | process_virtual_memory_bytes |
 | process_virtual_memory_max_bytes |
-| process_network_receive_bytes_total |
-| process_network_transmit_bytes_total |
diff --git a/internal/gen/docs/metrics/main_test.go b/internal/gen/docs/metrics/main_test.go
index dd9884e95..6f23c4344 100644
--- a/internal/gen/docs/metrics/main_test.go
+++ b/internal/gen/docs/metrics/main_test.go
@@ -37,7 +37,7 @@ func TestComputeMetrics(t *testing.T) {
 		t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics))
 	}
 
-	assert.Len(t, reg.Metrics, 21)
+	assert.Len(t, reg.Metrics, 22)
 }
 
 func TestGenerateMarkdownTableRenderer(t *testing.T) {