feat(metrics): consecutiveSoftErrors (#5502)

* feat(controller): add consecutive soft error metric and improve retry test

* docs: clean up metrics.md Go runtime metrics table, remove duplicates, ensure CI compliance

* style: gofmt

* style: gofmt
This commit is contained in:
Andrew Hay 2025-06-06 12:52:39 -04:00 committed by GitHub
parent 0b3e40579b
commit 93d4d47bff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 87 additions and 4 deletions

View File

@ -153,6 +153,14 @@ var (
Help: "Number of DNS AAAA-records that exists both in source and registry.",
},
)
consecutiveSoftErrors = metrics.NewGaugeWithOpts(
prometheus.GaugeOpts{
Namespace: "external_dns",
Subsystem: "controller",
Name: "consecutive_soft_errors",
Help: "Number of consecutive soft errors in reconciliation loop.",
},
)
)
func init() {
@ -171,6 +179,7 @@ func init() {
metrics.RegisterMetric.MustRegister(sourceAAAARecords)
metrics.RegisterMetric.MustRegister(verifiedARecords)
metrics.RegisterMetric.MustRegister(verifiedAAAARecords)
metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
}
// Controller is responsible for orchestrating the different components.
@ -356,14 +365,23 @@ func (c *Controller) ShouldRunOnce(now time.Time) bool {
func (c *Controller) Run(ctx context.Context) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
var softErrorCount int
for {
if c.ShouldRunOnce(time.Now()) {
if err := c.RunOnce(ctx); err != nil {
if errors.Is(err, provider.SoftError) {
log.Errorf("Failed to do run once: %v", err)
softErrorCount++
consecutiveSoftErrors.Gauge.Set(float64(softErrorCount))
log.Errorf("Failed to do run once: %v (consecutive soft errors: %d)", err, softErrorCount)
} else {
log.Fatalf("Failed to do run once: %v", err)
}
} else {
if softErrorCount > 0 {
log.Infof("Reconciliation succeeded after %d consecutive soft errors", softErrorCount)
}
softErrorCount = 0
consecutiveSoftErrors.Gauge.Set(0)
}
}
select {

View File

@ -22,6 +22,7 @@ import (
"math"
"reflect"
"sort"
"sync"
"testing"
"time"
@ -739,3 +740,68 @@ func TestAAAARecords(t *testing.T) {
assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge))
assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge))
}
// toggleRegistry is a test registry built on registry.NoopRegistry. It keeps a
// mutex-guarded count of how many times Records has reported a failure.
type toggleRegistry struct {
	registry.NoopRegistry

	// failCountMu protects failCount.
	failCountMu sync.Mutex
	// failCount is the number of failures handed out by Records so far.
	failCount int
}
// Records returns provider.SoftError on each of its first three calls,
// incrementing failCount every time. From the fourth call on it returns an
// empty, non-nil endpoint slice and no error. The context is not used.
func (r *toggleRegistry) Records(_ context.Context) ([]*endpoint.Endpoint, error) {
	r.failCountMu.Lock()
	defer r.failCountMu.Unlock()

	// Enough failures have been reported already: answer successfully.
	if r.failCount >= 3 {
		return []*endpoint.Endpoint{}, nil
	}

	r.failCount++
	return nil, provider.SoftError
}
// ApplyChanges ignores both arguments and always reports success.
func (r *toggleRegistry) ApplyChanges(_ context.Context, _ *plan.Changes) error {
	return nil
}
// TestToggleRegistry runs Controller.Run against a toggleRegistry, whose first
// three Records calls return provider.SoftError, and checks that the loop keeps
// running until all three failures have been handed out.
func TestToggleRegistry(t *testing.T) {
	const (
		// wantFailures is how many soft errors toggleRegistry reports before
		// it starts succeeding.
		wantFailures = 3
		// Run creates a one-second ticker, so three attempts appear to need
		// roughly two seconds. A two-second budget sits right on that boundary
		// and makes the outcome timing-dependent; leave generous headroom. The
		// loop below still exits as soon as the failures have been observed.
		waitBudget   = 10 * time.Second
		pollInterval = 10 * time.Millisecond
	)

	source := getTestSource()
	cfg := getTestConfig()
	r := &toggleRegistry{}

	ctrl := &Controller{
		Source:             source,
		Registry:           r,
		Policy:             &plan.SyncPolicy{},
		ManagedRecordTypes: cfg.ManagedDNSRecordTypes,
		Interval:           10 * time.Millisecond,
	}
	// Backdate nextRunAt; presumably this makes the first run due as soon as
	// Run starts.
	ctrl.nextRunAt = time.Now().Add(-time.Millisecond)

	ctx, cancel := context.WithCancel(context.Background())
	stopped := make(chan struct{})
	go func() {
		defer close(stopped)
		ctrl.Run(ctx)
	}()

	// failures reads the registry's failure counter under its mutex.
	failures := func() int {
		r.failCountMu.Lock()
		defer r.failCountMu.Unlock()
		return r.failCount
	}

	// Poll until the expected number of failures is seen or the budget runs out.
	deadline := time.Now().Add(waitBudget)
	for failures() < wantFailures && time.Now().Before(deadline) {
		time.Sleep(pollInterval)
	}

	// Stop the controller and wait for Run to return before the final read.
	cancel()
	<-stopped

	if got := failures(); got < wantFailures {
		t.Fatalf("failCount should be at least %d after waiting up to %s, got %d", wantFailures, waitBudget, got)
	}
}

View File

@ -20,6 +20,7 @@ curl https://localhost:7979/metrics
| Name | Metric Type | Subsystem | Help |
|:---------------------------------|:------------|:------------|:------------------------------------------------------|
| consecutive_soft_errors | Gauge | controller | Number of consecutive soft errors in reconciliation loop. |
| last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider |
| last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider |
| no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. |
@ -87,5 +88,3 @@ curl https://localhost:7979/metrics
| process_start_time_seconds |
| process_virtual_memory_bytes |
| process_virtual_memory_max_bytes |
| process_network_receive_bytes_total |
| process_network_transmit_bytes_total |

View File

@ -37,7 +37,7 @@ func TestComputeMetrics(t *testing.T) {
t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics))
}
assert.Len(t, reg.Metrics, 21)
assert.Len(t, reg.Metrics, 22)
}
func TestGenerateMarkdownTableRenderer(t *testing.T) {