mirror of
https://github.com/kubernetes-sigs/external-dns.git
synced 2025-08-06 09:36:58 +02:00
feat(metrics): consecutiveSoftErrors (#5502)
* feat(controller): add consecutive soft error metric and improve retry test * docs: clean up metrics.md Go runtime metrics table, remove duplicates, ensure CI compliance * style: gofmt * style: gofmt
This commit is contained in:
parent
0b3e40579b
commit
93d4d47bff
@ -153,6 +153,14 @@ var (
|
||||
Help: "Number of DNS AAAA-records that exists both in source and registry.",
|
||||
},
|
||||
)
|
||||
consecutiveSoftErrors = metrics.NewGaugeWithOpts(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "external_dns",
|
||||
Subsystem: "controller",
|
||||
Name: "consecutive_soft_errors",
|
||||
Help: "Number of consecutive soft errors in reconciliation loop.",
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
@ -171,6 +179,7 @@ func init() {
|
||||
metrics.RegisterMetric.MustRegister(sourceAAAARecords)
|
||||
metrics.RegisterMetric.MustRegister(verifiedARecords)
|
||||
metrics.RegisterMetric.MustRegister(verifiedAAAARecords)
|
||||
metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
|
||||
}
|
||||
|
||||
// Controller is responsible for orchestrating the different components.
|
||||
@ -356,14 +365,23 @@ func (c *Controller) ShouldRunOnce(now time.Time) bool {
|
||||
func (c *Controller) Run(ctx context.Context) {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
var softErrorCount int
|
||||
for {
|
||||
if c.ShouldRunOnce(time.Now()) {
|
||||
if err := c.RunOnce(ctx); err != nil {
|
||||
if errors.Is(err, provider.SoftError) {
|
||||
log.Errorf("Failed to do run once: %v", err)
|
||||
softErrorCount++
|
||||
consecutiveSoftErrors.Gauge.Set(float64(softErrorCount))
|
||||
log.Errorf("Failed to do run once: %v (consecutive soft errors: %d)", err, softErrorCount)
|
||||
} else {
|
||||
log.Fatalf("Failed to do run once: %v", err)
|
||||
}
|
||||
} else {
|
||||
if softErrorCount > 0 {
|
||||
log.Infof("Reconciliation succeeded after %d consecutive soft errors", softErrorCount)
|
||||
}
|
||||
softErrorCount = 0
|
||||
consecutiveSoftErrors.Gauge.Set(0)
|
||||
}
|
||||
}
|
||||
select {
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -739,3 +740,68 @@ func TestAAAARecords(t *testing.T) {
|
||||
assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge))
|
||||
assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge))
|
||||
}
|
||||
|
||||
type toggleRegistry struct {
|
||||
registry.NoopRegistry
|
||||
failCount int
|
||||
failCountMu sync.Mutex // protects failCount
|
||||
}
|
||||
|
||||
func (r *toggleRegistry) Records(ctx context.Context) ([]*endpoint.Endpoint, error) {
|
||||
r.failCountMu.Lock()
|
||||
defer r.failCountMu.Unlock()
|
||||
if r.failCount < 3 {
|
||||
r.failCount++
|
||||
return nil, provider.SoftError
|
||||
}
|
||||
return []*endpoint.Endpoint{}, nil
|
||||
}
|
||||
|
||||
func (r *toggleRegistry) ApplyChanges(ctx context.Context, changes *plan.Changes) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func TestToggleRegistry(t *testing.T) {
|
||||
source := getTestSource()
|
||||
cfg := getTestConfig()
|
||||
r := &toggleRegistry{}
|
||||
|
||||
ctrl := &Controller{
|
||||
Source: source,
|
||||
Registry: r,
|
||||
Policy: &plan.SyncPolicy{},
|
||||
ManagedRecordTypes: cfg.ManagedDNSRecordTypes,
|
||||
Interval: 10 * time.Millisecond,
|
||||
}
|
||||
ctrl.nextRunAt = time.Now().Add(-time.Millisecond)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
stopped := make(chan struct{})
|
||||
go func() {
|
||||
ctrl.Run(ctx)
|
||||
close(stopped)
|
||||
}()
|
||||
|
||||
// Wait up to 2 seconds for failCount to reach at least 3
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for {
|
||||
r.failCountMu.Lock()
|
||||
count := r.failCount
|
||||
r.failCountMu.Unlock()
|
||||
if count >= 3 {
|
||||
break
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-stopped
|
||||
|
||||
r.failCountMu.Lock()
|
||||
finalCount := r.failCount
|
||||
r.failCountMu.Unlock()
|
||||
if finalCount < 3 {
|
||||
t.Fatalf("failCount should be at least 3 after waiting up to 2s, got %d", finalCount)
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ curl https://localhost:7979/metrics
|
||||
|
||||
| Name | Metric Type | Subsystem | Help |
|
||||
|:---------------------------------|:------------|:------------|:------------------------------------------------------|
|
||||
| consecutive_soft_errors | Gauge | controller | Number of consecutive soft errors in reconciliation loop. |
|
||||
| last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider |
|
||||
| last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider |
|
||||
| no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. |
|
||||
@ -87,5 +88,3 @@ curl https://localhost:7979/metrics
|
||||
| process_start_time_seconds |
|
||||
| process_virtual_memory_bytes |
|
||||
| process_virtual_memory_max_bytes |
|
||||
| process_network_receive_bytes_total |
|
||||
| process_network_transmit_bytes_total |
|
||||
|
@ -37,7 +37,7 @@ func TestComputeMetrics(t *testing.T) {
|
||||
t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics))
|
||||
}
|
||||
|
||||
assert.Len(t, reg.Metrics, 21)
|
||||
assert.Len(t, reg.Metrics, 22)
|
||||
}
|
||||
|
||||
func TestGenerateMarkdownTableRenderer(t *testing.T) {
|
||||
|
Loading…
Reference in New Issue
Block a user