mirror of
https://github.com/kubernetes-sigs/external-dns.git
synced 2025-08-06 01:26:59 +02:00
feat(metrics): consecutiveSoftErrors (#5502)
* feat(controller): add consecutive soft error metric and improve retry test
* docs: clean up metrics.md Go runtime metrics table, remove duplicates, ensure CI compliance
* style: gofmt
* style: gofmt
This commit is contained in:
parent
0b3e40579b
commit
93d4d47bff
@ -153,6 +153,14 @@ var (
|
|||||||
Help: "Number of DNS AAAA-records that exists both in source and registry.",
|
Help: "Number of DNS AAAA-records that exists both in source and registry.",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
consecutiveSoftErrors = metrics.NewGaugeWithOpts(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: "external_dns",
|
||||||
|
Subsystem: "controller",
|
||||||
|
Name: "consecutive_soft_errors",
|
||||||
|
Help: "Number of consecutive soft errors in reconciliation loop.",
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@ -171,6 +179,7 @@ func init() {
|
|||||||
metrics.RegisterMetric.MustRegister(sourceAAAARecords)
|
metrics.RegisterMetric.MustRegister(sourceAAAARecords)
|
||||||
metrics.RegisterMetric.MustRegister(verifiedARecords)
|
metrics.RegisterMetric.MustRegister(verifiedARecords)
|
||||||
metrics.RegisterMetric.MustRegister(verifiedAAAARecords)
|
metrics.RegisterMetric.MustRegister(verifiedAAAARecords)
|
||||||
|
metrics.RegisterMetric.MustRegister(consecutiveSoftErrors)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Controller is responsible for orchestrating the different components.
|
// Controller is responsible for orchestrating the different components.
|
||||||
@ -356,14 +365,23 @@ func (c *Controller) ShouldRunOnce(now time.Time) bool {
|
|||||||
func (c *Controller) Run(ctx context.Context) {
|
func (c *Controller) Run(ctx context.Context) {
|
||||||
ticker := time.NewTicker(time.Second)
|
ticker := time.NewTicker(time.Second)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
var softErrorCount int
|
||||||
for {
|
for {
|
||||||
if c.ShouldRunOnce(time.Now()) {
|
if c.ShouldRunOnce(time.Now()) {
|
||||||
if err := c.RunOnce(ctx); err != nil {
|
if err := c.RunOnce(ctx); err != nil {
|
||||||
if errors.Is(err, provider.SoftError) {
|
if errors.Is(err, provider.SoftError) {
|
||||||
log.Errorf("Failed to do run once: %v", err)
|
softErrorCount++
|
||||||
|
consecutiveSoftErrors.Gauge.Set(float64(softErrorCount))
|
||||||
|
log.Errorf("Failed to do run once: %v (consecutive soft errors: %d)", err, softErrorCount)
|
||||||
} else {
|
} else {
|
||||||
log.Fatalf("Failed to do run once: %v", err)
|
log.Fatalf("Failed to do run once: %v", err)
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if softErrorCount > 0 {
|
||||||
|
log.Infof("Reconciliation succeeded after %d consecutive soft errors", softErrorCount)
|
||||||
|
}
|
||||||
|
softErrorCount = 0
|
||||||
|
consecutiveSoftErrors.Gauge.Set(0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
select {
|
select {
|
||||||
|
@ -22,6 +22,7 @@ import (
|
|||||||
"math"
|
"math"
|
||||||
"reflect"
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -739,3 +740,68 @@ func TestAAAARecords(t *testing.T) {
|
|||||||
assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge))
|
assert.Equal(t, math.Float64bits(2), valueFromMetric(sourceAAAARecords.Gauge))
|
||||||
assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge))
|
assert.Equal(t, math.Float64bits(1), valueFromMetric(registryAAAARecords.Gauge))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type toggleRegistry struct {
|
||||||
|
registry.NoopRegistry
|
||||||
|
failCount int
|
||||||
|
failCountMu sync.Mutex // protects failCount
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *toggleRegistry) Records(ctx context.Context) ([]*endpoint.Endpoint, error) {
|
||||||
|
r.failCountMu.Lock()
|
||||||
|
defer r.failCountMu.Unlock()
|
||||||
|
if r.failCount < 3 {
|
||||||
|
r.failCount++
|
||||||
|
return nil, provider.SoftError
|
||||||
|
}
|
||||||
|
return []*endpoint.Endpoint{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *toggleRegistry) ApplyChanges(ctx context.Context, changes *plan.Changes) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToggleRegistry(t *testing.T) {
|
||||||
|
source := getTestSource()
|
||||||
|
cfg := getTestConfig()
|
||||||
|
r := &toggleRegistry{}
|
||||||
|
|
||||||
|
ctrl := &Controller{
|
||||||
|
Source: source,
|
||||||
|
Registry: r,
|
||||||
|
Policy: &plan.SyncPolicy{},
|
||||||
|
ManagedRecordTypes: cfg.ManagedDNSRecordTypes,
|
||||||
|
Interval: 10 * time.Millisecond,
|
||||||
|
}
|
||||||
|
ctrl.nextRunAt = time.Now().Add(-time.Millisecond)
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
stopped := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
ctrl.Run(ctx)
|
||||||
|
close(stopped)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Wait up to 2 seconds for failCount to reach at least 3
|
||||||
|
deadline := time.Now().Add(2 * time.Second)
|
||||||
|
for {
|
||||||
|
r.failCountMu.Lock()
|
||||||
|
count := r.failCount
|
||||||
|
r.failCountMu.Unlock()
|
||||||
|
if count >= 3 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
}
|
||||||
|
cancel()
|
||||||
|
<-stopped
|
||||||
|
|
||||||
|
r.failCountMu.Lock()
|
||||||
|
finalCount := r.failCount
|
||||||
|
r.failCountMu.Unlock()
|
||||||
|
if finalCount < 3 {
|
||||||
|
t.Fatalf("failCount should be at least 3 after waiting up to 2s, got %d", finalCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -20,6 +20,7 @@ curl https://localhost:7979/metrics
|
|||||||
|
|
||||||
| Name | Metric Type | Subsystem | Help |
|
| Name | Metric Type | Subsystem | Help |
|
||||||
|:---------------------------------|:------------|:------------|:------------------------------------------------------|
|
|:---------------------------------|:------------|:------------|:------------------------------------------------------|
|
||||||
|
| consecutive_soft_errors | Gauge | controller | Number of consecutive soft errors in reconciliation loop. |
|
||||||
| last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider |
|
| last_reconcile_timestamp_seconds | Gauge | controller | Timestamp of last attempted sync with the DNS provider |
|
||||||
| last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider |
|
| last_sync_timestamp_seconds | Gauge | controller | Timestamp of last successful sync with the DNS provider |
|
||||||
| no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. |
|
| no_op_runs_total | Counter | controller | Number of reconcile loops ending up with no changes on the DNS provider side. |
|
||||||
@ -87,5 +88,3 @@ curl https://localhost:7979/metrics
|
|||||||
| process_start_time_seconds |
|
| process_start_time_seconds |
|
||||||
| process_virtual_memory_bytes |
|
| process_virtual_memory_bytes |
|
||||||
| process_virtual_memory_max_bytes |
|
| process_virtual_memory_max_bytes |
|
||||||
| process_network_receive_bytes_total |
|
|
||||||
| process_network_transmit_bytes_total |
|
|
||||||
|
@ -37,7 +37,7 @@ func TestComputeMetrics(t *testing.T) {
|
|||||||
t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics))
|
t.Errorf("Expected not empty metrics registry, got %d", len(reg.Metrics))
|
||||||
}
|
}
|
||||||
|
|
||||||
assert.Len(t, reg.Metrics, 21)
|
assert.Len(t, reg.Metrics, 22)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGenerateMarkdownTableRenderer(t *testing.T) {
|
func TestGenerateMarkdownTableRenderer(t *testing.T) {
|
||||||
|
Loading…
Reference in New Issue
Block a user