From df94a1487076f744742d5b5c3a234d628bfd2bb5 Mon Sep 17 00:00:00 2001 From: Tom Proctor Date: Thu, 5 Dec 2024 12:11:22 +0000 Subject: [PATCH] cmd/k8s-operator: don't error for transient failures (#14073) Every so often, the ProxyGroup and other controllers lose an optimistic locking race with other controllers that update the objects they create. Stop treating this as an error event, and instead just log an info level log line for it. Fixes #14072 Signed-off-by: Tom Proctor --- cmd/k8s-operator/connector.go | 17 +++++++++++++---- cmd/k8s-operator/dnsrecords.go | 11 ++++++++++- cmd/k8s-operator/egress-services.go | 10 +++++++++- cmd/k8s-operator/ingress.go | 10 +++++++++- cmd/k8s-operator/nameserver.go | 8 +++++++- cmd/k8s-operator/proxygroup.go | 18 +++++++++++++++--- cmd/k8s-operator/svc.go | 10 +++++++++- cmd/k8s-operator/tsrecorder.go | 17 ++++++++++++----- 8 files changed, 84 insertions(+), 17 deletions(-) diff --git a/cmd/k8s-operator/connector.go b/cmd/k8s-operator/connector.go index 1cce02fbb..c243036cb 100644 --- a/cmd/k8s-operator/connector.go +++ b/cmd/k8s-operator/connector.go @@ -10,6 +10,7 @@ import ( "fmt" "net/netip" "slices" + "strings" "sync" "time" @@ -35,6 +36,7 @@ import ( const ( reasonConnectorCreationFailed = "ConnectorCreationFailed" + reasonConnectorCreating = "ConnectorCreating" reasonConnectorCreated = "ConnectorCreated" reasonConnectorInvalid = "ConnectorInvalid" @@ -134,17 +136,24 @@ func (a *ConnectorReconciler) Reconcile(ctx context.Context, req reconcile.Reque } if err := a.validate(cn); err != nil { - logger.Errorf("error validating Connector spec: %w", err) message := fmt.Sprintf(messageConnectorInvalid, err) a.recorder.Eventf(cn, corev1.EventTypeWarning, reasonConnectorInvalid, message) return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reasonConnectorInvalid, message) } if err = a.maybeProvisionConnector(ctx, logger, cn); err != nil { - logger.Errorf("error creating Connector resources: %w", err) + reason := reasonConnectorCreationFailed message := fmt.Sprintf(messageConnectorCreationFailed, err) - a.recorder.Eventf(cn, corev1.EventTypeWarning, reasonConnectorCreationFailed, message) - return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reasonConnectorCreationFailed, message) + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + reason = reasonConnectorCreating + message = fmt.Sprintf("optimistic lock error, retrying: %s", err) + err = nil + logger.Info(message) + } else { + a.recorder.Eventf(cn, corev1.EventTypeWarning, reason, message) + } + + return setStatus(cn, tsapi.ConnectorReady, metav1.ConditionFalse, reason, message) } logger.Info("Connector resources synced") diff --git a/cmd/k8s-operator/dnsrecords.go b/cmd/k8s-operator/dnsrecords.go index bba87bf25..f91dd49ec 100644 --- a/cmd/k8s-operator/dnsrecords.go +++ b/cmd/k8s-operator/dnsrecords.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "slices" + "strings" "go.uber.org/zap" corev1 "k8s.io/api/core/v1" @@ -98,7 +99,15 @@ func (dnsRR *dnsRecordsReconciler) Reconcile(ctx context.Context, req reconcile. return reconcile.Result{}, nil } - return reconcile.Result{}, dnsRR.maybeProvision(ctx, headlessSvc, logger) + if err := dnsRR.maybeProvision(ctx, headlessSvc, logger); err != nil { + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + logger.Infof("optimistic lock error, retrying: %s", err) + } else { + return reconcile.Result{}, err + } + } + + return reconcile.Result{}, nil } // maybeProvision ensures that dnsrecords ConfigMap contains a record for the diff --git a/cmd/k8s-operator/egress-services.go b/cmd/k8s-operator/egress-services.go index a08c0b715..7544376fb 100644 --- a/cmd/k8s-operator/egress-services.go +++ b/cmd/k8s-operator/egress-services.go @@ -156,7 +156,15 @@ func (esr *egressSvcsReconciler) Reconcile(ctx context.Context, req reconcile.Re return res, err } - return res, esr.maybeProvision(ctx, svc, l) + if err := esr.maybeProvision(ctx, svc, l); err != nil { + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + l.Infof("optimistic lock error, retrying: %s", err) + } else { + return reconcile.Result{}, err + } + } + + return res, nil } func (esr *egressSvcsReconciler) maybeProvision(ctx context.Context, svc *corev1.Service, l *zap.SugaredLogger) (err error) { diff --git a/cmd/k8s-operator/ingress.go b/cmd/k8s-operator/ingress.go index 749869b22..3eb47dfb0 100644 --- a/cmd/k8s-operator/ingress.go +++ b/cmd/k8s-operator/ingress.go @@ -76,7 +76,15 @@ func (a *IngressReconciler) Reconcile(ctx context.Context, req reconcile.Request return reconcile.Result{}, a.maybeCleanup(ctx, logger, ing) } - return reconcile.Result{}, a.maybeProvision(ctx, logger, ing) + if err := a.maybeProvision(ctx, logger, ing); err != nil { + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + logger.Infof("optimistic lock error, retrying: %s", err) + } else { + return reconcile.Result{}, err + } + } + + return reconcile.Result{}, nil } func (a *IngressReconciler) maybeCleanup(ctx context.Context, logger *zap.SugaredLogger, ing *networkingv1.Ingress) error { diff --git a/cmd/k8s-operator/nameserver.go b/cmd/k8s-operator/nameserver.go index 6a9a6be93..ef0762a12 100644 --- a/cmd/k8s-operator/nameserver.go +++ b/cmd/k8s-operator/nameserver.go @@ -9,6 +9,7 @@ import ( "context" "fmt" "slices" + "strings" "sync" _ "embed" @@ -131,7 +132,12 @@ func (a *NameserverReconciler) Reconcile(ctx context.Context, req reconcile.Requ } } if err := a.maybeProvision(ctx, &dnsCfg, logger); err != nil { - return reconcile.Result{}, fmt.Errorf("error provisioning nameserver resources: %w", err) + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + logger.Infof("optimistic lock error, retrying: %s", err) + return reconcile.Result{}, nil + } else { + return reconcile.Result{}, fmt.Errorf("error provisioning nameserver resources: %w", err) + } } a.mu.Lock() diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go index 344cd9ae0..39b7ccc01 100644 --- a/cmd/k8s-operator/proxygroup.go +++ b/cmd/k8s-operator/proxygroup.go @@ -12,6 +12,7 @@ import ( "fmt" "net/http" "slices" + "strings" "sync" "github.com/pkg/errors" @@ -45,6 +46,9 @@ const ( reasonProxyGroupReady = "ProxyGroupReady" reasonProxyGroupCreating = "ProxyGroupCreating" reasonProxyGroupInvalid = "ProxyGroupInvalid" + + // Copied from k8s.io/apiserver/pkg/registry/generic/registry/store.go@cccad306d649184bf2a0e319ba830c53f65c445c + optimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again" ) var gaugeProxyGroupResources = clientmetric.NewGauge(kubetypes.MetricProxyGroupEgressCount) @@ -166,9 +170,17 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ } if err = r.maybeProvision(ctx, pg, proxyClass); err != nil { - err = fmt.Errorf("error provisioning ProxyGroup resources: %w", err) - r.recorder.Eventf(pg, corev1.EventTypeWarning, reasonProxyGroupCreationFailed, err.Error()) - return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreationFailed, err.Error()) + reason := reasonProxyGroupCreationFailed + msg := fmt.Sprintf("error provisioning ProxyGroup resources: %s", err) + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + reason = reasonProxyGroupCreating + msg = fmt.Sprintf("optimistic lock error, retrying: %s", err) + err = nil + logger.Info(msg) + } else { + r.recorder.Eventf(pg, corev1.EventTypeWarning, reason, msg) + } + return setStatusReady(pg, metav1.ConditionFalse, reason, msg) } desiredReplicas := int(pgReplicas(pg)) diff --git a/cmd/k8s-operator/svc.go b/cmd/k8s-operator/svc.go index 314ac2398..70c810b25 100644 --- a/cmd/k8s-operator/svc.go +++ b/cmd/k8s-operator/svc.go @@ -121,7 +121,15 @@ func (a *ServiceReconciler) Reconcile(ctx context.Context, req reconcile.Request return reconcile.Result{}, a.maybeCleanup(ctx, logger, svc) } - return reconcile.Result{}, a.maybeProvision(ctx, logger, svc) + if err := a.maybeProvision(ctx, logger, svc); err != nil { + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + logger.Infof("optimistic lock error, retrying: %s", err) + } else { + return reconcile.Result{}, err + } + } + + return reconcile.Result{}, nil } // maybeCleanup removes any existing resources related to serving svc over tailscale. diff --git a/cmd/k8s-operator/tsrecorder.go b/cmd/k8s-operator/tsrecorder.go index 4445578a6..44ce731fe 100644 --- a/cmd/k8s-operator/tsrecorder.go +++ b/cmd/k8s-operator/tsrecorder.go @@ -11,6 +11,7 @@ import ( "fmt" "net/http" "slices" + "strings" "sync" "github.com/pkg/errors" @@ -38,6 +39,7 @@ import ( const ( reasonRecorderCreationFailed = "RecorderCreationFailed" + reasonRecorderCreating = "RecorderCreating" reasonRecorderCreated = "RecorderCreated" reasonRecorderInvalid = "RecorderInvalid" @@ -119,23 +121,28 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques logger.Infof("ensuring Recorder is set up") tsr.Finalizers = append(tsr.Finalizers, FinalizerName) if err := r.Update(ctx, tsr); err != nil { - logger.Errorf("error adding finalizer: %w", err) return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, reasonRecorderCreationFailed) } } if err := r.validate(tsr); err != nil { - logger.Errorf("error validating Recorder spec: %w", err) message := fmt.Sprintf("Recorder is invalid: %s", err) r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderInvalid, message) return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderInvalid, message) } if err = r.maybeProvision(ctx, tsr); err != nil { - logger.Errorf("error creating Recorder resources: %w", err) + reason := reasonRecorderCreationFailed message := fmt.Sprintf("failed creating Recorder: %s", err) - r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderCreationFailed, message) - return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, message) + if strings.Contains(err.Error(), optimisticLockErrorMsg) { + reason = reasonRecorderCreating + message = fmt.Sprintf("optimistic lock error, retrying: %s", err) + err = nil + logger.Info(message) + } else { + r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderCreationFailed, message) + } + return setStatusReady(tsr, metav1.ConditionFalse, reason, message) } logger.Info("Recorder resources synced")