From fa4ad9d5d5db7d5e5e796174ec277ab53467e1d1 Mon Sep 17 00:00:00 2001 From: Luke Kosewski Date: Thu, 30 Apr 2026 14:37:12 -0700 Subject: [PATCH] Add watchIPNBus-compatible notification for node going away. Without this signal, tsnet consumers (e.g. Aperture) cannot distinguish a deleted node from a transient control-plane outage. Such a node will sit in BackendState=Running indefinitely, even while mapRoutine has been silently 404'ing on /machine/map every 30 seconds for 5 years. With NodeNotFoundWarnable exposed on the IPN bus, the entire consumer-side reaction collapses to a few lines in its existing notify watcher: for { notify, err := watcher.Next() if err != nil { return } if notify.Health != nil { _, deleted := notify.Health.Warnings[health.NodeNotFoundWarnable.Code] if deleted && !nodeDeleted { nodeDeleted = true // surface to the operator (UI badge, alert, action, etc.) } } } A subsequent successful registration with a fresh node key clears the warnable on its own, so re-authentication flows recover without any extra plumbing on the consumer side. And no persistence is required: if a consumer process crashes while the node is in this state, mapRoutine re-hits the 404 within ~30s of restart and the warnable trips again. 
Updates #19326 Signed-off-by: Luke Kosewski --- control/controlclient/auto.go | 5 +++++ control/controlclient/direct.go | 6 +++++- health/health.go | 24 ++++++++++++++++++++++++ health/warnings.go | 13 +++++++++++++ tsconst/health.go | 1 + 5 files changed, 48 insertions(+), 1 deletion(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 5b5b06def..b8a0c050e 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -550,6 +550,7 @@ func (c *Auto) mapRoutine() { err := c.direct.PollNetMap(ctx, mrs) c.direct.health.SetOutOfPollNetMap() + c.direct.health.SetMapRoutineNodeNotFound(err != nil && errors.Is(err, ErrNodeNotFound)) c.mu.Lock() c.inMapPoll = false paused := c.paused @@ -780,6 +781,10 @@ func (c *Auto) Login(flags LoginFlags) { var ErrClientClosed = errors.New("client closed") +// ErrNodeNotFound is wrapped into errors returned by [Direct.PollNetMap] +// when control responds to /machine/map with HTTP 404. +var ErrNodeNotFound = errors.New("node not found") + func (c *Auto) Logout(ctx context.Context) error { c.logf("client.Logout()") c.mu.Lock() diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index d873cc745..57b072a0f 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -1141,8 +1141,12 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap if res.StatusCode != 200 { msg, _ := io.ReadAll(res.Body) res.Body.Close() - return fmt.Errorf("initial fetch failed %d: %.200s", + err := fmt.Errorf("initial fetch failed %d: %.200s", res.StatusCode, strings.TrimSpace(string(msg))) + if res.StatusCode == http.StatusNotFound { + err = fmt.Errorf("%w: %w", ErrNodeNotFound, err) + } + return err } defer res.Body.Close() diff --git a/health/health.go b/health/health.go index 1829bd482..d95cb2981 100644 --- a/health/health.go +++ b/health/health.go @@ -129,6 +129,7 @@ type Tracker struct { lastNotifiedControlMessages 
map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages processed, kept for change detection controlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages received lastLoginErr error + mapRoutineNodeNotFound bool // control returned 404 on /machine/map localLogConfigErr error tlsConnectionErrors map[string]error // map[ServerName]error metricHealthMessage any // nil or *metrics.MultiLabelMap[metricHealthMessageLabel] @@ -932,6 +933,22 @@ func (t *Tracker) SetAuthRoutineInError(err error) { t.selfCheckLocked() } +// SetMapRoutineNodeNotFound records whether the control plane has reported +// (via HTTP 404 on /machine/map) that this node no longer exists in the +// tailnet. +func (t *Tracker) SetMapRoutineNodeNotFound(notFound bool) { + if t.nil() { + return + } + t.mu.Lock() + defer t.mu.Unlock() + if t.mapRoutineNodeNotFound == notFound { + return + } + t.mapRoutineNodeNotFound = notFound + t.selfCheckLocked() +} + // SetLatestVersion records the latest version of the Tailscale client. // v can be nil if unknown. func (t *Tracker) SetLatestVersion(v *tailcfg.ClientVersion) { @@ -1180,6 +1197,13 @@ func (t *Tracker) updateBuiltinWarnablesLocked() { t.setHealthyLocked(LoginStateWarnable) } + if t.mapRoutineNodeNotFound { + t.setUnhealthyLocked(NodeNotFoundWarnable, nil) + return + } else { + t.setHealthyLocked(NodeNotFoundWarnable) + } + if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) { t.setUnhealthyLocked(notInMapPollWarnable, nil) return diff --git a/health/warnings.go b/health/warnings.go index 416cb8ab0..5487c0046 100644 --- a/health/warnings.go +++ b/health/warnings.go @@ -234,6 +234,19 @@ var mapResponseTimeoutWarnable = condRegister(func() *Warnable { } }) +// NodeNotFoundWarnable is a Warnable that warns the user that the control plane +// has reported this node as not present in the tailnet. 
+var NodeNotFoundWarnable = condRegister(func() *Warnable { + return &Warnable{ + Code: tsconst.HealthWarnableNodeNotFound, + Title: "Node not found", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable, IPNStateWarnable}, + Text: StaticMessage("Tailscale reports this node is not present in the tailnet. It will not reconnect until re-registered."), + ImpactsConnectivity: true, + } +}) + // tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server. var tlsConnectionFailedWarnable = condRegister(func() *Warnable { return &Warnable{ diff --git a/tsconst/health.go b/tsconst/health.go index 93c6550ef..e1d11ab3f 100644 --- a/tsconst/health.go +++ b/tsconst/health.go @@ -18,6 +18,7 @@ const ( HealthWarnableDERPRegionError = "derp-region-error" HealthWarnableNoUDP4Bind = "no-udp4-bind" HealthWarnableMapResponseTimeout = "mapresponse-timeout" + HealthWarnableNodeNotFound = "node-not-found" HealthWarnableTLSConnectionFailed = "tls-connection-failed" HealthWarnableMagicsockReceiveFuncError = "magicsock-receive-func-error" HealthWarnableTestWarnable = "test-warnable"