Add watchIPNBus-compatible notification for node going away.

Without this signal, tsnet consumers (e.g. Aperture) cannot distinguish
a deleted node from a transient control-plane outage. Such a node will
sit in BackendState=Running indefinitely, while mapRoutine keeps
silently 404'ing on /machine/map every 30 seconds, even for 5 years.

With NodeNotFoundWarnable exposed on the IPN bus, the entire
consumer-side reaction collapses to a few lines in its existing notify
watcher:

  for {
      notify, err := watcher.Next()
      if err != nil {
          return
      }
      if notify.Health != nil {
          _, deleted := notify.Health.Warnings[health.NodeNotFoundWarnable.Code]
          if deleted && !nodeDeleted {
              nodeDeleted = true
              // surface to the operator (UI badge, alert, action, etc.)
          }
      }
  }

A subsequent successful registration with a fresh node key clears the
warnable on its own, so re-authentication flows recover without any
extra plumbing on the consumer side. And no persistence is required: if
a consumer process crashes while the node is in this state, mapRoutine
re-hits the 404 within ~30s of restart and the warnable trips again.

Updates #19326

Signed-off-by: Luke Kosewski <lkosewsk@tailscale.com>
This commit is contained in:
Luke Kosewski 2026-04-30 14:37:12 -07:00
parent ff4d495e39
commit fa4ad9d5d5
5 changed files with 48 additions and 1 deletions

View File

@ -550,6 +550,7 @@ func (c *Auto) mapRoutine() {
err := c.direct.PollNetMap(ctx, mrs)
c.direct.health.SetOutOfPollNetMap()
c.direct.health.SetMapRoutineNodeNotFound(err != nil && errors.Is(err, ErrNodeNotFound))
c.mu.Lock()
c.inMapPoll = false
paused := c.paused
@ -780,6 +781,10 @@ func (c *Auto) Login(flags LoginFlags) {
// ErrClientClosed is a sentinel error carrying the message "client closed".
// NOTE(review): presumably returned once the client has been shut down;
// confirm against the call sites, which are not visible here.
var ErrClientClosed = errors.New("client closed")
// ErrNodeNotFound is a sentinel error that is wrapped into errors returned
// by [Direct.PollNetMap] when control responds to /machine/map with
// HTTP 404.
//
// Because it is wrapped rather than returned directly, callers should
// detect it with errors.Is(err, ErrNodeNotFound) instead of comparing
// with ==.
var ErrNodeNotFound = errors.New("node not found")
func (c *Auto) Logout(ctx context.Context) error {
c.logf("client.Logout()")
c.mu.Lock()

View File

@ -1141,8 +1141,12 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap
if res.StatusCode != 200 {
msg, _ := io.ReadAll(res.Body)
res.Body.Close()
return fmt.Errorf("initial fetch failed %d: %.200s",
err := fmt.Errorf("initial fetch failed %d: %.200s",
res.StatusCode, strings.TrimSpace(string(msg)))
if res.StatusCode == http.StatusNotFound {
err = fmt.Errorf("%w: %w", ErrNodeNotFound, err)
}
return err
}
defer res.Body.Close()

View File

@ -129,6 +129,7 @@ type Tracker struct {
lastNotifiedControlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages processed, kept for change detection
controlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages received
lastLoginErr error
mapRoutineNodeNotFound bool // control returned 404 on /machine/map
localLogConfigErr error
tlsConnectionErrors map[string]error // map[ServerName]error
metricHealthMessage any // nil or *metrics.MultiLabelMap[metricHealthMessageLabel]
@ -932,6 +933,22 @@ func (t *Tracker) SetAuthRoutineInError(err error) {
t.selfCheckLocked()
}
// SetMapRoutineNodeNotFound stores whether control has answered
// /machine/map with HTTP 404, i.e. whether the control plane reports that
// this node no longer exists in the tailnet.
//
// It is a no-op when t.nil() reports true. The self check is re-run only
// when the stored value actually changes, so repeated calls with the same
// notFound value do not trigger it again.
func (t *Tracker) SetMapRoutineNodeNotFound(notFound bool) {
	if t.nil() {
		return
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	// Only a state transition is interesting; an unchanged value leaves the
	// tracker untouched.
	if t.mapRoutineNodeNotFound != notFound {
		t.mapRoutineNodeNotFound = notFound
		t.selfCheckLocked()
	}
}
// SetLatestVersion records the latest version of the Tailscale client.
// v can be nil if unknown.
func (t *Tracker) SetLatestVersion(v *tailcfg.ClientVersion) {
@ -1180,6 +1197,13 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
t.setHealthyLocked(LoginStateWarnable)
}
if t.mapRoutineNodeNotFound {
t.setUnhealthyLocked(NodeNotFoundWarnable, nil)
return
} else {
t.setHealthyLocked(NodeNotFoundWarnable)
}
if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) {
t.setUnhealthyLocked(notInMapPollWarnable, nil)
return

View File

@ -234,6 +234,19 @@ var mapResponseTimeoutWarnable = condRegister(func() *Warnable {
}
})
// NodeNotFoundWarnable is the Warnable used to tell the user that the
// control plane has reported this node as not present in the tailnet.
//
// It is registered through condRegister, has high severity, is flagged as
// impacting connectivity, and lists NetworkStatusWarnable and
// IPNStateWarnable as its dependencies.
var NodeNotFoundWarnable = condRegister(func() *Warnable {
	nodeNotFound := &Warnable{
		// Identity and presentation.
		Code:  tsconst.HealthWarnableNodeNotFound,
		Title: "Node not found",
		Text:  StaticMessage("Tailscale reports this node is not present in the tailnet. It will not reconnect until re-registered."),

		// Classification.
		Severity:            SeverityHigh,
		ImpactsConnectivity: true,

		// Warnables this one depends on.
		DependsOn: []*Warnable{NetworkStatusWarnable, IPNStateWarnable},
	}
	return nodeNotFound
})
// tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server.
var tlsConnectionFailedWarnable = condRegister(func() *Warnable {
return &Warnable{

View File

@ -18,6 +18,7 @@ const (
HealthWarnableDERPRegionError = "derp-region-error"
HealthWarnableNoUDP4Bind = "no-udp4-bind"
HealthWarnableMapResponseTimeout = "mapresponse-timeout"
HealthWarnableNodeNotFound = "node-not-found"
HealthWarnableTLSConnectionFailed = "tls-connection-failed"
HealthWarnableMagicsockReceiveFuncError = "magicsock-receive-func-error"
HealthWarnableTestWarnable = "test-warnable"