mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-05 12:16:44 +02:00
Add watchIPNBus-compatible notification for node going away.
Without this signal, tsnet consumers (e.g. Aperture) cannot distinguish
a deleted node from a transient control-plane outage. Such a node will
sit in BackendState=Running indefinitely while mapRoutine has been
silently 404'ing on /machine/map every 30 seconds for 5 years.
With NodeNotFoundWarnable exposed on the IPN bus, the entire
consumer-side reaction collapses to a few lines in its existing notify
watcher:
for {
notify, err := watcher.Next()
if err != nil {
return
}
if notify.Health != nil {
_, deleted := notify.Health.Warnings[health.NodeNotFoundWarnable.Code]
if deleted && !nodeDeleted {
nodeDeleted = true
// surface to the operator (UI badge, alert, action, etc.)
}
}
}
A subsequent successful registration with a fresh node key clears the
warnable on its own, so re-authentication flows recover without any
extra plumbing on the consumer side. And no persistence is required: if
a consumer process crashes while the node is in this state, mapRoutine
re-hits the 404 within ~30s of restart and the warnable trips again.
Updates #19326
Signed-off-by: Luke Kosewski <lkosewsk@tailscale.com>
This commit is contained in:
parent
ff4d495e39
commit
fa4ad9d5d5
@ -550,6 +550,7 @@ func (c *Auto) mapRoutine() {
|
||||
err := c.direct.PollNetMap(ctx, mrs)
|
||||
|
||||
c.direct.health.SetOutOfPollNetMap()
|
||||
c.direct.health.SetMapRoutineNodeNotFound(err != nil && errors.Is(err, ErrNodeNotFound))
|
||||
c.mu.Lock()
|
||||
c.inMapPoll = false
|
||||
paused := c.paused
|
||||
@ -780,6 +781,10 @@ func (c *Auto) Login(flags LoginFlags) {
|
||||
|
||||
var ErrClientClosed = errors.New("client closed")
|
||||
|
||||
// ErrNodeNotFound is wrapped into errors returned by [Direct.PollNetMap]
|
||||
// when control responds to /machine/map with HTTP 404.
|
||||
var ErrNodeNotFound = errors.New("node not found")
|
||||
|
||||
func (c *Auto) Logout(ctx context.Context) error {
|
||||
c.logf("client.Logout()")
|
||||
c.mu.Lock()
|
||||
|
||||
@ -1141,8 +1141,12 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap
|
||||
if res.StatusCode != 200 {
|
||||
msg, _ := io.ReadAll(res.Body)
|
||||
res.Body.Close()
|
||||
return fmt.Errorf("initial fetch failed %d: %.200s",
|
||||
err := fmt.Errorf("initial fetch failed %d: %.200s",
|
||||
res.StatusCode, strings.TrimSpace(string(msg)))
|
||||
if res.StatusCode == http.StatusNotFound {
|
||||
err = fmt.Errorf("%w: %w", ErrNodeNotFound, err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
|
||||
@ -129,6 +129,7 @@ type Tracker struct {
|
||||
lastNotifiedControlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages processed, kept for change detection
|
||||
controlMessages map[tailcfg.DisplayMessageID]tailcfg.DisplayMessage // latest control messages received
|
||||
lastLoginErr error
|
||||
mapRoutineNodeNotFound bool // control returned 404 on /machine/map
|
||||
localLogConfigErr error
|
||||
tlsConnectionErrors map[string]error // map[ServerName]error
|
||||
metricHealthMessage any // nil or *metrics.MultiLabelMap[metricHealthMessageLabel]
|
||||
@ -932,6 +933,22 @@ func (t *Tracker) SetAuthRoutineInError(err error) {
|
||||
t.selfCheckLocked()
|
||||
}
|
||||
|
||||
// SetMapRoutineNodeNotFound records whether the control plane has reported
|
||||
// (via HTTP 404 on /machine/map) that this node no longer exists in the
|
||||
// tailnet.
|
||||
func (t *Tracker) SetMapRoutineNodeNotFound(notFound bool) {
|
||||
if t.nil() {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
if t.mapRoutineNodeNotFound == notFound {
|
||||
return
|
||||
}
|
||||
t.mapRoutineNodeNotFound = notFound
|
||||
t.selfCheckLocked()
|
||||
}
|
||||
|
||||
// SetLatestVersion records the latest version of the Tailscale client.
|
||||
// v can be nil if unknown.
|
||||
func (t *Tracker) SetLatestVersion(v *tailcfg.ClientVersion) {
|
||||
@ -1180,6 +1197,13 @@ func (t *Tracker) updateBuiltinWarnablesLocked() {
|
||||
t.setHealthyLocked(LoginStateWarnable)
|
||||
}
|
||||
|
||||
if t.mapRoutineNodeNotFound {
|
||||
t.setUnhealthyLocked(NodeNotFoundWarnable, nil)
|
||||
return
|
||||
} else {
|
||||
t.setHealthyLocked(NodeNotFoundWarnable)
|
||||
}
|
||||
|
||||
if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) {
|
||||
t.setUnhealthyLocked(notInMapPollWarnable, nil)
|
||||
return
|
||||
|
||||
@ -234,6 +234,19 @@ var mapResponseTimeoutWarnable = condRegister(func() *Warnable {
|
||||
}
|
||||
})
|
||||
|
||||
// NodeNotFoundWarnable is a Warnable that warns the user that the control plane
|
||||
// has reported this node as not present in the tailnet.
|
||||
var NodeNotFoundWarnable = condRegister(func() *Warnable {
|
||||
return &Warnable{
|
||||
Code: tsconst.HealthWarnableNodeNotFound,
|
||||
Title: "Node not found",
|
||||
Severity: SeverityHigh,
|
||||
DependsOn: []*Warnable{NetworkStatusWarnable, IPNStateWarnable},
|
||||
Text: StaticMessage("Tailscale reports this node is not present in the tailnet. It will not reconnect until re-registered."),
|
||||
ImpactsConnectivity: true,
|
||||
}
|
||||
})
|
||||
|
||||
// tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server.
|
||||
var tlsConnectionFailedWarnable = condRegister(func() *Warnable {
|
||||
return &Warnable{
|
||||
|
||||
@ -18,6 +18,7 @@ const (
|
||||
HealthWarnableDERPRegionError = "derp-region-error"
|
||||
HealthWarnableNoUDP4Bind = "no-udp4-bind"
|
||||
HealthWarnableMapResponseTimeout = "mapresponse-timeout"
|
||||
HealthWarnableNodeNotFound = "node-not-found"
|
||||
HealthWarnableTLSConnectionFailed = "tls-connection-failed"
|
||||
HealthWarnableMagicsockReceiveFuncError = "magicsock-receive-func-error"
|
||||
HealthWarnableTestWarnable = "test-warnable"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user