diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 224838d56..9f5bf38ae 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -12,7 +12,6 @@ import ( "sync/atomic" "time" - "tailscale.com/health" "tailscale.com/net/sockstats" "tailscale.com/tailcfg" "tailscale.com/tstime" @@ -23,7 +22,6 @@ import ( "tailscale.com/types/structs" "tailscale.com/util/backoff" "tailscale.com/util/clientmetric" - "tailscale.com/util/eventbus" "tailscale.com/util/execqueue" ) @@ -123,8 +121,6 @@ type Auto struct { observerQueue execqueue.ExecQueue shutdownFn func() // to be called prior to shutdown or nil - eventSubs eventbus.Monitor - mu sync.Mutex // mutex guards the following fields wantLoggedIn bool // whether the user wants to be logged in per last method call @@ -195,10 +191,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) { shutdownFn: opts.Shutdown, } - // Set up eventbus client and subscriber - ec := opts.Bus.Client("controlClient.Auto") - c.eventSubs = ec.Monitor(c.consumeEventbusTopics(ec)) - c.authCtx, c.authCancel = context.WithCancel(context.Background()) c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf) @@ -208,27 +200,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) { return c, nil } -// consumeEventbusTopics consumes events from all relevant -// [eventbus.Subscriber]'s and passes them to their related handler. Events are -// always handled in the order they are received, i.e. the next event is not -// read until the previous event's handler has returned. It returns when the -// [eventbus.Client] is closed. 
-func (c *Auto) consumeEventbusTopics(ec *eventbus.Client) func(*eventbus.Client) { - healthChangeSub := eventbus.Subscribe[health.Change](ec) - return func(cli *eventbus.Client) { - for { - select { - case <-cli.Done(): - return - case change := <-healthChangeSub.Events(): - if change.WarnableChanged { - c.direct.ReportWarnableChange(change.Warnable, change.UnhealthyState) - } - } - } - } -} - // SetPaused controls whether HTTP activity should be paused. // // The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once. @@ -782,8 +753,6 @@ func (c *Auto) UpdateEndpoints(endpoints []tailcfg.Endpoint) { } func (c *Auto) Shutdown() { - c.eventSubs.Close() - c.mu.Lock() if c.closed { c.mu.Unlock() diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index c77e93e1c..de577bea4 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -1678,47 +1678,6 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc return nil } -// ReportWarnableChange reports to the control plane a change to this node's -// health. w must be non-nil. us can be nil to indicate a healthy state for w. -func (c *Direct) ReportWarnableChange(w *health.Warnable, us *health.UnhealthyState) { - if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable { - // We don't report these. These include things like the network is down - // (in which case we can't report anyway) or the user wanted things - // stopped, as opposed to the more unexpected failure types in the other - // subsystems. - return - } - np, err := c.getNoiseClient() - if err != nil { - // Don't report errors to control if the server doesn't support noise. 
- return - } - nodeKey, ok := c.GetPersist().PublicNodeKeyOK() - if !ok { - return - } - if c.panicOnUse { - panic("tainted client") - } - // TODO(angott): at some point, update `Subsys` in the request to be `Warnable` - req := &tailcfg.HealthChangeRequest{ - Subsys: string(w.Code), - NodeKey: nodeKey, - } - if us != nil { - req.Error = us.Text - } - - // Best effort, no logging: - ctx, cancel := context.WithTimeout(c.closedCtx, 5*time.Second) - defer cancel() - res, err := np.Post(ctx, "/machine/update-health", nodeKey, req) - if err != nil { - return - } - res.Body.Close() -} - // SetDeviceAttrs does a synchronous call to the control plane to update // the node's attributes. // diff --git a/health/state.go b/health/state.go index 116518629..2efff92b1 100644 --- a/health/state.go +++ b/health/state.go @@ -14,6 +14,9 @@ import ( // State contains the health status of the backend, and is // provided to the client UI via LocalAPI through ipn.Notify. +// +// It is also exposed via c2n for debugging purposes, so try +// not to change its structure too gratuitously. type State struct { // Each key-value pair in Warnings represents a Warnable that is currently // unhealthy. If a Warnable is healthy, it will not be present in this map. 
diff --git a/ipn/ipnlocal/c2n.go b/ipn/ipnlocal/c2n.go index 4b5b581aa..0c228060f 100644 --- a/ipn/ipnlocal/c2n.go +++ b/ipn/ipnlocal/c2n.go @@ -18,6 +18,7 @@ import ( "tailscale.com/control/controlclient" "tailscale.com/feature" "tailscale.com/feature/buildfeatures" + "tailscale.com/health" "tailscale.com/ipn" "tailscale.com/net/sockstats" "tailscale.com/tailcfg" @@ -63,6 +64,7 @@ func init() { RegisterC2N("/debug/component-logging", handleC2NDebugComponentLogging) RegisterC2N("/debug/logheap", handleC2NDebugLogHeap) RegisterC2N("/debug/netmap", handleC2NDebugNetMap) + RegisterC2N("/debug/health", handleC2NDebugHealth) } if runtime.GOOS == "linux" && buildfeatures.HasOSRouter { RegisterC2N("POST /netfilter-kind", handleC2NSetNetfilterKind) @@ -145,6 +147,14 @@ func handleC2NLogtailFlush(b *LocalBackend, w http.ResponseWriter, r *http.Reque } } +func handleC2NDebugHealth(b *LocalBackend, w http.ResponseWriter, r *http.Request) { + var st *health.State + if buildfeatures.HasDebug && b.health != nil { + st = b.health.CurrentState() + } + writeJSON(w, st) +} + func handleC2NDebugNetMap(b *LocalBackend, w http.ResponseWriter, r *http.Request) { if !buildfeatures.HasDebug { http.Error(w, feature.ErrUnavailable.Error(), http.StatusNotImplemented) diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 88cda044f..01ecc96b3 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -172,7 +172,8 @@ type CapabilityVersion int // - 125: 2025-08-11: dnstype.Resolver adds UseWithExitNode field. // - 126: 2025-09-17: Client uses seamless key renewal unless disabled by control (tailscale/corp#31479) // - 127: 2025-09-19: can handle C2N /debug/netmap. -const CurrentCapabilityVersion CapabilityVersion = 127 +// - 128: 2025-10-02: can handle C2N /debug/health. +const CurrentCapabilityVersion CapabilityVersion = 128 // ID is an integer ID for a user, node, or login allocated by the // control plane. 
@@ -2734,6 +2735,9 @@ type SetDNSResponse struct{} // node health changes to: // // POST https://<control-plane>/machine/update-health. +// +// As of 2025-10-02, we stopped sending this to the control plane proactively. +// It was never useful enough with its current design and needs more thought. type HealthChangeRequest struct { Subsys string // a health.Subsystem value in string form Error string // or empty if cleared