mirror of
https://github.com/tailscale/tailscale.git
synced 2025-10-04 20:12:16 +02:00
control/controlclient,health,ipn/ipnlocal,health: fix deadlock by deleting health reporting
A recent change (009d702adfa0fc) introduced a deadlock where the /machine/update-health network request to report the client's health status update to the control plane was moved to being synchronous within the eventbus's pump machinery. I started to instead make the health reporting be async, but then we realized in the three years since we added that, it's barely been used and doesn't pay for itself, for how many HTTP requests it makes. Instead, delete it all and replace it with a c2n handler, which provides much more helpful information. Fixes tailscale/corp#32952 Change-Id: I9e8a5458269ebfdda1c752d7bbb8af2780d71b04 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
parent
a208cb9fd5
commit
24e38eb729
@ -12,7 +12,6 @@ import (
|
|||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"tailscale.com/health"
|
|
||||||
"tailscale.com/net/sockstats"
|
"tailscale.com/net/sockstats"
|
||||||
"tailscale.com/tailcfg"
|
"tailscale.com/tailcfg"
|
||||||
"tailscale.com/tstime"
|
"tailscale.com/tstime"
|
||||||
@ -23,7 +22,6 @@ import (
|
|||||||
"tailscale.com/types/structs"
|
"tailscale.com/types/structs"
|
||||||
"tailscale.com/util/backoff"
|
"tailscale.com/util/backoff"
|
||||||
"tailscale.com/util/clientmetric"
|
"tailscale.com/util/clientmetric"
|
||||||
"tailscale.com/util/eventbus"
|
|
||||||
"tailscale.com/util/execqueue"
|
"tailscale.com/util/execqueue"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -123,8 +121,6 @@ type Auto struct {
|
|||||||
observerQueue execqueue.ExecQueue
|
observerQueue execqueue.ExecQueue
|
||||||
shutdownFn func() // to be called prior to shutdown or nil
|
shutdownFn func() // to be called prior to shutdown or nil
|
||||||
|
|
||||||
eventSubs eventbus.Monitor
|
|
||||||
|
|
||||||
mu sync.Mutex // mutex guards the following fields
|
mu sync.Mutex // mutex guards the following fields
|
||||||
|
|
||||||
wantLoggedIn bool // whether the user wants to be logged in per last method call
|
wantLoggedIn bool // whether the user wants to be logged in per last method call
|
||||||
@ -195,10 +191,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) {
|
|||||||
shutdownFn: opts.Shutdown,
|
shutdownFn: opts.Shutdown,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set up eventbus client and subscriber
|
|
||||||
ec := opts.Bus.Client("controlClient.Auto")
|
|
||||||
c.eventSubs = ec.Monitor(c.consumeEventbusTopics(ec))
|
|
||||||
|
|
||||||
c.authCtx, c.authCancel = context.WithCancel(context.Background())
|
c.authCtx, c.authCancel = context.WithCancel(context.Background())
|
||||||
c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf)
|
c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf)
|
||||||
|
|
||||||
@ -208,27 +200,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) {
|
|||||||
return c, nil
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// consumeEventbusTopics consumes events from all relevant
|
|
||||||
// [eventbus.Subscriber]'s and passes them to their related handler. Events are
|
|
||||||
// always handled in the order they are received, i.e. the next event is not
|
|
||||||
// read until the previous event's handler has returned. It returns when the
|
|
||||||
// [eventbus.Client] is closed.
|
|
||||||
func (c *Auto) consumeEventbusTopics(ec *eventbus.Client) func(*eventbus.Client) {
|
|
||||||
healthChangeSub := eventbus.Subscribe[health.Change](ec)
|
|
||||||
return func(cli *eventbus.Client) {
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-cli.Done():
|
|
||||||
return
|
|
||||||
case change := <-healthChangeSub.Events():
|
|
||||||
if change.WarnableChanged {
|
|
||||||
c.direct.ReportWarnableChange(change.Warnable, change.UnhealthyState)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetPaused controls whether HTTP activity should be paused.
|
// SetPaused controls whether HTTP activity should be paused.
|
||||||
//
|
//
|
||||||
// The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
|
// The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
|
||||||
@ -782,8 +753,6 @@ func (c *Auto) UpdateEndpoints(endpoints []tailcfg.Endpoint) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *Auto) Shutdown() {
|
func (c *Auto) Shutdown() {
|
||||||
c.eventSubs.Close()
|
|
||||||
|
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
if c.closed {
|
if c.closed {
|
||||||
c.mu.Unlock()
|
c.mu.Unlock()
|
||||||
|
@ -1678,47 +1678,6 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReportWarnableChange reports to the control plane a change to this node's
|
|
||||||
// health. w must be non-nil. us can be nil to indicate a healthy state for w.
|
|
||||||
func (c *Direct) ReportWarnableChange(w *health.Warnable, us *health.UnhealthyState) {
|
|
||||||
if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable {
|
|
||||||
// We don't report these. These include things like the network is down
|
|
||||||
// (in which case we can't report anyway) or the user wanted things
|
|
||||||
// stopped, as opposed to the more unexpected failure types in the other
|
|
||||||
// subsystems.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
np, err := c.getNoiseClient()
|
|
||||||
if err != nil {
|
|
||||||
// Don't report errors to control if the server doesn't support noise.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
nodeKey, ok := c.GetPersist().PublicNodeKeyOK()
|
|
||||||
if !ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if c.panicOnUse {
|
|
||||||
panic("tainted client")
|
|
||||||
}
|
|
||||||
// TODO(angott): at some point, update `Subsys` in the request to be `Warnable`
|
|
||||||
req := &tailcfg.HealthChangeRequest{
|
|
||||||
Subsys: string(w.Code),
|
|
||||||
NodeKey: nodeKey,
|
|
||||||
}
|
|
||||||
if us != nil {
|
|
||||||
req.Error = us.Text
|
|
||||||
}
|
|
||||||
|
|
||||||
// Best effort, no logging:
|
|
||||||
ctx, cancel := context.WithTimeout(c.closedCtx, 5*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
res, err := np.Post(ctx, "/machine/update-health", nodeKey, req)
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
res.Body.Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetDeviceAttrs does a synchronous call to the control plane to update
|
// SetDeviceAttrs does a synchronous call to the control plane to update
|
||||||
// the node's attributes.
|
// the node's attributes.
|
||||||
//
|
//
|
||||||
|
@ -14,6 +14,9 @@ import (
|
|||||||
|
|
||||||
// State contains the health status of the backend, and is
|
// State contains the health status of the backend, and is
|
||||||
// provided to the client UI via LocalAPI through ipn.Notify.
|
// provided to the client UI via LocalAPI through ipn.Notify.
|
||||||
|
//
|
||||||
|
// It is also exposed via c2n for debugging purposes, so try
|
||||||
|
// not to change its structure too gratuitously.
|
||||||
type State struct {
|
type State struct {
|
||||||
// Each key-value pair in Warnings represents a Warnable that is currently
|
// Each key-value pair in Warnings represents a Warnable that is currently
|
||||||
// unhealthy. If a Warnable is healthy, it will not be present in this map.
|
// unhealthy. If a Warnable is healthy, it will not be present in this map.
|
||||||
|
@ -18,6 +18,7 @@ import (
|
|||||||
"tailscale.com/control/controlclient"
|
"tailscale.com/control/controlclient"
|
||||||
"tailscale.com/feature"
|
"tailscale.com/feature"
|
||||||
"tailscale.com/feature/buildfeatures"
|
"tailscale.com/feature/buildfeatures"
|
||||||
|
"tailscale.com/health"
|
||||||
"tailscale.com/ipn"
|
"tailscale.com/ipn"
|
||||||
"tailscale.com/net/sockstats"
|
"tailscale.com/net/sockstats"
|
||||||
"tailscale.com/tailcfg"
|
"tailscale.com/tailcfg"
|
||||||
@ -63,6 +64,7 @@ func init() {
|
|||||||
RegisterC2N("/debug/component-logging", handleC2NDebugComponentLogging)
|
RegisterC2N("/debug/component-logging", handleC2NDebugComponentLogging)
|
||||||
RegisterC2N("/debug/logheap", handleC2NDebugLogHeap)
|
RegisterC2N("/debug/logheap", handleC2NDebugLogHeap)
|
||||||
RegisterC2N("/debug/netmap", handleC2NDebugNetMap)
|
RegisterC2N("/debug/netmap", handleC2NDebugNetMap)
|
||||||
|
RegisterC2N("/debug/health", handleC2NDebugHealth)
|
||||||
}
|
}
|
||||||
if runtime.GOOS == "linux" && buildfeatures.HasOSRouter {
|
if runtime.GOOS == "linux" && buildfeatures.HasOSRouter {
|
||||||
RegisterC2N("POST /netfilter-kind", handleC2NSetNetfilterKind)
|
RegisterC2N("POST /netfilter-kind", handleC2NSetNetfilterKind)
|
||||||
@ -145,6 +147,14 @@ func handleC2NLogtailFlush(b *LocalBackend, w http.ResponseWriter, r *http.Reque
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func handleC2NDebugHealth(b *LocalBackend, w http.ResponseWriter, r *http.Request) {
|
||||||
|
var st *health.State
|
||||||
|
if buildfeatures.HasDebug && b.health != nil {
|
||||||
|
st = b.health.CurrentState()
|
||||||
|
}
|
||||||
|
writeJSON(w, st)
|
||||||
|
}
|
||||||
|
|
||||||
func handleC2NDebugNetMap(b *LocalBackend, w http.ResponseWriter, r *http.Request) {
|
func handleC2NDebugNetMap(b *LocalBackend, w http.ResponseWriter, r *http.Request) {
|
||||||
if !buildfeatures.HasDebug {
|
if !buildfeatures.HasDebug {
|
||||||
http.Error(w, feature.ErrUnavailable.Error(), http.StatusNotImplemented)
|
http.Error(w, feature.ErrUnavailable.Error(), http.StatusNotImplemented)
|
||||||
|
@ -172,7 +172,8 @@ type CapabilityVersion int
|
|||||||
// - 125: 2025-08-11: dnstype.Resolver adds UseWithExitNode field.
|
// - 125: 2025-08-11: dnstype.Resolver adds UseWithExitNode field.
|
||||||
// - 126: 2025-09-17: Client uses seamless key renewal unless disabled by control (tailscale/corp#31479)
|
// - 126: 2025-09-17: Client uses seamless key renewal unless disabled by control (tailscale/corp#31479)
|
||||||
// - 127: 2025-09-19: can handle C2N /debug/netmap.
|
// - 127: 2025-09-19: can handle C2N /debug/netmap.
|
||||||
const CurrentCapabilityVersion CapabilityVersion = 127
|
// - 128: 2025-10-02: can handle C2N /debug/health.
|
||||||
|
const CurrentCapabilityVersion CapabilityVersion = 128
|
||||||
|
|
||||||
// ID is an integer ID for a user, node, or login allocated by the
|
// ID is an integer ID for a user, node, or login allocated by the
|
||||||
// control plane.
|
// control plane.
|
||||||
@ -2734,6 +2735,9 @@ type SetDNSResponse struct{}
|
|||||||
// node health changes to:
|
// node health changes to:
|
||||||
//
|
//
|
||||||
// POST https://<control-plane>/machine/update-health.
|
// POST https://<control-plane>/machine/update-health.
|
||||||
|
//
|
||||||
|
// As of 2025-10-02, we stopped sending this to the control plane proactively.
|
||||||
|
// It was never useful enough with its current design and needs more thought.
|
||||||
type HealthChangeRequest struct {
|
type HealthChangeRequest struct {
|
||||||
Subsys string // a health.Subsystem value in string form
|
Subsys string // a health.Subsystem value in string form
|
||||||
Error string // or empty if cleared
|
Error string // or empty if cleared
|
||||||
|
Loading…
x
Reference in New Issue
Block a user