Brad Fitzpatrick 24e38eb729 control/controlclient,health,ipn/ipnlocal: fix deadlock by deleting health reporting
A recent change (009d702adfa0fc) introduced a deadlock: the
/machine/update-health network request, which reports the client's
health status to the control plane, became synchronous within the
eventbus's pump machinery.
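
Reduced to a minimal runnable sketch (with a hypothetical pump/publish
API, not the actual eventbus), the failure mode looks like this: a
handler running synchronously on the pump goroutine blocks on work
whose completion can only be delivered by that same pump.

package main

import "fmt"

// pump is a single-goroutine event pump: handlers run synchronously on
// the pump goroutine, and publish blocks until the pump accepts the event.
type pump struct{ events chan func() }

func (p *pump) run() {
	for f := range p.events {
		f()
	}
}

func (p *pump) publish(f func()) { p.events <- f }

func main() {
	p := &pump{events: make(chan func())}
	go p.run()

	done := make(chan struct{})
	p.publish(func() {
		// We're now on the pump goroutine. Anything synchronous that
		// needs the pump again -- say, a network request whose result
		// is delivered as another event -- can never complete:
		p.publish(func() { close(done) }) // blocks: the pump is busy running *this* handler
	})
	<-done // never closed; the runtime panics with "all goroutines are asleep - deadlock!"
	fmt.Println("unreachable")
}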

I started instead to make the health reporting async, but then we
realized that in the three years since we added it, it's barely been
used and doesn't pay for itself, given how many HTTP requests it makes.

Instead, delete it all and replace it with a c2n handler, which
provides much more helpful information.
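
A rough sketch of the shape of the replacement (the handler name, route,
and wiring are illustrative, not the actual code this change adds): a
c2n (control-to-node) handler lets the control plane pull the node's
health state on demand, rather than the node pushing an HTTP request on
every change.

package main

import (
	"encoding/json"
	"net/http"
)

// healthState stands in for whatever health snapshot the client exposes.
type healthState struct {
	Warnings []string `json:"warnings"`
}

// handleC2NHealth returns a pull-based handler: control asks for health
// when it wants it, so there is no client-initiated request per change.
func handleC2NHealth(current func() healthState) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(current())
	}
}

func main() {
	http.Handle("/debug/health", handleC2NHealth(func() healthState {
		return healthState{Warnings: []string{"example-warning"}}
	}))
	http.ListenAndServe("127.0.0.1:8080", nil) // illustrative; real c2n requests arrive over the Noise channel
}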

Fixes tailscale/corp#32952

Change-Id: I9e8a5458269ebfdda1c752d7bbb8af2780d71b04
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2025-10-02 12:48:22 -07:00

817 lines
21 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package controlclient
import (
"context"
"errors"
"fmt"
"net/http"
"sync"
"sync/atomic"
"time"
"tailscale.com/net/sockstats"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/types/key"
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/types/persist"
"tailscale.com/types/structs"
"tailscale.com/util/backoff"
"tailscale.com/util/clientmetric"
"tailscale.com/util/execqueue"
)
type LoginGoal struct {
_ structs.Incomparable
flags LoginFlags // flags to use when logging in
url string // auth url that needs to be visited
}
var _ Client = (*Auto)(nil)
// waitUnpause waits until either the client is unpaused or the Auto client is
// shut down. It reports whether the client should keep running (i.e. it's not
// closed).
func (c *Auto) waitUnpause(routineLogName string) (keepRunning bool) {
c.mu.Lock()
if !c.paused || c.closed {
defer c.mu.Unlock()
return !c.closed
}
unpaused := c.unpausedChanLocked()
c.mu.Unlock()
c.logf("%s: awaiting unpause", routineLogName)
return <-unpaused
}
// updateRoutine is responsible for informing the server of worthy changes to
// our local state. It runs in its own goroutine.
func (c *Auto) updateRoutine() {
defer close(c.updateDone)
bo := backoff.NewBackoff("updateRoutine", c.logf, 30*time.Second)
// lastUpdateGenInformed is the value of lastUpdateGen that we've successfully
// informed the server of.
var lastUpdateGenInformed updateGen
for {
if !c.waitUnpause("updateRoutine") {
c.logf("updateRoutine: exiting")
return
}
c.mu.Lock()
gen := c.lastUpdateGen
ctx := c.mapCtx
needUpdate := gen > 0 && gen != lastUpdateGenInformed && c.loggedIn
c.mu.Unlock()
if !needUpdate {
// Nothing to do, wait for a signal.
select {
case <-ctx.Done():
continue
case <-c.updateCh:
continue
}
}
t0 := c.clock.Now()
err := c.direct.SendUpdate(ctx)
d := time.Since(t0).Round(time.Millisecond)
if err != nil {
if ctx.Err() == nil {
c.direct.logf("lite map update error after %v: %v", d, err)
}
bo.BackOff(ctx, err)
continue
}
bo.BackOff(ctx, nil)
c.direct.logf("[v1] successful lite map update in %v", d)
lastUpdateGenInformed = gen
}
}
// atomicGen is an atomic int64 generator. It is used to generate monotonically
// increasing numbers for updateGen.
var atomicGen atomic.Int64
func nextUpdateGen() updateGen {
return updateGen(atomicGen.Add(1))
}
// updateGen is a monotonically increasing number that represents a particular
// update to the local state.
type updateGen int64
// Auto connects to a tailcontrol server for a node.
// It's a concrete implementation of the Client interface.
type Auto struct {
direct *Direct // our interface to the server APIs
clock tstime.Clock
logf logger.Logf
closed bool
updateCh chan struct{} // readable when we should inform the server of a change
observer Observer // called to update Client status; always non-nil
observerQueue execqueue.ExecQueue
shutdownFn func() // to be called prior to shutdown or nil
mu sync.Mutex // mutex guards the following fields
wantLoggedIn bool // whether the user wants to be logged in per last method call
urlToVisit string // the last url we were told to visit
expiry time.Time
// lastUpdateGen is the gen of the last update we had that was worth sending to
// the server.
lastUpdateGen updateGen
lastStatus atomic.Pointer[Status]
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan bool // chans that get sent true (once) on wake, or false on Shutdown
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
inMapPoll bool // true once we get the first MapResponse in a stream; false when HTTP response ends
state State // TODO(bradfitz): delete this, make it computed by method from other state
authCtx context.Context // context used for auth requests
mapCtx context.Context // context used for netmap and update requests
authCancel func() // cancel authCtx
mapCancel func() // cancel mapCtx
authDone chan struct{} // when closed, authRoutine is done
mapDone chan struct{} // when closed, mapRoutine is done
updateDone chan struct{} // when closed, updateRoutine is done
}
// New creates and starts a new Auto.
func New(opts Options) (*Auto, error) {
c, err := NewNoStart(opts)
if c != nil {
c.Start()
}
return c, err
}
// NewNoStart creates a new Auto, but without calling Start on it.
func NewNoStart(opts Options) (_ *Auto, err error) {
direct, err := NewDirect(opts)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
direct.Close()
}
}()
if opts.Observer == nil {
return nil, errors.New("missing required Options.Observer")
}
if opts.Logf == nil {
opts.Logf = func(fmt string, args ...any) {}
}
if opts.Clock == nil {
opts.Clock = tstime.StdClock{}
}
c := &Auto{
direct: direct,
clock: opts.Clock,
logf: opts.Logf,
updateCh: make(chan struct{}, 1),
authDone: make(chan struct{}),
mapDone: make(chan struct{}),
updateDone: make(chan struct{}),
observer: opts.Observer,
shutdownFn: opts.Shutdown,
}
c.authCtx, c.authCancel = context.WithCancel(context.Background())
c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf)
c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
c.mapCtx = sockstats.WithSockStats(c.mapCtx, sockstats.LabelControlClientAuto, opts.Logf)
return c, nil
}
// SetPaused controls whether HTTP activity should be paused.
//
// The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
func (c *Auto) SetPaused(paused bool) {
c.mu.Lock()
defer c.mu.Unlock()
if paused == c.paused || c.closed {
return
}
c.logf("setPaused(%v)", paused)
c.paused = paused
if paused {
c.cancelMapCtxLocked()
c.cancelAuthCtxLocked()
return
}
for _, ch := range c.unpauseWaiters {
ch <- true
}
c.unpauseWaiters = nil
}
// Start starts the client's goroutines.
//
// It should only be called for clients created by NewNoStart.
func (c *Auto) Start() {
go c.authRoutine()
go c.mapRoutine()
go c.updateRoutine()
}
// updateControl schedules a new OmitPeers, non-streaming map request (to send
// just Hostinfo/Netinfo/Endpoints info, while keeping any existing streaming
// response open).
//
// It should be called whenever there's something new to tell the server.
func (c *Auto) updateControl() {
gen := nextUpdateGen()
c.mu.Lock()
if gen < c.lastUpdateGen {
// This update is out of date.
c.mu.Unlock()
return
}
c.lastUpdateGen = gen
c.mu.Unlock()
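	// Non-blocking send: updateCh has capacity 1, so a wakeup already
	// pending in the channel also covers this update; updateRoutine
	// re-reads lastUpdateGen when it wakes.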
select {
case c.updateCh <- struct{}{}:
default:
}
}
// cancelAuthCtxLocked cancels the auth context and, unless the client is
// closed, replaces it with a fresh one. The caller must hold c.mu.
func (c *Auto) cancelAuthCtxLocked() {
if c.authCancel != nil {
c.authCancel()
}
if !c.closed {
c.authCtx, c.authCancel = context.WithCancel(context.Background())
c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, c.logf)
}
}
// cancelMapCtxLocked cancels the map context and, unless the client is
// closed, replaces it with a fresh one. The caller must hold c.mu.
func (c *Auto) cancelMapCtxLocked() {
if c.mapCancel != nil {
c.mapCancel()
}
if !c.closed {
c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
c.mapCtx = sockstats.WithSockStats(c.mapCtx, sockstats.LabelControlClientAuto, c.logf)
}
}
// restartMap cancels the existing map poll and lite update requests, and
// then starts new ones.
func (c *Auto) restartMap() {
c.mu.Lock()
c.cancelMapCtxLocked()
synced := c.inMapPoll
c.mu.Unlock()
c.logf("[v1] restartMap: synced=%v", synced)
c.updateControl()
}
func (c *Auto) authRoutine() {
defer close(c.authDone)
bo := backoff.NewBackoff("authRoutine", c.logf, 30*time.Second)
for {
if !c.waitUnpause("authRoutine") {
c.logf("authRoutine: exiting")
return
}
c.mu.Lock()
goal := c.loginGoal
ctx := c.authCtx
if goal != nil {
c.logf("[v1] authRoutine: %s; wantLoggedIn=%v", c.state, true)
} else {
c.logf("[v1] authRoutine: %s; goal=nil paused=%v", c.state, c.paused)
}
c.mu.Unlock()
report := func(err error, msg string) {
c.logf("[v1] %s: %v", msg, err)
// don't send status updates for context errors,
// since context cancelation is always on purpose.
if ctx.Err() == nil {
c.sendStatus("authRoutine-report", err, "", nil)
}
}
if goal == nil {
c.direct.health.SetAuthRoutineInError(nil)
// Wait for user to Login or Logout.
<-ctx.Done()
c.logf("[v1] authRoutine: context done.")
continue
}
c.mu.Lock()
c.urlToVisit = goal.url
if goal.url != "" {
c.state = StateURLVisitRequired
} else {
c.state = StateAuthenticating
}
c.mu.Unlock()
var url string
var err error
var f string
if goal.url != "" {
url, err = c.direct.WaitLoginURL(ctx, goal.url)
f = "WaitLoginURL"
} else {
url, err = c.direct.TryLogin(ctx, goal.flags)
f = "TryLogin"
}
if err != nil {
c.direct.health.SetAuthRoutineInError(err)
report(err, f)
bo.BackOff(ctx, err)
continue
}
if url != "" {
// goal.url ought to be empty here. However, not all control servers
// get this right, and logging about it here just generates noise.
//
// TODO(bradfitz): I don't follow that comment. Our own testcontrol
// used by tstest/integration hits this path, in fact.
if c.direct.panicOnUse {
panic("tainted client")
}
c.mu.Lock()
c.urlToVisit = url
c.loginGoal = &LoginGoal{
flags: LoginDefault,
url: url,
}
c.state = StateURLVisitRequired
c.mu.Unlock()
c.sendStatus("authRoutine-url", err, url, nil)
if goal.url == url {
// The server sent us the same URL we already tried;
// back off to avoid a busy loop.
bo.BackOff(ctx, errors.New("login URL not changing"))
} else {
bo.BackOff(ctx, nil)
}
continue
}
// success
c.direct.health.SetAuthRoutineInError(nil)
c.mu.Lock()
c.urlToVisit = ""
c.loggedIn = true
c.loginGoal = nil
c.state = StateAuthenticated
c.mu.Unlock()
c.sendStatus("authRoutine-success", nil, "", nil)
c.restartMap()
bo.BackOff(ctx, nil)
}
}
// ExpiryForTests returns the credential expiration time, or the zero value if
// the expiration time isn't known. It's used in tests only.
func (c *Auto) ExpiryForTests() time.Time {
c.mu.Lock()
defer c.mu.Unlock()
return c.expiry
}
// DirectForTest returns the underlying direct client object.
// It's used in tests only.
func (c *Auto) DirectForTest() *Direct {
return c.direct
}
// unpausedChanLocked returns a new channel that gets sent true when the
// client is unpaused, or false on Auto.Shutdown.
//
// c.mu must be held.
func (c *Auto) unpausedChanLocked() <-chan bool {
unpaused := make(chan bool, 1)
c.unpauseWaiters = append(c.unpauseWaiters, unpaused)
return unpaused
}
// ClientID returns the ClientID of the direct controlClient.
func (c *Auto) ClientID() int64 {
return c.direct.ClientID()
}
// mapRoutineState is the state of Auto.mapRoutine while it's running.
type mapRoutineState struct {
c *Auto
bo *backoff.Backoff
}
var _ NetmapDeltaUpdater = mapRoutineState{}
func (mrs mapRoutineState) UpdateFullNetmap(nm *netmap.NetworkMap) {
c := mrs.c
c.mu.Lock()
ctx := c.mapCtx
c.inMapPoll = true
if c.loggedIn {
c.state = StateSynchronized
}
c.expiry = nm.Expiry
stillAuthed := c.loggedIn
c.logf("[v1] mapRoutine: netmap received: %s", c.state)
c.mu.Unlock()
if stillAuthed {
c.sendStatus("mapRoutine-got-netmap", nil, "", nm)
}
// Reset the backoff timer if we got a netmap.
mrs.bo.BackOff(ctx, nil)
}
func (mrs mapRoutineState) UpdateNetmapDelta(muts []netmap.NodeMutation) bool {
c := mrs.c
c.mu.Lock()
goodState := c.loggedIn && c.inMapPoll
ndu, canDelta := c.observer.(NetmapDeltaUpdater)
c.mu.Unlock()
if !goodState || !canDelta {
return false
}
ctx, cancel := context.WithTimeout(c.mapCtx, 2*time.Second)
defer cancel()
var ok bool
err := c.observerQueue.RunSync(ctx, func() {
ok = ndu.UpdateNetmapDelta(muts)
})
return err == nil && ok
}
// mapRoutine is responsible for keeping a read-only streaming connection to the
// control server, and keeping the netmap up to date.
func (c *Auto) mapRoutine() {
defer close(c.mapDone)
mrs := mapRoutineState{
c: c,
bo: backoff.NewBackoff("mapRoutine", c.logf, 30*time.Second),
}
for {
if !c.waitUnpause("mapRoutine") {
c.logf("mapRoutine: exiting")
return
}
c.mu.Lock()
c.logf("[v1] mapRoutine: %s", c.state)
loggedIn := c.loggedIn
ctx := c.mapCtx
c.mu.Unlock()
report := func(err error, msg string) {
c.logf("[v1] %s: %v", msg, err)
err = fmt.Errorf("%s: %w", msg, err)
// don't send status updates for context errors,
// since context cancelation is always on purpose.
if ctx.Err() == nil {
c.sendStatus("mapRoutine1", err, "", nil)
}
}
if !loggedIn {
// Wait for something interesting to happen
c.mu.Lock()
c.inMapPoll = false
c.mu.Unlock()
<-ctx.Done()
c.logf("[v1] mapRoutine: context done.")
continue
}
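		// Bracket the poll: we mark ourselves out of poll both before
		// and after PollNetMap, which (it appears) marks us in-poll
		// while its streaming request is up.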
c.direct.health.SetOutOfPollNetMap()
err := c.direct.PollNetMap(ctx, mrs)
c.direct.health.SetOutOfPollNetMap()
c.mu.Lock()
c.inMapPoll = false
if c.state == StateSynchronized {
c.state = StateAuthenticated
}
paused := c.paused
c.mu.Unlock()
if paused {
mrs.bo.BackOff(ctx, nil)
c.logf("mapRoutine: paused")
} else {
mrs.bo.BackOff(ctx, err)
report(err, "PollNetMap")
}
}
}
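// AuthCantContinue reports whether authentication is stuck waiting on the
// user: we're not logged in and either have no login goal or are waiting
// for a login URL to be visited.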
func (c *Auto) AuthCantContinue() bool {
if c == nil {
return true
}
c.mu.Lock()
defer c.mu.Unlock()
return !c.loggedIn && (c.loginGoal == nil || c.loginGoal.url != "")
}
func (c *Auto) SetHostinfo(hi *tailcfg.Hostinfo) {
if hi == nil {
panic("nil Hostinfo")
}
if !c.direct.SetHostinfo(hi) {
// No changes. Don't log.
return
}
// Send new Hostinfo to server
c.updateControl()
}
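// SetNetInfo updates the NetInfo to send to the server and, if it
// changed, schedules an update to the server.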
func (c *Auto) SetNetInfo(ni *tailcfg.NetInfo) {
if ni == nil {
panic("nil NetInfo")
}
if !c.direct.SetNetInfo(ni) {
return
}
// Send new NetInfo to server
c.updateControl()
}
// SetTKAHead updates the TKA head hash that map-request infrastructure sends.
func (c *Auto) SetTKAHead(headHash string) {
if !c.direct.SetTKAHead(headHash) {
return
}
// Send new TKAHead to server
c.updateControl()
}
// sendStatus must not be called with c.mu held.
func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkMap) {
c.mu.Lock()
if c.closed {
c.mu.Unlock()
return
}
state := c.state
loggedIn := c.loggedIn
inMapPoll := c.inMapPoll
c.mu.Unlock()
c.logf("[v1] sendStatus: %s: %v", who, state)
var p persist.PersistView
if nm != nil && loggedIn && inMapPoll {
p = c.direct.GetPersist()
} else {
// don't send netmap status, as it's misleading when we're
// not logged in.
nm = nil
}
newSt := &Status{
URL: url,
Persist: p,
NetMap: nm,
Err: err,
state: state,
}
c.lastStatus.Store(newSt)
// Enqueue on the observer queue to avoid blocking the caller while the
// observer does its thing, which may result in a call back into the client.
metricQueued.Add(1)
c.observerQueue.Add(func() {
if canSkipStatus(newSt, c.lastStatus.Load()) {
metricSkippable.Add(1)
if !c.direct.controlKnobs.DisableSkipStatusQueue.Load() {
metricSkipped.Add(1)
return
}
}
c.observer.SetControlClientStatus(c, *newSt)
// Best effort: stop retaining the memory now that we've sent it to the
// observer (LocalBackend). We CAS here so that a Store by a newer caller
// goroutine wins any race. This is only a memory optimization and is not
// needed for correctness.
//
// If the CAS fails, that means somebody else's Store replaced our
// pointer (so mission accomplished: our netmap is no longer retained in
// any case) and that Store caller will be responsible for removing
// their own netmap (or losing their race too, down the chain).
// Eventually the last caller will win this CAS and zero lastStatus.
c.lastStatus.CompareAndSwap(newSt, nil)
})
}
var (
metricQueued = clientmetric.NewCounter("controlclient_auto_status_queued")
metricSkippable = clientmetric.NewCounter("controlclient_auto_status_queue_skippable")
metricSkipped = clientmetric.NewCounter("controlclient_auto_status_queue_skipped")
)
// canSkipStatus reports whether we can skip sending s1, knowing
// that s2 is enqueued sometime in the future after s1.
//
// s1 must be non-nil. s2 may be nil.
func canSkipStatus(s1, s2 *Status) bool {
if s2 == nil {
// Nothing in the future.
return false
}
if s1 == s2 {
// If the last item in the queue is the same as s1,
// we can't skip it.
return false
}
if s1.Err != nil || s1.URL != "" {
// If s1 has an error or a URL, we shouldn't skip it, lest the error go
// away in s2 or in-between. We want to make sure all the subsystems see
// it. Plus there aren't many of these, so not worth skipping.
return false
}
if !s1.Persist.Equals(s2.Persist) || s1.state != s2.state {
// If s1 has a different Persist or state than s2,
// don't skip it. We only care about skipping the typical
// entries where the only difference is the NetMap.
return false
}
// If nothing above precludes it, and both s1 and s2 have NetMaps, then
// we can skip it, because s2's NetMap is a newer version and we can
// jump straight from whatever state we had before to s2's state,
// without passing through s1's state first. A NetMap is regrettably a
// full snapshot of the state, not an incremental delta. We're slowly
// moving towards passing only deltas around internally at all
// layers, but this is explicitly the case where we didn't have a delta
// path for the message we received over the wire and had to resort
// to the legacy full NetMap path. And then we can get behind processing
// these full NetMap snapshots in LocalBackend/wgengine/magicsock/netstack
// and this path (when it returns true) lets us skip over useless work
// and not get behind in the queue. This matters in particular for tailnets
// that are both very large + very churny.
return s1.NetMap != nil && s2.NetMap != nil
}
func (c *Auto) Login(flags LoginFlags) {
c.logf("client.Login(%v)", flags)
c.mu.Lock()
defer c.mu.Unlock()
if c.closed {
return
}
if c.direct != nil && c.direct.panicOnUse {
panic("tainted client")
}
c.wantLoggedIn = true
c.loginGoal = &LoginGoal{
flags: flags,
}
c.cancelMapCtxLocked()
c.cancelAuthCtxLocked()
}
var ErrClientClosed = errors.New("client closed")
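// Logout attempts to log the client out. It returns ErrClientClosed if
// the client has already been shut down.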
func (c *Auto) Logout(ctx context.Context) error {
c.logf("client.Logout()")
c.mu.Lock()
c.wantLoggedIn = false
c.loginGoal = nil
closed := c.closed
if c.direct != nil && c.direct.panicOnUse {
panic("tainted client")
}
c.mu.Unlock()
if closed {
return ErrClientClosed
}
if err := c.direct.TryLogout(ctx); err != nil {
return err
}
c.mu.Lock()
c.loggedIn = false
c.state = StateNotAuthenticated
c.cancelAuthCtxLocked()
c.cancelMapCtxLocked()
c.mu.Unlock()
c.sendStatus("authRoutine-wantout", nil, "", nil)
return nil
}
func (c *Auto) SetExpirySooner(ctx context.Context, expiry time.Time) error {
return c.direct.SetExpirySooner(ctx, expiry)
}
// UpdateEndpoints sets the client's discovered endpoints and sends
// them to the control server if they've changed.
//
// It does not retain the provided slice.
func (c *Auto) UpdateEndpoints(endpoints []tailcfg.Endpoint) {
changed := c.direct.SetEndpoints(endpoints)
if changed {
c.updateControl()
}
}
func (c *Auto) Shutdown() {
c.mu.Lock()
if c.closed {
c.mu.Unlock()
return
}
c.logf("client.Shutdown ...")
shutdownFn := c.shutdownFn
direct := c.direct
c.closed = true
c.observerQueue.Shutdown()
c.cancelAuthCtxLocked()
c.cancelMapCtxLocked()
for _, w := range c.unpauseWaiters {
w <- false
}
c.unpauseWaiters = nil
c.mu.Unlock()
if shutdownFn != nil {
shutdownFn()
}
<-c.authDone
<-c.mapDone
<-c.updateDone
if direct != nil {
direct.Close()
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
c.observerQueue.Wait(ctx)
c.logf("Client.Shutdown done.")
}
// TestOnlyNodePublicKey returns the node public key currently in use.
// It's used exclusively in tests.
func (c *Auto) TestOnlyNodePublicKey() key.NodePublic {
priv := c.direct.GetPersist()
return priv.PrivateNodeKey().Public()
}
func (c *Auto) TestOnlySetAuthKey(authkey string) {
c.direct.mu.Lock()
defer c.direct.mu.Unlock()
c.direct.authKey = authkey
}
func (c *Auto) TestOnlyTimeNow() time.Time {
return c.clock.Now()
}
// SetDNS sends the SetDNSRequest request to the control plane server,
// requesting a DNS record be created or updated.
func (c *Auto) SetDNS(ctx context.Context, req *tailcfg.SetDNSRequest) error {
return c.direct.SetDNS(ctx, req)
}
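// DoNoiseRequest sends a request over the control plane's Noise
// transport and returns its response.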
func (c *Auto) DoNoiseRequest(req *http.Request) (*http.Response, error) {
return c.direct.DoNoiseRequest(req)
}