ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition
If we received a wg engine status while processing an auth URL, there was a
race condition where the authURL could be reset to "" immediately after we
set it.

To fix this we need to check that we are moving from a non-Running state to
a Running state, rather than always resetting the URL when we "move" into a
Running state even if that is the current state.

We also need to make sure that we do not return from stopEngineAndWait until
the engine is stopped: before, we would return as soon as we received any
engine status update, but that might have been an update already in-flight
before we asked the engine to stop. Now we wait until we see an update that
is indicative of a stopped engine.

Updates #17388

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
parent 05a4c8e839
commit ee87f9ac2d
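The second half of the fix, waiting until the engine reports it has actually
stopped, is an instance of the standard sync.Cond idiom: re-check a predicate
in a loop after every wakeup, since a Broadcast only signals that state may
have changed, not that any particular condition now holds. Below is a minimal,
self-contained sketch of that idiom; the type, field names, and numbers are
illustrative stand-ins, not Tailscale's actual code.

package main

import (
	"fmt"
	"sync"
	"time"
)

// engine is a toy stand-in for the wgengine status tracking in LocalBackend.
type engine struct {
	mu       sync.Mutex
	cond     *sync.Cond
	numLive  int // live peers; 0 when stopped
	liveDERP int // live DERP connections; 0 when stopped
}

// setStatus publishes a new status and wakes all waiters. Broadcasting with
// the lock held ensures a waiter cannot miss the wakeup between checking the
// predicate and calling Wait (the same reasoning as in broadcastStatusChanged).
func (e *engine) setStatus(numLive, liveDERP int) {
	e.mu.Lock()
	e.numLive, e.liveDERP = numLive, liveDERP
	e.cond.Broadcast()
	e.mu.Unlock()
}

// waitForStopped blocks until the engine reports no live peers and no live
// DERP connections. Because the predicate is re-checked after every wakeup,
// a stale "still running" update that was already in flight cannot unblock
// it early, which was exactly the bug in the old single Wait() call.
func (e *engine) waitForStopped() {
	e.mu.Lock()
	defer e.mu.Unlock()
	for e.numLive != 0 || e.liveDERP != 0 {
		e.cond.Wait() // temporarily releases the lock while waiting
	}
}

func main() {
	e := &engine{numLive: 2, liveDERP: 1}
	e.cond = sync.NewCond(&e.mu)
	go func() {
		e.setStatus(1, 1) // an in-flight "still running" update
		time.Sleep(10 * time.Millisecond)
		e.setStatus(0, 0) // the real "stopped" update
	}()
	e.waitForStopped()
	fmt.Println("engine stopped")
}

The loop-around-Wait shape is what requestEngineStatusAndWaitForStopped
adopts in the diff below, with engineStatus.NumLive and engineStatus.LiveDERPs
as the predicate.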
@@ -313,9 +313,8 @@ type LocalBackend struct {
 	serveListeners     map[netip.AddrPort]*localListener // listeners for local serve traffic
 	serveProxyHandlers sync.Map                          // string (HTTPHandler.Proxy) => *reverseProxy

-	// statusLock must be held before calling statusChanged.Wait() or
+	// mu must be held before calling statusChanged.Wait() or
 	// statusChanged.Broadcast().
-	statusLock    sync.Mutex
 	statusChanged *sync.Cond

 	// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo

 	b.setTCPPortsIntercepted(nil)

-	b.statusChanged = sync.NewCond(&b.statusLock)
+	b.statusChanged = sync.NewCond(&b.mu)
 	b.e.SetStatusCallback(b.setWgengineStatus)

 	b.prevIfState = netMon.InterfaceState()
@@ -2237,14 +2236,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
 	b.send(ipn.Notify{Engine: &es})
 }

+// broadcastStatusChanged must not be called with b.mu held.
 func (b *LocalBackend) broadcastStatusChanged() {
 	// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
-	// In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+	// In this particular case, we must acquire b.mu. Otherwise we might broadcast before
 	// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
 	// the waiter can get stuck indefinitely. See PR 2865.
-	b.statusLock.Lock()
+	b.mu.Lock()
 	b.statusChanged.Broadcast()
-	b.statusLock.Unlock()
+	b.mu.Unlock()
 }

 // SetNotifyCallback sets the function to call when the backend has something to
@@ -3308,11 +3308,12 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
 	if !b.seamlessRenewalEnabled() || keyExpired {
 		b.blockEngineUpdates(true)
 		b.stopEngineAndWait()
+
+		if b.State() == ipn.Running {
+			b.enterState(ipn.Starting)
+		}
 	}
 	b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
-	if b.State() == ipn.Running {
-		b.enterState(ipn.Starting)
-	}
 }

 // validPopBrowserURL reports whether urlStr is a valid value for a
@@ -5433,7 +5434,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlockOnce) {
 	activeLogin := b.activeLogin
 	authURL := b.authURL
 	if newState == ipn.Running {
-		b.resetAuthURLLocked()
+		// TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+		// necessary when logging in or authenticating. When do we need to reset it
+		// here, rather than the other places it is reset? We should test if it is
+		// necessary and add unit tests to cover those cases, or remove it.
+		if oldState != ipn.Running {
+			b.resetAuthURLLocked()
+		}

 		// Start a captive portal detection loop if none has been
 		// started. Create a new context if none is present, since it
@@ -5670,29 +5677,34 @@ func (u unlockOnce) UnlockEarly() {
 }

 // stopEngineAndWait deconfigures the local network data plane, and
-// waits for it to deliver a status update before returning.
-//
-// TODO(danderson): this may be racy. We could unblock upon receiving
-// a status update that predates the "I've shut down" update.
+// waits for it to deliver a status update indicating it has stopped
+// before returning.
 func (b *LocalBackend) stopEngineAndWait() {
 	b.logf("stopEngineAndWait...")
 	b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
-	b.requestEngineStatusAndWait()
+	b.requestEngineStatusAndWaitForStopped()
 	b.logf("stopEngineAndWait: done.")
 }

-// Requests the wgengine status, and does not return until the status
-// was delivered (to the usual callback).
-func (b *LocalBackend) requestEngineStatusAndWait() {
-	b.logf("requestEngineStatusAndWait")
+// Requests the wgengine status, and does not return until a status was
+// delivered (to the usual callback) that indicates the engine is stopped.
+func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+	b.logf("requestEngineStatusAndWaitForStopped")

-	b.statusLock.Lock()
-	defer b.statusLock.Unlock()
+	b.mu.Lock()
+	defer b.mu.Unlock()

 	b.goTracker.Go(b.e.RequestStatus)
-	b.logf("requestEngineStatusAndWait: waiting...")
-	b.statusChanged.Wait() // temporarily releases lock while waiting
-	b.logf("requestEngineStatusAndWait: got status update.")
+	b.logf("requestEngineStatusAndWaitForStopped: waiting...")
+	for {
+		b.statusChanged.Wait() // temporarily releases lock while waiting
+
+		if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
+			b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
+			break
+		}
+		b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
+	}
 }

 // setControlClientLocked sets the control client to cc,
@@ -1561,6 +1561,151 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
 	}
 }

+// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
+// processing an auth URL don't result in the auth URL being cleared.
+func TestStateMachineURLRace(t *testing.T) {
+	runTestStateMachineURLRace(t, false)
+}
+
+func TestStateMachineURLRaceSeamless(t *testing.T) {
+	runTestStateMachineURLRace(t, true)
+}
+
+func runTestStateMachineURLRace(t *testing.T, seamless bool) {
+	var cc *mockControl
+	b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+		cc = newClient(t, opts)
+		return cc
+	})
+
+	var stateLock sync.Mutex
+	var state ipn.State
+	stateChanged := make(chan struct{})
+
+	b.SetNotifyCallback(func(n ipn.Notify) {
+		if n.State != nil || (n.Prefs != nil && n.Prefs.Valid()) || n.BrowseToURL != nil || n.LoginFinished != nil {
+			b.logf("Received notify: %+v", n)
+			if n.State != nil {
+				stateLock.Lock()
+				state = *n.State
+				stateLock.Unlock()
+				stateChanged <- struct{}{}
+			}
+		} else {
+			b.logf("(ignored) %v", n)
+		}
+	})
+
+	waitForState := func(want ipn.State) {
+		t.Helper()
+		t.Logf("waiting for state: %v", want)
+		for {
+			stateLock.Lock()
+			s := state
+			stateLock.Unlock()
+			if s == want {
+				return
+			}
+			select {
+			case <-stateChanged:
+			case <-time.After(6 * time.Second):
+				t.Fatalf("timed out waiting for state %v; currently %v", want, s)
+			}
+		}
+	}
+
+	t.Logf("Start")
+	b.Start(ipn.Options{
+		UpdatePrefs: &ipn.Prefs{
+			WantRunning: true,
+			ControlURL:  "https://localhost:1/",
+		},
+	})
+
+	waitForState(ipn.NeedsLogin)
+
+	t.Logf("LoginFinished")
+	cc.persist.UserProfile.LoginName = "user1"
+	cc.persist.NodeID = "node1"
+
+	if seamless {
+		b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
+	}
+
+	cc.send(nil, "", true, &netmap.NetworkMap{
+		SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+	})
+	waitForState(ipn.Starting)
+
+	t.Logf("Running")
+	b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+	waitForState(ipn.Running)
+
+	t.Logf("Re-auth (StartLoginInteractive)")
+	b.StartLoginInteractive(t.Context())
+
+	stop := make(chan struct{})
+	stopSpamming := sync.OnceFunc(func() {
+		stop <- struct{}{}
+	})
+	// If seamless renewal is enabled, the engine won't be disabled, and we won't
+	// ever call stopSpamming, so make sure it does get called.
+	defer stopSpamming()
+
+	// Intercept updates between the engine and LocalBackend, so that we can see
+	// when the "stopped" update comes in and ensure we stop sending our "we're
+	// up" updates after that point.
+	b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+		// This is not one of our fake status updates; this is generated from the
+		// engine in response to LocalBackend calling RequestStatus. Stop spamming
+		// our fake statuses.
+		//
+		// TODO(zofrex): This is fragile; it works right now but would break if the
+		// calling pattern of RequestStatus changes. We should ensure that we keep
+		// sending "we're up" statuses right until Reconfig is called with
+		// zero-valued configs, and after that point only send "stopped" statuses.
+		stopSpamming()
+
+		// Once stopSpamming returns we are guaranteed not to send any more updates,
+		// so we can now send the real update (indicating shutdown) and be certain
+		// it will be received after any fake updates we sent. This is possibly a
+		// stronger guarantee than we get from the real engine?
+		b.setWgengineStatus(s, err)
+	})
+
+	// The time needs to be >= the last time for the status to be accepted; send
+	// all our spam with the same stale time so that when a real update comes in
+	// it will definitely be accepted.
+	time := b.lastStatusTime
+
+	// Flood LocalBackend with a lot of wgengine status updates, so that if there
+	// are any race conditions in the multiple locks/unlocks that happen as we
+	// process the received auth URL, we will hit them.
+	go func() {
+		t.Logf("sending lots of fake wgengine status updates")
+		for {
+			select {
+			case <-stop:
+				t.Logf("stopping fake wgengine status updates")
+				return
+			default:
+				b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
+			}
+		}
+	}()
+
+	t.Logf("Re-auth (receive URL)")
+	url1 := "https://localhost:1/1"
+	cc.send(nil, url1, false, nil)
+
+	// Don't need to wait on anything else: once .send completes, authURL should
+	// be set, and once .send has completed, any opportunities for a WG engine
+	// status update to trample it have ended as well.
+	if b.authURL == "" {
+		t.Fatalf("expected authURL to be set")
+	}
+}
+
 func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
 	const (
 		firstAutoUserID = tailcfg.UserID(10000)