wgengine/magicsock: replace peers slice with peersByID map; add Upsert/RemovePeer

Replace Conn.peers (a sorted views.Slice) with peersByID, a
map[tailcfg.NodeID]tailcfg.NodeView. The only use of the sorted
order, the binary search in the disco message receive path, becomes
a single map lookup. Drop the now-unused nodesEqual.
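
For illustration, the receive-path check goes from a binary search
over the sorted snapshot to a direct lookup (abridged from the
handleDiscoMessage hunk below):

    peer, ok := c.peersByID[ep.nodeID]
    if !ok {
        return // unexpected: sender is not a current peer
    }
    if !nodeHasCap(c.filt, peer, c.self, tailcfg.PeerCapabilityRelay) {
        return
    }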

Add Conn.UpsertPeer / Conn.RemovePeer for O(1) single-peer endpoint
work. RemovePeer also performs a targeted single-disco-key cleanup
instead of the previous scan over all of c.discoInfo.
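
Abridged from the RemovePeer hunk below, removal touches only the
departing peer's state:

    delete(c.peersByID, nid)
    if ep, ok := c.peerMap.endpointForNodeID(nid); ok {
        c.peerMap.deleteEndpoint(ep)
    }
    // Only this peer's disco key can have lost its last reference.
    if dk := prev.DiscoKey(); !dk.IsZero() && !c.peerMap.knownPeerDiscoKey(dk) {
        delete(c.discoInfo, dk)
    }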

Extract the shared per-peer upsert body into upsertPeerLocked,
which SetNetworkMap's bulk path still uses. SetNetworkMap is now
documented as the bulk / initial / self-change path; UpsertPeer and
RemovePeer are preferred for single-peer changes.
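
The bulk pass reduces to a loop over the shared helper (simplified
from the updateNodes hunk below):

    newPeers := make(map[tailcfg.NodeID]tailcfg.NodeView, len(peers))
    for _, n := range peers {
        newPeers[n.ID()] = n
        c.upsertPeerLocked(n, flags, entriesPerBuffer)
    }
    c.peersByID = newPeers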

Make the relay server set update O(1) per peer: add serverUpsertCh
/ serverRemoveCh to relayManager with matching run-loop handlers.
UpsertPeer / RemovePeer evaluate the per-peer relay predicate
locally and dispatch upsert or remove. The full-rebuild
updateRelayServersSet stays for the initial netmap, filter
changes, and fallback.
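
Abridged from the UpsertPeer hunk below (the predicate runs under
c.mu; the dispatch happens after unlock):

    if relayQualifies {
        c.relayManager.handleRelayServerUpsert(relayUpsert)
    } else {
        // The peer may have previously qualified; remove is a
        // no-op otherwise.
        c.relayManager.handleRelayServerRemove(n.Key())
    }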

Move the hasPeerRelayServers atomic from Conn onto relayManager,
next to the serversByNodeKey map it summarizes. The run loop is
now the single writer and needs no back-pointer to Conn;
endpoint's two hot-path readers take one extra pointer hop to
de.c.relayManager.hasPeerRelayServers, but the cost is the same
single atomic load.
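
The reader side keeps its shape, e.g. (abridged) in
endpoint.wantUDPRelayPathDiscoveryLocked:

    if !de.c.relayManager.hasPeerRelayServers.Load() {
        return false // no peer relay servers; skip path discovery
    }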

No callers use UpsertPeer/RemovePeer yet; a subsequent change will
plumb per-peer add/remove through the incremental map update path.

Updates #12542

Change-Id: If6a3442fe29ccbd77890ea61b754a4d1ad6ef225
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Brad Fitzpatrick, 2026-04-21 19:19:35 +00:00
commit 311dd3839d (parent f289f7e77c)
5 changed files with 360 additions and 193 deletions


@@ -108,8 +108,8 @@ func (c *Conn) ServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
}
sort.Slice(ent, func(i, j int) bool { return ent[i].pub.Less(ent[j].pub) })
peers := map[key.NodePublic]tailcfg.NodeView{}
for _, p := range c.peers.All() {
peers := make(map[key.NodePublic]tailcfg.NodeView, len(c.peersByID))
for _, p := range c.peersByID {
peers[p.Key()] = p
}


@@ -897,7 +897,7 @@ func (de *endpoint) wantUDPRelayPathDiscoveryLocked(now mono.Time) bool {
if runtime.GOOS == "js" {
return false
}
if !de.c.hasPeerRelayServers.Load() {
if !de.c.relayManager.hasPeerRelayServers.Load() {
// Changes in this value between its access and a call to
// [endpoint.discoverUDPRelayPathsLocked] are fine, we will eventually
// do the "right" thing during future path discovery. The worst case is
@@ -2093,7 +2093,7 @@ func (de *endpoint) setDERPHome(regionID uint16) {
de.mu.Lock()
defer de.mu.Unlock()
de.derpAddr = netip.AddrPortFrom(tailcfg.DerpMagicIPAddr, uint16(regionID))
if de.c.hasPeerRelayServers.Load() {
if de.c.relayManager.hasPeerRelayServers.Load() {
de.c.relayManager.handleDERPHomeChange(de.publicKey, regionID)
}
}


@@ -269,12 +269,6 @@ type Conn struct {
// captureHook, if non-nil, is the pcap logging callback when capturing.
captureHook syncs.AtomicValue[packet.CaptureCallback]
// hasPeerRelayServers is whether [relayManager] is configured with at least
// one peer relay server via [relayManager.handleRelayServersSet]. It exists
// to suppress calls into [relayManager] leading to wasted work involving
// channel operations and goroutine creation.
hasPeerRelayServers atomic.Bool
// discoAtomic is the current disco private and public keypair for this conn.
discoAtomic discoAtomic
@@ -361,18 +355,18 @@ type Conn struct {
// magicsock could do with any complexity reduction it can get.
netInfoLast *tailcfg.NetInfo
derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
self tailcfg.NodeView // from last SetNetworkMap
peers views.Slice[tailcfg.NodeView] // from last SetNetworkMap, sorted by Node.ID; Note: [netmap.NodeMutation]'s rx'd in UpdateNetmapDelta are never applied
filt *filter.Filter // from last SetFilter
relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
lastFlags debugFlags // at time of last SetNetworkMap
privateKey key.NodePrivate // WireGuard private key for this node
everHadKey bool // whether we ever had a non-zero private key
myDerp int // nearest DERP region ID; 0 means none/unknown
homeless bool // if true, don't try to find & stay connected to a DERP home (myDerp will stay 0)
derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
self tailcfg.NodeView // from last SetNetworkMap
peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots.
filt *filter.Filter // from last SetFilter
relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
lastFlags debugFlags // at time of last SetNetworkMap
privateKey key.NodePrivate // WireGuard private key for this node
everHadKey bool // whether we ever had a non-zero private key
myDerp int // nearest DERP region ID; 0 means none/unknown
homeless bool // if true, don't try to find & stay connected to a DERP home (myDerp will stay 0)
derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
prevDerp map[int]*syncs.WaitGroupChan
// derpRoute contains optional alternate routes to use as an
@@ -2430,24 +2424,12 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake
if c.filt == nil {
return
}
// Binary search of peers is O(log n) while c.mu is held.
// TODO: We might be able to use ep.nodeAddr instead of all addresses,
// or we might be able to release c.mu before doing this work. Keep it
// simple and slow for now. c.peers.AsSlice is a copy. We may need to
// write our own binary search for a [views.Slice].
peerI, ok := slices.BinarySearchFunc(c.peers.AsSlice(), ep.nodeID, func(peer tailcfg.NodeView, target tailcfg.NodeID) int {
if peer.ID() < target {
return -1
} else if peer.ID() > target {
return 1
}
return 0
})
peer, ok := c.peersByID[ep.nodeID]
if !ok {
// unexpected
return
}
if !nodeHasCap(c.filt, c.peers.At(peerI), c.self, tailcfg.PeerCapabilityRelay) {
if !nodeHasCap(c.filt, peer, c.self, tailcfg.PeerCapabilityRelay) {
return
}
// [Conn.mu] must not be held while publishing, or [Conn.onUDPRelayAllocResp]
@@ -2784,18 +2766,6 @@ func (c *Conn) UpdatePeers(newPeers set.Set[key.NodePublic]) {
}
}
func nodesEqual(x, y views.Slice[tailcfg.NodeView]) bool {
if x.Len() != y.Len() {
return false
}
for i := range x.Len() {
if !x.At(i).Equal(y.At(i)) {
return false
}
}
return true
}
// debugRingBufferSize returns a maximum size for our set of endpoint ring
// buffers by assuming that a single large update is ~500 bytes, and that we
// want to not use more than 1MiB of memory on phones / 4MiB on other devices.
@@ -2883,7 +2853,7 @@ func (c *Conn) SetFilter(f *filter.Filter) {
c.mu.Lock()
c.filt = f
self := c.self
peers := c.peers
peers := c.peerSnapshotLocked()
relayClientEnabled := c.relayClientEnabled
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
@@ -2897,11 +2867,26 @@ func (c *Conn) SetFilter(f *filter.Filter) {
c.updateRelayServersSet(f, self, peers)
}
// peerSnapshotLocked returns a freshly-allocated slice of the current peers.
// It's used by callers that need to pass peer state to an O(m * n) callee
// (like [Conn.updateRelayServersSet]) after releasing c.mu. c.mu must be held.
func (c *Conn) peerSnapshotLocked() []tailcfg.NodeView {
if len(c.peersByID) == 0 {
return nil
}
out := make([]tailcfg.NodeView, 0, len(c.peersByID))
for _, p := range c.peersByID {
out = append(out, p)
}
return out
}
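// Usage sketch: snapshot under c.mu, release the lock, then do the O(m * n)
// walk, as [Conn.SetFilter] does above:
//
//	c.mu.Lock()
//	filt, self := c.filt, c.self
//	peers := c.peerSnapshotLocked()
//	c.mu.Unlock()
//	c.updateRelayServersSet(filt, self, peers)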
// updateRelayServersSet iterates all peers and self, evaluating filt for each
// one in order to determine which are relay server candidates. filt, self, and
// peers are passed as args (vs c.mu-guarded fields) to enable callers to
// release c.mu before calling as this is O(m * n) (we iterate all cap rules 'm'
// in filt for every peer 'n').
// in filt for every peer 'n'). peers must be a snapshot owned by the caller;
// this function does not retain it after return.
//
// Calls to updateRelayServersSet must never run concurrent to
// [endpoint.setDERPHome], otherwise [candidatePeerRelay] DERP home changes may
@@ -2913,9 +2898,9 @@ func (c *Conn) SetFilter(f *filter.Filter) {
// them.
// 2. Moving this work upstream into [nodeBackend] or similar, and publishing
// the computed result over the eventbus instead.
func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView, peers views.Slice[tailcfg.NodeView]) {
func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView, peers []tailcfg.NodeView) {
relayServers := make(set.Set[candidatePeerRelay])
nodes := append(peers.AsSlice(), self)
nodes := append(peers, self)
for _, maybeCandidate := range nodes {
if maybeCandidate.ID() != self.ID() && !capVerIsRelayCapable(maybeCandidate.Cap()) {
// If maybeCandidate's [tailcfg.CapabilityVersion] is not relay-capable,
@@ -2933,12 +2918,9 @@ func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView,
derpHomeRegionID: uint16(maybeCandidate.HomeDERP()),
})
}
// [relayManager]'s run loop updates [relayManager.hasPeerRelayServers]
// to reflect the new server count.
c.relayManager.handleRelayServersSet(relayServers)
if len(relayServers) > 0 {
c.hasPeerRelayServers.Store(true)
} else {
c.hasPeerRelayServers.Store(false)
}
}
// nodeHasCap returns true if src has cap on dst, otherwise it returns false.
@@ -2990,6 +2972,12 @@ func (c *candidatePeerRelay) isValid() bool {
// magicsock has the current state before subsequent operations proceed.
//
// self may be invalid if there's no network map.
//
// SetNetworkMap takes the full peer list and walks all of it. For incremental
// updates where only a single peer changes, prefer the O(1) [Conn.UpsertPeer]
// and [Conn.RemovePeer] methods. SetNetworkMap remains the right call for the
// initial netmap and for changes to self or to global state (filter, DERP,
// etc.) that aren't covered by the per-peer methods.
func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
peersChanged := c.updateNodes(self, peers)
@@ -3002,7 +2990,7 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
c.relayClientEnabled = relayClientEnabled
filt := c.filt
selfView := c.self
peersView := c.peers
peersSnap := c.peerSnapshotLocked()
isClosed := c.closed
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
@@ -3012,16 +3000,16 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
if peersChanged || relayClientChanged {
if !relayClientEnabled {
// [relayManager]'s run loop updates [relayManager.hasPeerRelayServers].
c.relayManager.handleRelayServersSet(nil)
c.hasPeerRelayServers.Store(false)
} else {
c.updateRelayServersSet(filt, selfView, peersView)
c.updateRelayServersSet(filt, selfView, peersSnap)
}
}
}
// updateNodes updates [Conn] to reflect the given self node and peers.
// It reports whether the peers were changed from before.
// It reports whether the peer set (membership or any field) changed.
func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (peersChanged bool) {
c.mu.Lock()
defer c.mu.Unlock()
@@ -3030,13 +3018,9 @@ func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (pee
return false
}
priorPeers := c.peers
metricNumPeers.Set(int64(len(peers)))
// Update c.self & c.peers regardless, before the following early return.
c.self = self
curPeers := views.SliceOf(peers)
c.peers = curPeers
// [debugFlags] are mutable in [Conn.SetSilentDisco] &
// [Conn.SetProbeUDPLifetime]. These setters are passed [controlknobs.Knobs]
@@ -3049,137 +3033,43 @@ func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (pee
// TODO: mutate [debugFlags] here instead of in various [Conn] setters.
flags := c.debugFlagsLocked()
peersChanged = !nodesEqual(priorPeers, curPeers)
if !peersChanged && c.lastFlags == flags {
// The rest of this function is all adjusting state for peers that have
// changed. But if the set of peers is equal and the debug flags (for
// silent disco and probe UDP lifetime) haven't changed, there is no
// need to do anything else.
return
// Fast path: if the peer set and every peer's NodeView are unchanged,
// and flags are unchanged, skip all further work.
if c.lastFlags == flags && len(peers) == len(c.peersByID) {
allSame := true
for _, n := range peers {
if prev, ok := c.peersByID[n.ID()]; !ok || !prev.Equal(n) {
allSame = false
break
}
}
if allSame {
return false
}
}
c.lastFlags = flags
c.logf("[v1] magicsock: got updated network map; %d peers", len(peers))
entriesPerBuffer := debugRingBufferSize(len(peers))
// Try a pass of just upserting nodes and creating missing
// endpoints. If the set of nodes is the same, this is an
// efficient alloc-free update. If the set of nodes is different,
// we'll fall through to the next pass, which allocates but can
// handle full set updates.
// Build the new peer map while upserting each peer.
newPeers := make(map[tailcfg.NodeID]tailcfg.NodeView, len(peers))
for _, n := range peers {
if n.ID() == 0 {
devPanicf("node with zero ID")
continue
}
if n.Key().IsZero() {
devPanicf("node with zero key")
continue
}
ep, ok := c.peerMap.endpointForNodeID(n.ID())
if ok && ep.publicKey != n.Key() {
// The node rotated public keys. Delete the old endpoint and create
// it anew.
c.peerMap.deleteEndpoint(ep)
ok = false
}
if ok {
// At this point we're modifying an existing endpoint (ep) whose
// public key and nodeID match n. Its other fields (such as disco
// key or endpoints) might've changed.
if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
// Discokey transitioned from non-zero to zero? This should not
// happen in the wild, however it could mean:
// 1. A node was downgraded from post 0.100 to pre 0.100.
// 2. A Tailscale node key was extracted and used on a
// non-Tailscale node (should not enter here due to the
// IsWireGuardOnly check)
// 3. The server is misbehaving.
c.peerMap.deleteEndpoint(ep)
continue
}
var oldDiscoKey key.DiscoPublic
if epDisco := ep.disco.Load(); epDisco != nil {
oldDiscoKey = epDisco.key
}
ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
c.peerMap.upsertEndpoint(ep, oldDiscoKey) // maybe update discokey mappings in peerMap
continue
}
if ep, ok := c.peerMap.endpointForNodeKey(n.Key()); ok {
// At this point n.Key() should be for a key we've never seen before. If
// ok was true above, it was an update to an existing matching key and
// we don't get this far. If ok was false above, that means it's a key
// that differs from the one the NodeID had. But double check.
if ep.nodeID != n.ID() {
// Server error. This is known to be a particular issue for Mullvad
// nodes (http://go/corp/27300), so log a distinct error for the
// Mullvad and non-Mullvad cases. The error will be logged either way,
// so an approximate heuristic is fine.
//
// When #27300 is fixed, we can delete this branch and log the same
// panic for any public key moving.
if strings.HasSuffix(n.Name(), ".mullvad.ts.net.") {
devPanicf("public key moved between Mullvad nodeIDs (old=%v new=%v, key=%s); see http://go/corp/27300", ep.nodeID, n.ID(), n.Key().String())
} else {
devPanicf("public key moved between nodeIDs (old=%v new=%v, key=%s)", ep.nodeID, n.ID(), n.Key().String())
}
} else {
// Internal data structures out of sync.
devPanicf("public key found in peerMap but not by nodeID")
}
continue
}
if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
// Ancient pre-0.100 node, which does not have a disco key.
// No longer supported.
continue
}
ep = &endpoint{
c: c,
nodeID: n.ID(),
publicKey: n.Key(),
publicKeyHex: n.Key().UntypedHexString(),
sentPing: map[stun.TxID]sentPing{},
endpointState: map[netip.AddrPort]*endpointState{},
heartbeatDisabled: flags.heartbeatDisabled,
isWireguardOnly: n.IsWireGuardOnly(),
}
switch runtime.GOOS {
case "ios", "android":
// Omit, to save memory. Prior to 2024-03-20 we used to limit it to
// ~1MB on mobile but we never used the data so the memory was just
// wasted.
default:
ep.debugUpdates = ringlog.New[EndpointChange](entriesPerBuffer)
}
if n.Addresses().Len() > 0 {
ep.nodeAddr = n.Addresses().At(0).Addr()
}
ep.initFakeUDPAddr()
ep.updateDiscoKey(n.DiscoKey())
if debugPeerMap() {
c.logEndpointCreated(n)
}
ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
c.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
newPeers[n.ID()] = n
c.upsertPeerLocked(n, flags, entriesPerBuffer)
}
if len(newPeers) != len(peers) {
// Duplicate NodeIDs in the input shouldn't happen, but log if so.
c.logf("[unexpected] magicsock.updateNodes: %d peers input but %d unique IDs", len(peers), len(newPeers))
}
c.peersByID = newPeers
// If the set of nodes changed since the last SetNetworkMap, the
// upsert loop just above made c.peerMap contain the union of the
// old and new peers - which will be larger than the set from the
// current netmap. If that happens, go through the allocful
// deletion path to clean up moribund nodes.
if c.peerMap.nodeCount() != len(peers) {
// If the upsert pass left stale endpoints in peerMap (peers removed
// relative to before), clean them up.
if c.peerMap.nodeCount() != len(newPeers) {
keep := set.Set[key.NodePublic]{}
for _, n := range peers {
for _, n := range newPeers {
keep.Add(n.Key())
}
c.peerMap.forEachEndpoint(func(ep *endpoint) {
@@ -3189,14 +3079,226 @@ func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (pee
})
}
// discokeys might have changed in the above. Discard unused info.
// discokeys might have changed above. Discard unused info.
for dk := range c.discoInfo {
if !c.peerMap.knownPeerDiscoKey(dk) {
delete(c.discoInfo, dk)
}
}
return peersChanged
return true
}
// upsertPeerLocked upserts a single peer's endpoint in c.peerMap. It is the
// per-peer body shared by [Conn.SetNetworkMap]'s upsert pass and by the
// efficient per-peer [Conn.UpsertPeer] path.
//
// c.mu must be held.
func (c *Conn) upsertPeerLocked(n tailcfg.NodeView, flags debugFlags, entriesPerBuffer int) {
if n.ID() == 0 {
devPanicf("node with zero ID")
return
}
if n.Key().IsZero() {
devPanicf("node with zero key")
return
}
ep, ok := c.peerMap.endpointForNodeID(n.ID())
if ok && ep.publicKey != n.Key() {
// The node rotated public keys. Delete the old endpoint and create
// it anew.
c.peerMap.deleteEndpoint(ep)
ok = false
}
if ok {
// At this point we're modifying an existing endpoint (ep) whose
// public key and nodeID match n. Its other fields (such as disco
// key or endpoints) might've changed.
if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
// Discokey transitioned from non-zero to zero? This should not
// happen in the wild, however it could mean:
// 1. A node was downgraded from post 0.100 to pre 0.100.
// 2. A Tailscale node key was extracted and used on a
// non-Tailscale node (should not enter here due to the
// IsWireGuardOnly check)
// 3. The server is misbehaving.
c.peerMap.deleteEndpoint(ep)
return
}
var oldDiscoKey key.DiscoPublic
if epDisco := ep.disco.Load(); epDisco != nil {
oldDiscoKey = epDisco.key
}
ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
c.peerMap.upsertEndpoint(ep, oldDiscoKey) // maybe update discokey mappings in peerMap
return
}
if ep, ok := c.peerMap.endpointForNodeKey(n.Key()); ok {
// At this point n.Key() should be for a key we've never seen before. If
// ok was true above, it was an update to an existing matching key and
// we don't get this far. If ok was false above, that means it's a key
// that differs from the one the NodeID had. But double check.
if ep.nodeID != n.ID() {
// Server error. This is known to be a particular issue for Mullvad
// nodes (http://go/corp/27300), so log a distinct error for the
// Mullvad and non-Mullvad cases. The error will be logged either way,
// so an approximate heuristic is fine.
//
// When #27300 is fixed, we can delete this branch and log the same
// panic for any public key moving.
if strings.HasSuffix(n.Name(), ".mullvad.ts.net.") {
devPanicf("public key moved between Mullvad nodeIDs (old=%v new=%v, key=%s); see http://go/corp/27300", ep.nodeID, n.ID(), n.Key().String())
} else {
devPanicf("public key moved between nodeIDs (old=%v new=%v, key=%s)", ep.nodeID, n.ID(), n.Key().String())
}
} else {
// Internal data structures out of sync.
devPanicf("public key found in peerMap but not by nodeID")
}
return
}
if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
// Ancient pre-0.100 node, which does not have a disco key.
// No longer supported.
return
}
ep = &endpoint{
c: c,
nodeID: n.ID(),
publicKey: n.Key(),
publicKeyHex: n.Key().UntypedHexString(),
sentPing: map[stun.TxID]sentPing{},
endpointState: map[netip.AddrPort]*endpointState{},
heartbeatDisabled: flags.heartbeatDisabled,
isWireguardOnly: n.IsWireGuardOnly(),
}
switch runtime.GOOS {
case "ios", "android":
// Omit, to save memory. Prior to 2024-03-20 we used to limit it to
// ~1MB on mobile but we never used the data so the memory was just
// wasted.
default:
ep.debugUpdates = ringlog.New[EndpointChange](entriesPerBuffer)
}
if n.Addresses().Len() > 0 {
ep.nodeAddr = n.Addresses().At(0).Addr()
}
ep.initFakeUDPAddr()
ep.updateDiscoKey(n.DiscoKey())
if debugPeerMap() {
c.logEndpointCreated(n)
}
ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
c.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
}
// UpsertPeer adds or updates a single peer in c. It is the efficient
// O(1)-per-peer alternative to [Conn.SetNetworkMap] when a single peer is
// added or its fields change. The caller is responsible for serializing
// UpsertPeer/RemovePeer/SetNetworkMap calls relative to one another.
//
// UpsertPeer also updates the relay-server set incrementally (O(1)): it
// evaluates the peer's relay candidacy and dispatches a single upsert or
// remove, rather than rebuilding the whole set with
// [Conn.updateRelayServersSet].
func (c *Conn) UpsertPeer(n tailcfg.NodeView) {
c.mu.Lock()
if c.closed {
c.mu.Unlock()
return
}
if n.ID() == 0 {
c.mu.Unlock()
devPanicf("UpsertPeer: node with zero ID")
return
}
flags := c.debugFlagsLocked()
c.peersByID[n.ID()] = n
c.upsertPeerLocked(n, flags, debugRingBufferSize(len(c.peersByID)))
var relayUpsert candidatePeerRelay
relayQualifies := false
if c.relayClientEnabled {
relayQualifies, relayUpsert = c.relayCandidateLocked(n)
}
relayClientEnabled := c.relayClientEnabled
c.mu.Unlock()
if relayClientEnabled {
if relayQualifies {
c.relayManager.handleRelayServerUpsert(relayUpsert)
} else {
// The peer may have previously qualified; remove covers that
// case and is a no-op otherwise.
c.relayManager.handleRelayServerRemove(n.Key())
}
}
}
// RemovePeer removes a single peer from c. It is the efficient
// O(1)-per-peer alternative to [Conn.SetNetworkMap] when a single peer is
// removed. The caller is responsible for serializing UpsertPeer/RemovePeer/
// SetNetworkMap calls relative to one another.
func (c *Conn) RemovePeer(nid tailcfg.NodeID) {
c.mu.Lock()
if c.closed {
c.mu.Unlock()
return
}
prev, ok := c.peersByID[nid]
if !ok {
c.mu.Unlock()
return
}
delete(c.peersByID, nid)
if ep, ok := c.peerMap.endpointForNodeID(nid); ok {
c.peerMap.deleteEndpoint(ep)
}
// If the peer we just removed held the only reference to its disco
// key, drop the now-orphaned c.discoInfo entry. No need to scan the
// whole map — only this peer's disco key can have become unreferenced
// by this single removal.
if dk := prev.DiscoKey(); !dk.IsZero() && !c.peerMap.knownPeerDiscoKey(dk) {
delete(c.discoInfo, dk)
}
relayClientEnabled := c.relayClientEnabled
c.mu.Unlock()
if relayClientEnabled {
// Tell the relay manager to drop the peer. The run loop no-ops
// this if the peer wasn't a relay server.
c.relayManager.handleRelayServerRemove(prev.Key())
}
}
// relayCandidateLocked reports whether peer p is eligible to be a relay
// server candidate for self, and if so returns the [candidatePeerRelay]
// that would be added to the relay-server set. c.mu must be held.
//
// It mirrors the per-peer predicate in [Conn.updateRelayServersSet].
func (c *Conn) relayCandidateLocked(p tailcfg.NodeView) (ok bool, cp candidatePeerRelay) {
if !p.Valid() {
return false, candidatePeerRelay{}
}
// The cap-version gate in updateRelayServersSet applies only to peers
// (not self). This helper is only ever called for peers, so the gate is
// always checked here.
if !capVerIsRelayCapable(p.Cap()) {
return false, candidatePeerRelay{}
}
if !nodeHasCap(c.filt, p, c.self, tailcfg.PeerCapabilityRelay) {
return false, candidatePeerRelay{}
}
return true, candidatePeerRelay{
nodeKey: p.Key(),
discoKey: p.DiscoKey(),
derpHomeRegionID: uint16(p.HomeDERP()),
}
}
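// Hypothetical sketch (no callers exist yet, and applyPeerDelta and its
// signature are illustrative, not part of this change): the planned
// incremental map-update path would drive the new API like so:
//
//	func applyPeerDelta(c *Conn, added []tailcfg.NodeView, removed []tailcfg.NodeID) {
//		for _, n := range added {
//			c.UpsertPeer(n) // O(1): map upsert plus single endpoint update
//		}
//		for _, nid := range removed {
//			c.RemovePeer(nid) // O(1): map delete plus targeted disco cleanup
//		}
//	}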
func devPanicf(format string, a ...any) {


@@ -62,7 +62,6 @@ import (
"tailscale.com/types/netlogtype"
"tailscale.com/types/netmap"
"tailscale.com/types/nettype"
"tailscale.com/types/views"
"tailscale.com/util/cibuild"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
@@ -3847,7 +3846,7 @@ func TestConn_SetNetworkMap_updateRelayServersSet(t *testing.T) {
c.filt = tt.filt
if len(tt.wantRelayServers) == 0 {
// So we can verify it gets flipped back.
c.hasPeerRelayServers.Store(true)
c.relayManager.hasPeerRelayServers.Store(true)
}
c.SetNetworkMap(tt.self, tt.peers)
@@ -3855,8 +3854,8 @@
if !got.Equal(tt.wantRelayServers) {
t.Fatalf("got: %v != want: %v", got, tt.wantRelayServers)
}
if len(tt.wantRelayServers) > 0 != c.hasPeerRelayServers.Load() {
t.Fatalf("c.hasPeerRelayServers: %v != len(tt.wantRelayServers) > 0: %v", c.hasPeerRelayServers.Load(), len(tt.wantRelayServers) > 0)
if got, want := c.relayManager.hasPeerRelayServers.Load(), len(tt.wantRelayServers) > 0; got != want {
t.Fatalf("c.relayManager.hasPeerRelayServers: %v != len(tt.wantRelayServers) > 0: %v", got, want)
}
if c.relayClientEnabled != tt.wantRelayClientEnabled {
t.Fatalf("c.relayClientEnabled: %v != wantRelayClientEnabled: %v", c.relayClientEnabled, tt.wantRelayClientEnabled)
@@ -4422,7 +4421,7 @@ func TestReceiveTSMPDiscoKeyAdvertisement(t *testing.T) {
netip.MustParsePrefix("100.64.0.1/32"),
},
}).View()
conn.peers = views.SliceOf([]tailcfg.NodeView{nodeView})
conn.peersByID = map[tailcfg.NodeID]tailcfg.NodeView{nodeView.ID(): nodeView}
conn.mu.Unlock()
conn.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
@@ -4468,7 +4467,7 @@ func TestSendingTSMPDiscoTimer(t *testing.T) {
netip.MustParsePrefix("100.64.0.1/32"),
},
}).View()
conn.peers = views.SliceOf([]tailcfg.NodeView{nodeView})
conn.peersByID = map[tailcfg.NodeID]tailcfg.NodeView{nodeView.ID(): nodeView}
conn.mu.Unlock()
conn.peerMap.upsertEndpoint(ep, key.DiscoPublic{})


@@ -9,6 +9,7 @@ import (
"fmt"
"net/netip"
"sync"
"sync/atomic"
"time"
"tailscale.com/disco"
@@ -34,6 +35,14 @@ import (
type relayManager struct {
initOnce sync.Once
// hasPeerRelayServers is whether relayManager is configured with at
// least one peer relay server via [relayManager.handleRelayServersSet]
// (or per-peer variants). Exposed as an atomic so [endpoint] hot paths
// can short-circuit when there are no relay servers without taking any
// lock or entering the run loop. Written only from runLoop() via
// [relayManager.publishHasServersRunLoop].
hasPeerRelayServers atomic.Bool
// ===================================================================
// The following fields are owned by a single goroutine, runLoop().
serversByNodeKey map[key.NodePublic]candidatePeerRelay
@@ -56,6 +65,8 @@ type relayManager struct {
newServerEndpointCh chan newRelayServerEndpointEvent
rxDiscoMsgCh chan relayDiscoMsgEvent
serversCh chan set.Set[candidatePeerRelay]
serverUpsertCh chan candidatePeerRelay
serverRemoveCh chan key.NodePublic
getServersCh chan chan set.Set[candidatePeerRelay]
derpHomeChangeCh chan derpHomeChangeEvent
@@ -228,6 +239,16 @@ func (r *relayManager) runLoop() {
if !r.hasActiveWorkRunLoop() {
return
}
case upsert := <-r.serverUpsertCh:
r.handleServerUpsertRunLoop(upsert)
if !r.hasActiveWorkRunLoop() {
return
}
case nk := <-r.serverRemoveCh:
r.handleServerRemoveRunLoop(nk)
if !r.hasActiveWorkRunLoop() {
return
}
case getServersCh := <-r.getServersCh:
r.handleGetServersRunLoop(getServersCh)
if !r.hasActiveWorkRunLoop() {
@@ -265,6 +286,34 @@ func (r *relayManager) handleServersUpdateRunLoop(update set.Set[candidatePeerRe
for _, v := range update.Slice() {
r.serversByNodeKey[v.nodeKey] = v
}
r.publishHasServersRunLoop()
}
// handleServerUpsertRunLoop inserts or updates cp in serversByNodeKey. It is
// the per-peer analog of [relayManager.handleServersUpdateRunLoop] used by
// [Conn.UpsertPeer].
func (r *relayManager) handleServerUpsertRunLoop(cp candidatePeerRelay) {
r.serversByNodeKey[cp.nodeKey] = cp
r.publishHasServersRunLoop()
}
// handleServerRemoveRunLoop deletes nk from serversByNodeKey. It is a no-op
// if nk isn't currently a known server. It is the per-peer analog of
// [relayManager.handleServersUpdateRunLoop] used by [Conn.RemovePeer] and by
// [Conn.UpsertPeer] when a peer is upserted with fields that make it no
// longer a relay candidate.
func (r *relayManager) handleServerRemoveRunLoop(nk key.NodePublic) {
if _, ok := r.serversByNodeKey[nk]; !ok {
return
}
delete(r.serversByNodeKey, nk)
r.publishHasServersRunLoop()
}
// publishHasServersRunLoop updates [relayManager.hasPeerRelayServers] to
// reflect whether any relay servers are currently known.
func (r *relayManager) publishHasServersRunLoop() {
r.hasPeerRelayServers.Store(len(r.serversByNodeKey) > 0)
}
type relayDiscoMsgEvent struct {
@@ -330,6 +379,8 @@ func (r *relayManager) init() {
r.newServerEndpointCh = make(chan newRelayServerEndpointEvent)
r.rxDiscoMsgCh = make(chan relayDiscoMsgEvent)
r.serversCh = make(chan set.Set[candidatePeerRelay])
r.serverUpsertCh = make(chan candidatePeerRelay)
r.serverRemoveCh = make(chan key.NodePublic)
r.getServersCh = make(chan chan set.Set[candidatePeerRelay])
r.derpHomeChangeCh = make(chan derpHomeChangeEvent)
r.runLoopStoppedCh = make(chan struct{}, 1)
@@ -436,6 +487,21 @@ func (r *relayManager) handleRelayServersSet(servers set.Set[candidatePeerRelay]
relayManagerInputEvent(r, nil, &r.serversCh, servers)
}
// handleRelayServerUpsert is the O(1) per-peer variant of
// [relayManager.handleRelayServersSet]: it inserts or updates a single
// relay server entry.
func (r *relayManager) handleRelayServerUpsert(cp candidatePeerRelay) {
relayManagerInputEvent(r, nil, &r.serverUpsertCh, cp)
}
// handleRelayServerRemove is the O(1) per-peer variant of
// [relayManager.handleRelayServersSet]: it removes a single relay server
// entry by node key. It is a no-op if nk is not currently a known relay
// server.
func (r *relayManager) handleRelayServerRemove(nk key.NodePublic) {
relayManagerInputEvent(r, nil, &r.serverRemoveCh, nk)
}
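// Reader-side sketch: because only runLoop() writes hasPeerRelayServers,
// hot paths can short-circuit with a single atomic load before paying for
// any channel round-trip, e.g.:
//
//	if !c.relayManager.hasPeerRelayServers.Load() {
//		return // no peer relay servers configured
//	}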
// relayManagerInputEvent initializes [relayManager] if necessary, starts
// relayManager.runLoop() if it is not running, and writes 'event' on 'eventCh'.
//