mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-10 22:56:19 +02:00
For large tailnets (~50k+ nodes) with frequent peer churn (ephemeral
GitHub Actions workers etc.), tailscaled used to rebuild the full
netmap and fan it out on the IPN bus on every MapResponse that
added or removed a peer. Two compounding O(N) costs per delta: the
full netmap rebuild + every Notify.NetMap encode to every bus watcher.
This change tackles both:
1. Plumb O(1) peer add/remove through the delta path. PeersChanged
   and PeersRemoved no longer veto the delta path; instead they
   mutate the per-node-backend peer map in place.
2. Restrict ipn.Notify.NetMap emission to the platforms whose host
   GUIs still depend on it (Windows, macOS, iOS) and migrate
   in-tree consumers off it everywhere else:
   - Migrate reactive consumers (containerboot, kube agents,
     sniproxy, tsconsensus, etc.) off Notify.NetMap to the
     previously-added Notify.SelfChange signal so they no longer
     have to subscribe to the full netmap.
   - Add ipn.NotifyNoNetMap so GUI clients on legacy-emit platforms
     that have already migrated can opt out of the per-watcher
     NetMap encode.
   - Gate Notify.NetMap emission on the producer side by a compile-
     time GOOS check, so the supporting code is dead-code-eliminated
     on Linux and other geese where no GUI consumer needs it.
Re-running BenchmarkGiantTailnet from tstest/largetailnet (added, along
with baseline numbers on unmodified main, in ad5436af0d57) shows that
the per-delta cost (one peer add+remove pair) is now ~O(1) regardless
of tailnet size N:
N        no-watcher (ms/op)       bus-watcher (ms/op)
         before  now   factor     before  now   factor
10000    32      0.11  300x       166     0.13  1300x
50000    222     0.11  2000x      865     0.13  6700x
100000   504     0.12  4100x      1765    0.13  13400x
250000   1551    0.12  12500x     4696    0.15  32400x
Updates #12542
Change-Id: I94e34b37331d1a8ec74c299deffadf4d061fda9e
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
242 lines
7.0 KiB
Go
242 lines
7.0 KiB
Go
// Copyright (c) Tailscale Inc & contributors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package ipnlocal
|
|
|
|
import (
|
|
"context"
|
|
"runtime"
|
|
"time"
|
|
|
|
"tailscale.com/ipn"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/tstime"
|
|
"tailscale.com/util/mak"
|
|
)
|
|
|
|
// goosGetsLegacyNetmapNotify is true when tailscaled, built for the current
// GOOS, still populates the legacy [ipn.Notify.NetMap] field on runtime
// (non-initial) bus messages. That is the case only for platforms whose host
// GUIs have not finished moving to the narrower bus signals
// ([ipn.Notify.SelfChange] / [ipn.Notify.PeerChanges]) plus the on-demand
// [LocalClient.NetMap] fetch: macOS, iOS and Windows.
//
// Because runtime.GOOS is a constant, this whole expression is a constant
// too, so code guarded by it can be eliminated at compile time on every
// other GOOS (Linux included).
const goosGetsLegacyNetmapNotify = runtime.GOOS == "darwin" ||
	runtime.GOOS == "ios" ||
	runtime.GOOS == "windows"
|
|
|
|
type rateLimitingBusSender struct {
|
|
fn func(*ipn.Notify) (keepGoing bool)
|
|
lastFlush time.Time // last call to fn, or zero value if none
|
|
interval time.Duration // 0 to flush immediately; non-zero to rate limit sends
|
|
clock tstime.DefaultClock // non-nil for testing
|
|
didSendTestHook func() // non-nil for testing
|
|
|
|
// pending, if non-nil, is the pending notification that we
|
|
// haven't sent yet. We own this memory to mutate.
|
|
pending *ipn.Notify
|
|
|
|
// flushTimer is non-nil if the timer is armed.
|
|
flushTimer tstime.TimerController // effectively a *time.Timer
|
|
flushTimerC <-chan time.Time // ... said ~Timer's C chan
|
|
}
|
|
|
|
func (s *rateLimitingBusSender) close() {
|
|
if s.flushTimer != nil {
|
|
s.flushTimer.Stop()
|
|
}
|
|
}
|
|
|
|
func (s *rateLimitingBusSender) flushChan() <-chan time.Time {
|
|
return s.flushTimerC
|
|
}
|
|
|
|
func (s *rateLimitingBusSender) flush() (keepGoing bool) {
|
|
if n := s.pending; n != nil {
|
|
s.pending = nil
|
|
return s.flushNotify(n)
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (s *rateLimitingBusSender) flushNotify(n *ipn.Notify) (keepGoing bool) {
|
|
s.lastFlush = s.clock.Now()
|
|
return s.fn(n)
|
|
}
|
|
|
|
// send conditionally sends n to the underlying fn, possibly rate
|
|
// limiting it, depending on whether s.interval is set, and whether
|
|
// n is a notable notification that the client (typically a GUI) would
|
|
// want to act on (render) immediately.
|
|
//
|
|
// It returns whether the caller should keep looping.
|
|
//
|
|
// The passed-in memory 'n' is owned by the caller and should
|
|
// not be mutated.
|
|
func (s *rateLimitingBusSender) send(n *ipn.Notify) (keepGoing bool) {
|
|
if s.interval <= 0 {
|
|
// No rate limiting case.
|
|
return s.fn(n)
|
|
}
|
|
if isNotableNotify(n) {
|
|
// Notable notifications are always sent immediately.
|
|
// But first send any boring one that was pending.
|
|
// TODO(bradfitz): there might be a boring one pending
|
|
// with a NetMap or Engine field that is redundant
|
|
// with the new one (n) with NetMap or Engine populated.
|
|
// We should clear the pending one's NetMap/Engine in
|
|
// that case. Or really, merge the two, but mergeBoringNotifies
|
|
// only handles the case of both sides being boring.
|
|
// So for now, flush both.
|
|
if !s.flush() {
|
|
return false
|
|
}
|
|
return s.flushNotify(n)
|
|
}
|
|
s.pending = mergeBoringNotifies(s.pending, n)
|
|
d := s.clock.Now().Sub(s.lastFlush)
|
|
if d > s.interval {
|
|
return s.flush()
|
|
}
|
|
nextFlushIn := s.interval - d
|
|
if s.flushTimer == nil {
|
|
s.flushTimer, s.flushTimerC = s.clock.NewTimer(nextFlushIn)
|
|
} else {
|
|
s.flushTimer.Reset(nextFlushIn)
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (s *rateLimitingBusSender) Run(ctx context.Context, ch <-chan *ipn.Notify) {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case n, ok := <-ch:
|
|
if !ok {
|
|
return
|
|
}
|
|
if !s.send(n) {
|
|
return
|
|
}
|
|
if f := s.didSendTestHook; f != nil {
|
|
f()
|
|
}
|
|
case <-s.flushChan():
|
|
if !s.flush() {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// mergeBoringNotify merges new notify src into possibly-nil dst,
|
|
// either mutating dst or allocating a new one if dst is nil,
|
|
// returning the merged result.
|
|
//
|
|
// dst and src must both be "boring" (i.e. not notable per isNotifiableNotify).
|
|
func mergeBoringNotifies(dst, src *ipn.Notify) *ipn.Notify {
|
|
if dst == nil {
|
|
dst = &ipn.Notify{Version: src.Version}
|
|
}
|
|
if goosGetsLegacyNetmapNotify && src.NetMap != nil {
|
|
// Full netmap supersedes any accumulated peer-change deltas.
|
|
dst.NetMap = src.NetMap
|
|
dst.PeerChangedPatch = nil
|
|
} else if src.PeerChangedPatch != nil {
|
|
dst.PeerChangedPatch = mergePeerChangedPatch(dst.PeerChangedPatch, src.PeerChangedPatch)
|
|
}
|
|
if len(src.PeersChanged) > 0 {
|
|
dst.PeersChanged = append(dst.PeersChanged, src.PeersChanged...)
|
|
}
|
|
if len(src.PeersRemoved) > 0 {
|
|
dst.PeersRemoved = append(dst.PeersRemoved, src.PeersRemoved...)
|
|
}
|
|
for id, up := range src.UserProfiles {
|
|
mak.Set(&dst.UserProfiles, id, up)
|
|
}
|
|
if src.Engine != nil {
|
|
dst.Engine = src.Engine
|
|
}
|
|
return dst
|
|
}
|
|
|
|
// mergePeerChangedPatch merges new peer-changed patches from src into dst,
|
|
// either mutating dst or allocating a new slice if dst is nil, returning the
|
|
// merged result. Values in src override those in dst for the same NodeID.
|
|
func mergePeerChangedPatch(dst, src []*tailcfg.PeerChange) []*tailcfg.PeerChange {
|
|
idxByNode := make(map[tailcfg.NodeID]int, len(dst))
|
|
for i, d := range dst {
|
|
idxByNode[d.NodeID] = i
|
|
}
|
|
|
|
for _, nd := range src {
|
|
if oi, ok := idxByNode[nd.NodeID]; ok {
|
|
dst[oi] = mergePeerChangeForIpnBus(dst[oi], nd)
|
|
continue
|
|
}
|
|
idxByNode[nd.NodeID] = len(dst)
|
|
dst = append(dst, nd)
|
|
}
|
|
return dst
|
|
}
|
|
|
|
// mergePeerChangeForIpnBus merges new with old, returning the result.
|
|
// Fields set in new override those in old; fields only set in old are preserved.
|
|
func mergePeerChangeForIpnBus(old, new *tailcfg.PeerChange) *tailcfg.PeerChange {
|
|
merged := *old
|
|
|
|
// This is a subset of PeerChange that reflects only the fields that can
|
|
// be changed via a NodeMutation. If future fields can be updated via
|
|
// NodeMutations from map responses (and they are relevant to the ipn bus), then
|
|
// they should be added here and merged in the same way.
|
|
if new.DERPRegion != 0 {
|
|
// netmap.NodeMutationDerpHome
|
|
merged.DERPRegion = new.DERPRegion
|
|
}
|
|
if new.Online != nil {
|
|
// netmap.NodeMutationOnline
|
|
merged.Online = new.Online
|
|
}
|
|
if new.LastSeen != nil {
|
|
// netmap.NodeMutationLastSeen
|
|
merged.LastSeen = new.LastSeen
|
|
}
|
|
if new.Endpoints != nil {
|
|
// netmap.NodeMutationEndpoints
|
|
merged.Endpoints = new.Endpoints
|
|
}
|
|
|
|
return &merged
|
|
}
|
|
|
|
// isNotableNotify reports whether n is a "notable" notification that
|
|
// should be sent on the IPN bus immediately (e.g. to GUIs) without
|
|
// rate limiting it for a few seconds.
|
|
//
|
|
// PeerChanges and Engine are the only "boring" (rate-limitable) fields.
|
|
func isNotableNotify(n *ipn.Notify) bool {
|
|
if n == nil {
|
|
return false
|
|
}
|
|
return n.State != nil ||
|
|
n.SessionID != "" ||
|
|
n.BrowseToURL != nil ||
|
|
n.LocalTCPPort != nil ||
|
|
n.ClientVersion != nil ||
|
|
n.Prefs != nil ||
|
|
n.ErrMessage != nil ||
|
|
n.LoginFinished != nil ||
|
|
n.SelfChange != nil ||
|
|
n.InitialStatus != nil ||
|
|
!n.DriveShares.IsNil() ||
|
|
n.Health != nil ||
|
|
len(n.IncomingFiles) > 0 ||
|
|
len(n.OutgoingFiles) > 0 ||
|
|
n.FilesWaiting != nil ||
|
|
n.SuggestedExitNode != nil
|
|
}
|