From f5d3c59a925b2f0ea249a32ddc0decdb43ff7ee9 Mon Sep 17 00:00:00 2001 From: James Tucker Date: Thu, 28 Aug 2025 12:00:03 -0700 Subject: [PATCH] wgengine/magicsock: shorten process internal DERP queue DERP writes go via TCP and the host OS will have plenty of buffer space. We've observed in the wild with a backed up TCP socket kernel side buffers of >2.4MB. The DERP internal queue being larger causes an increase in the probability that the contents of the backbuffer are "dead letters" - packets that were assumed to be lost. A first step to improvement is to size this queue only large enough to avoid some of the initial connect stall problem, but not large enough that it is contributing in a substantial way to buffer bloat / dead-letter retention. Updates tailscale/corp#31762 Signed-off-by: James Tucker --- cmd/k8s-operator/depaware.txt | 1 - cmd/tailscaled/depaware.txt | 1 - cmd/tsidp/depaware.txt | 1 - tsnet/depaware.txt | 1 - wgengine/magicsock/derp.go | 72 ++++++---------------------- wgengine/magicsock/magicsock_test.go | 8 ---- 6 files changed, 15 insertions(+), 69 deletions(-) diff --git a/cmd/k8s-operator/depaware.txt b/cmd/k8s-operator/depaware.txt index 843ce27f2..4b1e4a1e4 100644 --- a/cmd/k8s-operator/depaware.txt +++ b/cmd/k8s-operator/depaware.txt @@ -958,7 +958,6 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/ tailscale.com/util/syspolicy/rsop from tailscale.com/util/syspolicy+ tailscale.com/util/syspolicy/setting from tailscale.com/util/syspolicy+ tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+ - tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock tailscale.com/util/systemd from tailscale.com/control/controlclient+ tailscale.com/util/testenv from tailscale.com/control/controlclient+ tailscale.com/util/truncate from tailscale.com/logtail diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index fdc48718c..c2d9f3d00 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -435,7 +435,6 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/util/syspolicy/rsop from tailscale.com/util/syspolicy+ tailscale.com/util/syspolicy/setting from tailscale.com/util/syspolicy+ tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+ - tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock tailscale.com/util/systemd from tailscale.com/control/controlclient+ tailscale.com/util/testenv from tailscale.com/ipn/ipnlocal+ tailscale.com/util/truncate from tailscale.com/logtail diff --git a/cmd/tsidp/depaware.txt b/cmd/tsidp/depaware.txt index 503454f50..e8bc2b254 100644 --- a/cmd/tsidp/depaware.txt +++ b/cmd/tsidp/depaware.txt @@ -387,7 +387,6 @@ tailscale.com/cmd/tsidp dependencies: (generated by github.com/tailscale/depawar tailscale.com/util/syspolicy/rsop from tailscale.com/ipn/ipnlocal+ tailscale.com/util/syspolicy/setting from tailscale.com/client/local+ tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+ - tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock tailscale.com/util/systemd from tailscale.com/control/controlclient+ tailscale.com/util/testenv from tailscale.com/control/controlclient+ tailscale.com/util/truncate from tailscale.com/logtail diff --git a/tsnet/depaware.txt b/tsnet/depaware.txt index b490fcbca..aea6baf93 100644 --- a/tsnet/depaware.txt +++ b/tsnet/depaware.txt @@ -382,7 +382,6 @@ tailscale.com/tsnet dependencies: (generated by github.com/tailscale/depaware) tailscale.com/util/syspolicy/rsop from tailscale.com/ipn/ipnlocal+ tailscale.com/util/syspolicy/setting from tailscale.com/client/local+ tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+ - tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock tailscale.com/util/systemd from tailscale.com/control/controlclient+ tailscale.com/util/testenv from tailscale.com/control/controlclient+ tailscale.com/util/truncate from tailscale.com/logtail diff --git a/wgengine/magicsock/derp.go b/wgengine/magicsock/derp.go index 5afdbc6d8..9c60e4893 100644 --- a/wgengine/magicsock/derp.go +++ b/wgengine/magicsock/derp.go @@ -11,9 +11,7 @@ import ( "net" "net/netip" "reflect" - "runtime" "slices" - "sync" "time" "unsafe" @@ -32,7 +30,6 @@ import ( "tailscale.com/types/logger" "tailscale.com/util/mak" "tailscale.com/util/rands" - "tailscale.com/util/sysresources" "tailscale.com/util/testenv" ) @@ -282,59 +279,20 @@ func (c *Conn) goDerpConnect(regionID int) { go c.derpWriteChanForRegion(regionID, key.NodePublic{}) } -var ( - bufferedDerpWrites int - bufferedDerpWritesOnce sync.Once -) - -// bufferedDerpWritesBeforeDrop returns how many packets writes can be queued -// up the DERP client to write on the wire before we start dropping. -func bufferedDerpWritesBeforeDrop() int { - // For mobile devices, always return the previous minimum value of 32; - // we can do this outside the sync.Once to avoid that overhead. - if runtime.GOOS == "ios" || runtime.GOOS == "android" { - return 32 - } - - bufferedDerpWritesOnce.Do(func() { - // Some rough sizing: for the previous fixed value of 32, the - // total consumed memory can be: - // = numDerpRegions * messages/region * sizeof(message) - // - // For sake of this calculation, assume 100 DERP regions; at - // time of writing (2023-04-03), we have 24. - // - // A reasonable upper bound for the worst-case average size of - // a message is a *disco.CallMeMaybe message with 16 endpoints; - // since sizeof(netip.AddrPort) = 32, that's 512 bytes. Thus: - // = 100 * 32 * 512 - // = 1638400 (1.6MiB) - // - // On a reasonably-small node with 4GiB of memory that's - // connected to each region and handling a lot of load, 1.6MiB - // is about 0.04% of the total system memory. - // - // For sake of this calculation, then, let's double that memory - // usage to 0.08% and scale based on total system memory. - // - // For a 16GiB Linux box, this should buffer just over 256 - // messages. - systemMemory := sysresources.TotalMemory() - memoryUsable := float64(systemMemory) * 0.0008 - - const ( - theoreticalDERPRegions = 100 - messageMaximumSizeBytes = 512 - ) - bufferedDerpWrites = int(memoryUsable / (theoreticalDERPRegions * messageMaximumSizeBytes)) - - // Never drop below the previous minimum value. - if bufferedDerpWrites < 32 { - bufferedDerpWrites = 32 - } - }) - return bufferedDerpWrites -} +// derpWriteQueueDepth is the depth of the in-process write queue to a single +// DERP region. DERP connections are TCP, and so the actual write queue depth is +// substantially larger than this suggests - often scaling into megabytes +// depending on dynamic TCP parameters and platform TCP tuning. This queue is +// excess of the TCP buffer depth, which means it's almost pure buffer bloat, +// and does not want to be deep - if there are key situations where a node can't +// keep up, either the TCP link to DERP is too slow, or there is a +// synchronization issue in the write path, fixes should be focused on those +// paths, rather than extending this queue. +// TODO(raggi): make this even shorter, ideally this should be a fairly direct +// line into a socket TCP buffer. The challenge at present is that connect and +// reconnect are in the write path and we don't want to block other write +// operations on those. +const derpWriteQueueDepth = 32 // derpWriteChanForRegion returns a channel to which to send DERP packet write // requests. It creates a new DERP connection to regionID if necessary. @@ -429,7 +387,7 @@ func (c *Conn) derpWriteChanForRegion(regionID int, peer key.NodePublic) chan<- dc.DNSCache = dnscache.Get() ctx, cancel := context.WithCancel(c.connCtx) - ch := make(chan derpWriteRequest, bufferedDerpWritesBeforeDrop()) + ch := make(chan derpWriteRequest, derpWriteQueueDepth) ad.c = dc ad.writeCh = ch diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index 5e348b02b..5774432d5 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -2137,14 +2137,6 @@ func TestOnNodeViewsUpdateWithNoPeers(t *testing.T) { } } -func TestBufferedDerpWritesBeforeDrop(t *testing.T) { - vv := bufferedDerpWritesBeforeDrop() - if vv < 32 { - t.Fatalf("got bufferedDerpWritesBeforeDrop=%d, which is < 32", vv) - } - t.Logf("bufferedDerpWritesBeforeDrop = %d", vv) -} - // newWireguard starts up a new wireguard-go device attached to a test tun, and // returns the device, tun and endpoint port. To add peers call device.IpcSet with UAPI instructions. func newWireguard(t *testing.T, uapi string, aips []netip.Prefix) (*device.Device, *tuntest.ChannelTUN, uint16) {