wgengine/magicsock: shorten process-internal DERP queue

DERP writes go via TCP, and the host OS already provides plenty of
buffer space there. We've observed kernel-side buffers of >2.4MB in the
wild on a backed-up TCP socket. Making the process-internal DERP queue
large on top of that increases the probability that the queued contents
are "dead letters" - packets that have already been assumed lost.

A first step toward improvement is to size this queue only large enough
to avoid some of the initial connect stall problem, but not so large
that it contributes substantially to buffer bloat / dead-letter
retention.

Updates tailscale/corp#31762

Signed-off-by: James Tucker <james@tailscale.com>
James Tucker 2025-08-28 12:00:03 -07:00 committed by James Tucker
parent d42f0b6a21
commit f5d3c59a92
6 changed files with 15 additions and 69 deletions
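
The queue being shortened is a buffered Go channel of pending writes per DERP connection, and writes that arrive while it is full are dropped rather than queued (the removed doc comment's "before we start dropping"). Below is a minimal, hypothetical sketch of that drop-on-full pattern under a fixed depth - not the magicsock implementation; names such as writeReq and sendOrDrop are invented for illustration:

package main

import "fmt"

// derpWriteQueueDepth mirrors the new fixed queue depth from this commit.
const derpWriteQueueDepth = 32

// writeReq is a stand-in for a queued DERP write request (hypothetical).
type writeReq struct {
	payload []byte
}

// sendOrDrop enqueues req if the per-connection queue has room, and
// otherwise drops it immediately, so the in-process backlog never
// exceeds derpWriteQueueDepth packets.
func sendOrDrop(ch chan<- writeReq, req writeReq) bool {
	select {
	case ch <- req:
		return true
	default:
		// Queue full: dropping now is cheaper than retaining a packet
		// that the layers above will soon treat as lost anyway.
		return false
	}
}

func main() {
	ch := make(chan writeReq, derpWriteQueueDepth)
	dropped := 0
	for i := 0; i < 100; i++ {
		if !sendOrDrop(ch, writeReq{payload: []byte{byte(i)}}) {
			dropped++
		}
	}
	fmt.Printf("queued=%d dropped=%d\n", len(ch), dropped)
}

Running it queues 32 requests and drops the remaining 68, which is the backlog bound the fixed depth is meant to enforce.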


@@ -958,7 +958,6 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/depaware)
 tailscale.com/util/syspolicy/rsop from tailscale.com/util/syspolicy+
 tailscale.com/util/syspolicy/setting from tailscale.com/util/syspolicy+
 tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+
-tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock
 tailscale.com/util/systemd from tailscale.com/control/controlclient+
 tailscale.com/util/testenv from tailscale.com/control/controlclient+
 tailscale.com/util/truncate from tailscale.com/logtail


@@ -435,7 +435,6 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/depaware)
 tailscale.com/util/syspolicy/rsop from tailscale.com/util/syspolicy+
 tailscale.com/util/syspolicy/setting from tailscale.com/util/syspolicy+
 tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+
-tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock
 tailscale.com/util/systemd from tailscale.com/control/controlclient+
 tailscale.com/util/testenv from tailscale.com/ipn/ipnlocal+
 tailscale.com/util/truncate from tailscale.com/logtail


@@ -387,7 +387,6 @@ tailscale.com/cmd/tsidp dependencies: (generated by github.com/tailscale/depaware)
 tailscale.com/util/syspolicy/rsop from tailscale.com/ipn/ipnlocal+
 tailscale.com/util/syspolicy/setting from tailscale.com/client/local+
 tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+
-tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock
 tailscale.com/util/systemd from tailscale.com/control/controlclient+
 tailscale.com/util/testenv from tailscale.com/control/controlclient+
 tailscale.com/util/truncate from tailscale.com/logtail


@@ -382,7 +382,6 @@ tailscale.com/tsnet dependencies: (generated by github.com/tailscale/depaware)
 tailscale.com/util/syspolicy/rsop from tailscale.com/ipn/ipnlocal+
 tailscale.com/util/syspolicy/setting from tailscale.com/client/local+
 tailscale.com/util/syspolicy/source from tailscale.com/util/syspolicy+
-tailscale.com/util/sysresources from tailscale.com/wgengine/magicsock
 tailscale.com/util/systemd from tailscale.com/control/controlclient+
 tailscale.com/util/testenv from tailscale.com/control/controlclient+
 tailscale.com/util/truncate from tailscale.com/logtail


@@ -11,9 +11,7 @@ import (
 	"net"
 	"net/netip"
 	"reflect"
-	"runtime"
 	"slices"
-	"sync"
 	"time"
 	"unsafe"
@@ -32,7 +30,6 @@ import (
 	"tailscale.com/types/logger"
 	"tailscale.com/util/mak"
 	"tailscale.com/util/rands"
-	"tailscale.com/util/sysresources"
 	"tailscale.com/util/testenv"
 )
@@ -282,59 +279,20 @@ func (c *Conn) goDerpConnect(regionID int) {
 	go c.derpWriteChanForRegion(regionID, key.NodePublic{})
 }
-var (
-	bufferedDerpWrites     int
-	bufferedDerpWritesOnce sync.Once
-)
-// bufferedDerpWritesBeforeDrop returns how many packets writes can be queued
-// up the DERP client to write on the wire before we start dropping.
-func bufferedDerpWritesBeforeDrop() int {
-	// For mobile devices, always return the previous minimum value of 32;
-	// we can do this outside the sync.Once to avoid that overhead.
-	if runtime.GOOS == "ios" || runtime.GOOS == "android" {
-		return 32
-	}
-	bufferedDerpWritesOnce.Do(func() {
-		// Some rough sizing: for the previous fixed value of 32, the
-		// total consumed memory can be:
-		// = numDerpRegions * messages/region * sizeof(message)
-		//
-		// For sake of this calculation, assume 100 DERP regions; at
-		// time of writing (2023-04-03), we have 24.
-		//
-		// A reasonable upper bound for the worst-case average size of
-		// a message is a *disco.CallMeMaybe message with 16 endpoints;
-		// since sizeof(netip.AddrPort) = 32, that's 512 bytes. Thus:
-		// = 100 * 32 * 512
-		// = 1638400 (1.6MiB)
-		//
-		// On a reasonably-small node with 4GiB of memory that's
-		// connected to each region and handling a lot of load, 1.6MiB
-		// is about 0.04% of the total system memory.
-		//
-		// For sake of this calculation, then, let's double that memory
-		// usage to 0.08% and scale based on total system memory.
-		//
-		// For a 16GiB Linux box, this should buffer just over 256
-		// messages.
-		systemMemory := sysresources.TotalMemory()
-		memoryUsable := float64(systemMemory) * 0.0008
-		const (
-			theoreticalDERPRegions  = 100
-			messageMaximumSizeBytes = 512
-		)
-		bufferedDerpWrites = int(memoryUsable / (theoreticalDERPRegions * messageMaximumSizeBytes))
-		// Never drop below the previous minimum value.
-		if bufferedDerpWrites < 32 {
-			bufferedDerpWrites = 32
-		}
-	})
-	return bufferedDerpWrites
-}
+// derpWriteQueueDepth is the depth of the in-process write queue to a single
+// DERP region. DERP connections are TCP, and so the actual write queue depth is
+// substantially larger than this suggests - often scaling into megabytes
+// depending on dynamic TCP parameters and platform TCP tuning. This queue is
+// excess of the TCP buffer depth, which means it's almost pure buffer bloat,
+// and does not want to be deep - if there are key situations where a node can't
+// keep up, either the TCP link to DERP is too slow, or there is a
+// synchronization issue in the write path, fixes should be focused on those
+// paths, rather than extending this queue.
+// TODO(raggi): make this even shorter, ideally this should be a fairly direct
+// line into a socket TCP buffer. The challenge at present is that connect and
+// reconnect are in the write path and we don't want to block other write
+// operations on those.
+const derpWriteQueueDepth = 32
 // derpWriteChanForRegion returns a channel to which to send DERP packet write
 // requests. It creates a new DERP connection to regionID if necessary.
@@ -429,7 +387,7 @@ func (c *Conn) derpWriteChanForRegion(regionID int, peer key.NodePublic) chan<-
 	dc.DNSCache = dnscache.Get()
 	ctx, cancel := context.WithCancel(c.connCtx)
-	ch := make(chan derpWriteRequest, bufferedDerpWritesBeforeDrop())
+	ch := make(chan derpWriteRequest, derpWriteQueueDepth)
 	ad.c = dc
 	ad.writeCh = ch

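Using the same rough assumptions as the removed sizing comment (100 DERP regions, 512-byte worst-case messages), the new fixed depth keeps the same worst-case in-process footprint as the old floor of 32; what goes away is the memory-scaled growth of roughly 256+ messages per region on a 16GiB box. A back-of-envelope check, with identifiers invented for illustration:

package main

import "fmt"

// Assumptions copied from the removed sizing comment; the identifiers here
// are illustrative and not part of the commit.
const (
	assumedDERPRegions  = 100 // generous upper bound; ~24 regions when the old comment was written
	assumedMaxMsgBytes  = 512 // a *disco.CallMeMaybe with 16 endpoints at 32 bytes each
	derpWriteQueueDepth = 32  // the new fixed per-region queue depth
)

func main() {
	worstCase := assumedDERPRegions * derpWriteQueueDepth * assumedMaxMsgBytes
	fmt.Printf("worst-case in-process DERP queue memory: %d bytes (~%.1f MiB)\n",
		worstCase, float64(worstCase)/(1<<20))
	// Prints 1638400 (~1.6 MiB), the same figure the removed comment derived
	// for the old minimum of 32; larger machines no longer scale this up
	// (the old 0.08%-of-RAM budget allowed 256+ messages on a 16GiB box).
}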

@@ -2137,14 +2137,6 @@ func TestOnNodeViewsUpdateWithNoPeers(t *testing.T) {
 	}
 }
-func TestBufferedDerpWritesBeforeDrop(t *testing.T) {
-	vv := bufferedDerpWritesBeforeDrop()
-	if vv < 32 {
-		t.Fatalf("got bufferedDerpWritesBeforeDrop=%d, which is < 32", vv)
-	}
-	t.Logf("bufferedDerpWritesBeforeDrop = %d", vv)
-}
 // newWireguard starts up a new wireguard-go device attached to a test tun, and
 // returns the device, tun and endpoint port. To add peers call device.IpcSet with UAPI instructions.
 func newWireguard(t *testing.T, uapi string, aips []netip.Prefix) (*device.Device, *tuntest.ChannelTUN, uint16) {