diff --git a/cmd/k8s-operator/depaware.txt b/cmd/k8s-operator/depaware.txt
index f12915d78..d8bca1130 100644
--- a/cmd/k8s-operator/depaware.txt
+++ b/cmd/k8s-operator/depaware.txt
@@ -301,7 +301,6 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/
 gvisor.dev/gvisor/pkg/tcpip/header from gvisor.dev/gvisor/pkg/tcpip/header/parse+
 gvisor.dev/gvisor/pkg/tcpip/header/parse from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
 gvisor.dev/gvisor/pkg/tcpip/internal/tcp from gvisor.dev/gvisor/pkg/tcpip/stack+
-gvisor.dev/gvisor/pkg/tcpip/link/channel from tailscale.com/wgengine/netstack
 gvisor.dev/gvisor/pkg/tcpip/network/hash from gvisor.dev/gvisor/pkg/tcpip/network/ipv4
 gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
 gvisor.dev/gvisor/pkg/tcpip/network/internal/ip from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt
index 3b6ea0002..f6edbe7d7 100644
--- a/cmd/tailscaled/depaware.txt
+++ b/cmd/tailscaled/depaware.txt
@@ -212,7 +212,6 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
 gvisor.dev/gvisor/pkg/tcpip/header from gvisor.dev/gvisor/pkg/tcpip/header/parse+
 gvisor.dev/gvisor/pkg/tcpip/header/parse from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
 gvisor.dev/gvisor/pkg/tcpip/internal/tcp from gvisor.dev/gvisor/pkg/tcpip/stack+
-gvisor.dev/gvisor/pkg/tcpip/link/channel from tailscale.com/wgengine/netstack
 gvisor.dev/gvisor/pkg/tcpip/network/hash from gvisor.dev/gvisor/pkg/tcpip/network/ipv4
 gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
 gvisor.dev/gvisor/pkg/tcpip/network/internal/ip from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
diff --git a/go.mod b/go.mod
index 6e9f036f3..4ec098a71 100644
--- a/go.mod
+++ b/go.mod
@@ -80,7 +80,7 @@ require (
 	github.com/tailscale/peercred v0.0.0-20240214030740-b535050b2aa4
 	github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1
 	github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6
-	github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1
+	github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b
 	github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e
 	github.com/tc-hib/winres v0.2.1
 	github.com/tcnksm/go-httpstat v0.2.0
diff --git a/go.sum b/go.sum
index 66a091992..9758ff885 100644
--- a/go.sum
+++ b/go.sum
@@ -934,8 +934,8 @@ github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1 h1:t
 github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1/go.mod h1:agQPE6y6ldqCOui2gkIh7ZMztTkIQKH049tv8siLuNQ=
 github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6 h1:l10Gi6w9jxvinoiq15g8OToDdASBni4CyJOdHY1Hr8M=
 github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6/go.mod h1:ZXRML051h7o4OcI0d3AaILDIad/Xw0IkXaHM17dic1Y=
-github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1 h1:ycpNCSYwzZ7x4G4ioPNtKQmIY0G/3o4pVf8wCZq6blY=
-github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1/go.mod h1:BOm5fXUBFM+m9woLNBoxI9TaBXXhGNP50LX/TGIvGb4=
+github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b h1:8U9NaPB32iFoNjJ+H/yPkAVqXw/dudtj+fLTE4edF+Q=
+github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b/go.mod h1:BOm5fXUBFM+m9woLNBoxI9TaBXXhGNP50LX/TGIvGb4=
 github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e h1:zOGKqN5D5hHhiYUp091JqK7DPCqSARyUfduhGUY8Bek=
 github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e/go.mod h1:orPd6JZXXRyuDusYilywte7k094d7dycXXU5YnWsrwg=
 github.com/tc-hib/winres v0.2.1 h1:YDE0FiP0VmtRaDn7+aaChp1KiF4owBiJa5l964l5ujA=
diff --git a/net/tstun/wrap.go b/net/tstun/wrap.go
index 8ea73b4b2..44b07b1ff 100644
--- a/net/tstun/wrap.go
+++ b/net/tstun/wrap.go
@@ -10,6 +10,7 @@
 	"net/netip"
 	"os"
 	"reflect"
+	"runtime"
 	"slices"
 	"strings"
 	"sync"
@@ -17,6 +18,7 @@
 	"time"
 
 	"github.com/gaissmai/bart"
+	"github.com/tailscale/wireguard-go/conn"
 	"github.com/tailscale/wireguard-go/device"
 	"github.com/tailscale/wireguard-go/tun"
 	"go4.org/mem"
@@ -894,13 +896,7 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
 		return 0, res.err
 	}
 	if res.data == nil {
-		n, err := t.injectedRead(res.injected, buffs[0], offset)
-		sizes[0] = n
-		if err != nil && n == 0 {
-			return 0, err
-		}
-
-		return 1, err
+		return t.injectedRead(res.injected, buffs, sizes, offset)
 	}
 
 	metricPacketOut.Add(int64(len(res.data)))
@@ -955,27 +951,85 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
 	return buffsPos, res.err
 }
 
+const (
+	minTCPHeaderSize = 20
+)
+
+func stackGSOToTunGSO(pkt []byte, gso stack.GSO) (tun.GSOOptions, error) {
+	options := tun.GSOOptions{
+		CsumStart:  gso.L3HdrLen,
+		CsumOffset: gso.CsumOffset,
+		GSOSize:    gso.MSS,
+		NeedsCsum:  gso.NeedsCsum,
+	}
+	switch gso.Type {
+	case stack.GSONone:
+		options.GSOType = tun.GSONone
+		return options, nil
+	case stack.GSOTCPv4:
+		options.GSOType = tun.GSOTCPv4
+	case stack.GSOTCPv6:
+		options.GSOType = tun.GSOTCPv6
+	default:
+		return tun.GSOOptions{}, fmt.Errorf("unsupported gVisor GSOType: %v", gso.Type)
+	}
+	// options.HdrLen is the length of layers 3 and 4 together, whereas
+	// gVisor only gives us the layer 3 length, so we have to compute the
+	// TCP header length ourselves.
+	if len(pkt) < int(gso.L3HdrLen)+minTCPHeaderSize {
+		return tun.GSOOptions{}, errors.New("gVisor GSO TCP packet length too short")
+	}
+	tcphLen := uint16(pkt[int(gso.L3HdrLen)+12] >> 4 * 4)
+	options.HdrLen = gso.L3HdrLen + tcphLen
+	return options, nil
+}
+
+func invertGSOChecksum(pkt []byte, gso stack.GSO) {
+	if !gso.NeedsCsum {
+		return
+	}
+	at := int(gso.L3HdrLen + gso.CsumOffset)
+	if at+1 >= len(pkt) {
+		return
+	}
+	pkt[at] = ^pkt[at]
+	pkt[at+1] = ^pkt[at+1]
+}
+
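The Data Offset decode in stackGSOToTunGSO above is dense; a minimal standalone sketch of the same decode (not part of this patch; tcpHeaderLen is a hypothetical helper). Byte 12 of the TCP header carries the 4-bit Data Offset in its high nibble, counted in 32-bit words, and Go's >> and * share precedence and associate left, so pkt[...+12] >> 4 * 4 is (pkt[...+12] >> 4) * 4:

	// tcpHeaderLen is a hypothetical helper equivalent to the inline
	// expression in stackGSOToTunGSO: decode the TCP Data Offset field
	// (high nibble of byte 12, in 32-bit words) into a byte count.
	func tcpHeaderLen(tcpHdr []byte) uint16 {
		return uint16(tcpHdr[12]>>4) * 4
	}

	// A header with no options has Data Offset 5, i.e. 0x50 in byte 12,
	// so tcpHeaderLen returns (0x50>>4)*4 = 20, the minimum TCP header size.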
 // injectedRead handles injected reads, which bypass filters.
-func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int, error) {
-	metricPacketOut.Add(1)
+func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []int, offset int) (n int, err error) {
+	var gso stack.GSO
 
-	var n int
-	if !res.packet.IsNil() {
-
-		n = copy(buf[offset:], res.packet.NetworkHeader().Slice())
-		n += copy(buf[offset+n:], res.packet.TransportHeader().Slice())
-		n += copy(buf[offset+n:], res.packet.Data().AsRange().ToSlice())
-		res.packet.DecRef()
+	pkt := outBuffs[0][offset:]
+	if res.packet != nil {
+		bufN := copy(pkt, res.packet.NetworkHeader().Slice())
+		bufN += copy(pkt[bufN:], res.packet.TransportHeader().Slice())
+		bufN += copy(pkt[bufN:], res.packet.Data().AsRange().ToSlice())
+		gso = res.packet.GSOOptions
+		pkt = pkt[:bufN]
+		defer res.packet.DecRef() // defer DecRef so we may continue to reference it
 	} else {
-		n = copy(buf[offset:], res.data)
+		sizes[0] = copy(pkt, res.data)
+		pkt = pkt[:sizes[0]]
+		n = 1
 	}
 
 	pc := t.peerConfig.Load()
 
 	p := parsedPacketPool.Get().(*packet.Parsed)
 	defer parsedPacketPool.Put(p)
-	p.Decode(buf[offset : offset+n])
+	p.Decode(pkt)
+
+	// We invert the transport layer checksum before and after snat() if
+	// gVisor handed us a segment with a partial checksum. A partial checksum
+	// is not the ones' complement of the sum, and the incremental checksum
+	// updating that may occur as a result of snat() is unaware of this.
+	// Alternatively, we could plumb partial transport layer checksum
+	// awareness down through snat(), but the surface area of such a change
+	// is much larger, and not yet justified by this singular case.
+	invertGSOChecksum(pkt, gso)
 	pc.snat(p)
+	invertGSOChecksum(pkt, gso)
 
 	if m := t.destIPActivity.Load(); m != nil {
 		if fn := m[p.Dst.Addr()]; fn != nil {
@@ -983,11 +1037,24 @@ func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int
 		}
 	}
 
-	if stats := t.stats.Load(); stats != nil {
-		stats.UpdateTxVirtual(buf[offset:][:n])
+	if res.packet != nil {
+		var gsoOptions tun.GSOOptions
+		gsoOptions, err = stackGSOToTunGSO(pkt, gso)
+		if err != nil {
+			return 0, err
+		}
+		n, err = tun.GSOSplit(pkt, gsoOptions, outBuffs, sizes, offset)
 	}
+
+	if stats := t.stats.Load(); stats != nil {
+		for i := 0; i < n; i++ {
+			stats.UpdateTxVirtual(outBuffs[i][offset : offset+sizes[i]])
+		}
+	}
+
 	t.noteActivity()
-	return n, nil
+	metricPacketOut.Add(int64(n))
+	return n, err
 }
 
 func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook capture.Callback, pc *peerConfigTable) filter.Response {
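For background on the inversion around snat() above: a partial checksum holds the uncomplemented pseudo-header sum, while RFC 1624-style incremental updates assume the field holds the final, complemented checksum. Complementing the field first makes that math valid; complementing again afterwards restores the partial form. A sketch of the incremental update in question, with rfc1624Update as a hypothetical name (the real update lives inside snat()'s checksum helpers):

	// rfc1624Update recomputes a ones' complement checksum HC when a 16-bit
	// field changes from old to new: HC' = ~(~HC + ~old + new) (RFC 1624,
	// eqn. 3). This is only correct if hc is already complemented, which a
	// partial checksum is not -- hence invertGSOChecksum above.
	func rfc1624Update(hc, old, new uint16) uint16 {
		sum := uint32(^hc) + uint32(^old) + uint32(new)
		for sum>>16 != 0 {
			sum = sum&0xffff + sum>>16 // fold the carry back in
		}
		return ^uint16(sum)
	}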
@@ -1288,6 +1355,14 @@ func (t *Wrapper) InjectOutboundPacketBuffer(pkt *stack.PacketBuffer) error {
 }
 
 func (t *Wrapper) BatchSize() int {
+	if runtime.GOOS == "linux" {
+		// Always set up Linux to handle vectors, even in the very rare case
+		// that the underlying t.tdev returns 1. gVisor GSO is always enabled
+		// for Linux, and we cannot make a determination about gVisor usage
+		// at wireguard-go.Device startup, which is when this value matters
+		// for packet memory init.
+		return conn.IdealBatchSize
+	}
 	return t.tdev.BatchSize()
 }
diff --git a/wgengine/netstack/link_endpoint.go b/wgengine/netstack/link_endpoint.go
new file mode 100644
index 000000000..77d6075ca
--- /dev/null
+++ b/wgengine/netstack/link_endpoint.go
@@ -0,0 +1,256 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package netstack
+
+import (
+	"context"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type queue struct {
+	// TODO(jwhited): evaluate performance with mu as a Mutex and/or an
+	// alternative non-channel buffer.
+	c      chan *stack.PacketBuffer
+	mu     sync.RWMutex // mu guards closed
+	closed bool
+}
+
+func (q *queue) Close() {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if !q.closed {
+		close(q.c)
+	}
+	q.closed = true
+}
+
+func (q *queue) Read() *stack.PacketBuffer {
+	select {
+	case p := <-q.c:
+		return p
+	default:
+		return nil
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) *stack.PacketBuffer {
+	select {
+	case pkt := <-q.c:
+		return pkt
+	case <-ctx.Done():
+		return nil
+	}
+}
+
+func (q *queue) Write(pkt *stack.PacketBuffer) tcpip.Error {
+	// q holds a reference to the PacketBuffer while it sits in the channel.
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	if q.closed {
+		return &tcpip.ErrClosedForSend{}
+	}
+
+	wrote := false
+	select {
+	case q.c <- pkt.IncRef():
+		wrote = true
+	default:
+		// TODO(jwhited): reconsider/count
+		pkt.DecRef()
+	}
+
+	if wrote {
+		return nil
+	}
+	return &tcpip.ErrNoBufferSpace{}
+}
+
+func (q *queue) Num() int {
+	return len(q.c)
+}
+
+var _ stack.LinkEndpoint = (*linkEndpoint)(nil)
+var _ stack.GSOEndpoint = (*linkEndpoint)(nil)
+
+// linkEndpoint implements stack.LinkEndpoint and stack.GSOEndpoint. Outbound
+// packets written by gVisor towards Tailscale are stored in a channel.
+// Inbound is fed to gVisor via InjectInbound. This is loosely modeled after
+// gvisor.dev/gvisor/pkg/tcpip/link/channel.Endpoint.
+type linkEndpoint struct {
+	LinkEPCapabilities stack.LinkEndpointCapabilities
+	SupportedGSOKind   stack.SupportedGSO
+
+	mu         sync.RWMutex // mu guards the following fields
+	dispatcher stack.NetworkDispatcher
+	linkAddr   tcpip.LinkAddress
+	mtu        uint32
+
+	q *queue // outbound
+}
+
+func newLinkEndpoint(size int, mtu uint32, linkAddr tcpip.LinkAddress) *linkEndpoint {
+	return &linkEndpoint{
+		q: &queue{
+			c: make(chan *stack.PacketBuffer, size),
+		},
+		mtu:      mtu,
+		linkAddr: linkAddr,
+	}
+}
+
+// Close closes l. Further packet writes will return an error, and all pending
+// packets are discarded. Close may be called concurrently with WritePackets.
+func (l *linkEndpoint) Close() {
+	l.q.Close()
+	l.Drain()
+}
+
+// Read does a non-blocking read of one packet from the outbound packet queue.
+func (l *linkEndpoint) Read() *stack.PacketBuffer {
+	return l.q.Read()
+}
+
+// ReadContext does a blocking read of one packet from the outbound packet
+// queue. It can be canceled by ctx, in which case it returns nil.
+func (l *linkEndpoint) ReadContext(ctx context.Context) *stack.PacketBuffer {
+	return l.q.ReadContext(ctx)
+}
+
+// Drain removes all outbound packets from the channel and counts them.
+func (l *linkEndpoint) Drain() int {
+	c := 0
+	for pkt := l.Read(); pkt != nil; pkt = l.Read() {
+		pkt.DecRef()
+		c++
+	}
+	return c
+}
+
+// NumQueued returns the number of packets queued for outbound.
+func (l *linkEndpoint) NumQueued() int {
+	return l.q.Num()
+}
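To make the read side concrete, a hedged sketch of a consumer for this outbound queue; the pump name and the injectOutbound callback are illustrative, not from this patch:

	// pump drains l's outbound queue until ctx is canceled, handing each
	// packet back toward the TUN path. This sketch assumes injectOutbound
	// takes ownership of (and eventually DecRefs) the PacketBuffer.
	func pump(ctx context.Context, l *linkEndpoint, injectOutbound func(*stack.PacketBuffer) error) {
		for {
			pkt := l.ReadContext(ctx)
			if pkt == nil {
				return // ctx canceled
			}
			if err := injectOutbound(pkt); err != nil {
				return
			}
		}
	}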
+
+// InjectInbound injects an inbound packet. If the endpoint is not attached,
+// the packet is not delivered.
+func (l *linkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	l.mu.RLock()
+	d := l.dispatcher
+	l.mu.RUnlock()
+	if d != nil {
+		d.DeliverNetworkPacket(protocol, pkt)
+	}
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (l *linkEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	l.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (l *linkEndpoint) IsAttached() bool {
+	l.mu.RLock()
+	defer l.mu.RUnlock()
+	return l.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU.
+func (l *linkEndpoint) MTU() uint32 {
+	l.mu.RLock()
+	defer l.mu.RUnlock()
+	return l.mtu
+}
+
+// SetMTU implements stack.LinkEndpoint.SetMTU.
+func (l *linkEndpoint) SetMTU(mtu uint32) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	l.mtu = mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (l *linkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return l.LinkEPCapabilities
+}
+
+// GSOMaxSize implements stack.GSOEndpoint.
+func (*linkEndpoint) GSOMaxSize() uint32 {
+	// This is an increase from the 32k returned by
+	// channel.Endpoint.GSOMaxSize() to 64k, which improves throughput.
+	return (1 << 16) - 1
+}
+
+// SupportedGSO implements stack.GSOEndpoint.
+func (l *linkEndpoint) SupportedGSO() stack.SupportedGSO {
+	return l.SupportedGSOKind
+}
+
+// MaxHeaderLength returns the maximum size of the link-layer header. Since
+// this endpoint has no link-layer header, it returns 0.
+func (*linkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (l *linkEndpoint) LinkAddress() tcpip.LinkAddress {
+	l.mu.RLock()
+	defer l.mu.RUnlock()
+	return l.linkAddr
+}
+
+// SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress.
+func (l *linkEndpoint) SetLinkAddress(addr tcpip.LinkAddress) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	l.linkAddr = addr
+}
+
+// WritePackets stores outbound packets into the channel. Multiple concurrent
+// calls are permitted.
+func (l *linkEndpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
+	n := 0
+	// TODO(jwhited): evaluate writing a stack.PacketBufferList instead of a
+	// single packet. We can split 2 x 64K GSO across
+	// wireguard-go/conn.IdealBatchSize (128 slots) @ 1280 MTU, and non-GSO we
+	// could do more. The Read API would need to change to take advantage.
+	// Verify gVisor's limits around the max number of segments packed
+	// together. Since we control the MTU (and in effect the TCP MSS in
+	// gVisor) we *shouldn't* expect to ever overflow 128 slots (see
+	// wireguard-go/tun.ErrTooManySegments usage).
+	for _, pkt := range pkts.AsSlice() {
+		if err := l.q.Write(pkt); err != nil {
+			if _, ok := err.(*tcpip.ErrNoBufferSpace); !ok && n == 0 {
+				return 0, err
+			}
+			break
+		}
+		n++
+	}
+
+	return n, nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*linkEndpoint) Wait() {}
+
+// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
+func (*linkEndpoint) ARPHardwareType() header.ARPHardwareType {
+	return header.ARPHardwareNone
+}
+
+// AddHeader implements stack.LinkEndpoint.AddHeader.
+func (*linkEndpoint) AddHeader(*stack.PacketBuffer) {}
+
+// ParseHeader implements stack.LinkEndpoint.ParseHeader.
+func (*linkEndpoint) ParseHeader(*stack.PacketBuffer) bool { return true }
+
+// SetOnCloseAction implements stack.LinkEndpoint.
+func (*linkEndpoint) SetOnCloseAction(func()) {}
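A quick sanity check of the slot math in the WritePackets TODO above, under assumed header sizes (illustrative only, not part of this patch):

	// At a 1280-byte MTU with assumed IPv6 (40B) + TCP (20B) headers, the
	// MSS is 1220B, so one 64KiB GSO packet splits into at most
	// ceil(65535/1220) = 54 segments; two such packets need ~108 slots,
	// comfortably under wireguard-go/conn.IdealBatchSize (128).
	const (
		assumedMTU    = 1280
		assumedHdrLen = 40 + 20
		assumedMSS    = assumedMTU - assumedHdrLen
		gsoMaxSize    = 1<<16 - 1 // matches linkEndpoint.GSOMaxSize
	)

	func maxSegmentsPerGSOPacket() int {
		return (gsoMaxSize + assumedMSS - 1) / assumedMSS // 54
	}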
diff --git a/wgengine/netstack/netstack.go b/wgengine/netstack/netstack.go
index b86a98a43..c575607cb 100644
--- a/wgengine/netstack/netstack.go
+++ b/wgengine/netstack/netstack.go
@@ -26,7 +26,6 @@
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -176,7 +175,7 @@ type Impl struct {
 	ProcessSubnets bool
 
 	ipstack *stack.Stack
-	linkEP  *channel.Endpoint
+	linkEP  *linkEndpoint
 	tundev  *tstun.Wrapper
 	e       wgengine.Engine
 	pm      *proxymap.Mapper
@@ -285,7 +284,11 @@ func Create(logf logger.Logf, tundev *tstun.Wrapper, e wgengine.Engine, mc *magi
 			return nil, fmt.Errorf("could not disable TCP RACK: %v", tcpipErr)
 		}
 	}
-	linkEP := channel.New(512, uint32(tstun.DefaultTUNMTU()), "")
+	linkEP := newLinkEndpoint(512, uint32(tstun.DefaultTUNMTU()), "")
+	if runtime.GOOS == "linux" {
+		// TODO(jwhited): add Windows support https://github.com/tailscale/corp/issues/21874
+		linkEP.SupportedGSOKind = stack.HostGSOSupported
+	}
 	if tcpipProblem := ipstack.CreateNIC(nicID, linkEP); tcpipProblem != nil {
 		return nil, fmt.Errorf("could not create netstack NIC: %v", tcpipProblem)
 	}