wgengine/netstack: absorb all quad-100 traffic locally, never leak to peers

Previously, handleLocalPackets intercepted traffic to the Tailscale
service IP (100.100.100.100 / fd7a:115c:a1e0::53) only for an allow-list
of ports: TCP 53/80/8080 and UDP 53. Any other port returned
filter.Accept, letting the packet fall through to the ACL filter and
wireguard-go, which would attempt a peer lookup. No peer owns the
quad-100 AllowedIP, so after ~5s pendopen.go would log:

    open-conn-track: timeout opening ...; no associated peer node

This is the common "conntrack error no peer found for 100.100.100.100:853"
log spam seen in the wild (e.g. from systemd-resolved or another
resolver speculatively trying DoT on quad-100). It also leaks quad-100
packets onto the tailnet.

Remove the port allow-list so handleLocalPackets absorbs every quad-100
packet into netstack regardless of IP protocol or port. Traffic never
reaches the conntrack / peer-routing layers.

With the allow-list gone, acceptTCP needs a corresponding guard: on a
quad-100 TCP port we don't serve, execution used to fall through to the
isTailscaleIP case (quad-100 is in the tailscale IP range), which
rewrote the dial target to 127.0.0.1:<port> and forwardTCP'd the
connection to whatever happened to be listening on the host's loopback
at that port. Add a hittingServiceIP case that RSTs cleanly instead,
placed before the isTailscaleIP fallthrough.

TestQuad100UnservedTCPPortDoesNotForward is a new integration test that
injects a TCP SYN to 100.100.100.100:853 via handleLocalPackets, stubs
forwardDialFunc, and asserts the dialer is not invoked; it catches
regressions of the acceptTCP loopback-redirection case.

Fixes #15796
Fixes #19421
Updates #3261
Updates #11305

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2026-04-24 02:18:44 +00:00 committed by James Tucker
parent 006d7e180e
commit 1b40911611
2 changed files with 223 additions and 14 deletions

View File

@ -845,20 +845,27 @@ func (ns *Impl) handleLocalPackets(p *packet.Parsed, t *tstun.Wrapper, gro *gro.
serviceName, isVIPServiceIP := ns.atomicIPVIPServiceMap.Load()[dst]
switch {
case dst == serviceIP || dst == serviceIPv6:
// We want to intercept some traffic to the "service IP" (e.g.
// 100.100.100.100 for IPv4). However, of traffic to the
// service IP, we only care about UDP 53, and TCP on port 53,
// 80, and 8080.
switch p.IPProto {
case ipproto.TCP:
if port := p.Dst.Port(); port != 53 && port != 80 && port != 8080 && !ns.isLoopbackPort(port) {
return filter.Accept, gro
}
case ipproto.UDP:
if port := p.Dst.Port(); port != 53 && !ns.isLoopbackPort(port) {
return filter.Accept, gro
}
}
// Traffic to the Tailscale service IP (100.100.100.100 /
// fd7a:115c:a1e0::53) is always terminated locally on this
// node; it must never be forwarded out over WireGuard to a
// peer. Netstack's TCP/UDP acceptors handle the ports we
// actually serve (UDP 53 MagicDNS, TCP 53/80/8080 for DNS,
// the web client, and Taildrive, plus any debug loopback
// port). Other ports are rejected cleanly by netstack: UDP
// closes the endpoint in acceptUDP, and TCP is RST'd by
// acceptTCP's hittingServiceIP guard.
//
// Previously we returned filter.Accept for TCP/UDP on any
// other port, which let the packet fall through to the ACL
// filter and ultimately wireguard-go, where no peer owns the
// quad-100 AllowedIP. That produced noisy "open-conn-track:
// timeout opening ...; no associated peer node" log lines
// (e.g. for stray traffic to 100.100.100.100:853 / DoT) and
// leaked quad-100 packets onto the tailnet.
//
// We now unconditionally absorb quad-100 into netstack here,
// regardless of IP protocol or port, so such traffic never
// reaches the conntrack / peer-routing layers.
case isVIPServiceIP:
// returns all active VIP services in a set, since the IPVIPServiceMap
// contains inactive service IPs when node hosts the service, we need to
@ -1654,6 +1661,24 @@ func (ns *Impl) acceptTCP(r *tcp.ForwarderRequest) {
} else {
dialIP = ipv4Loopback
}
case hittingServiceIP:
// TCP to the Tailscale service IP on a port we don't serve
// (anything other than DNS/53, web client/80, Taildrive/8080,
// or the debug loopback port handled above). handleLocalPackets
// absorbs all quad-100 traffic into netstack to prevent it
// from leaking to WireGuard peers as noisy "open-conn-track:
// timeout opening ...; no associated peer node" log lines
// (see the comment there).
//
// Without this explicit guard, execution would fall through
// to the isTailscaleIP case below (quad-100 is in the
// tailscale IP range), rewriting the dial target to
// 127.0.0.1:<port> and forwardTCP'ing the connection onto
// whatever random service happens to be listening on the
// host's loopback at that port. Reject cleanly with a RST
// here instead.
r.Complete(true) // sends a RST
return
case isTailscaleIP:
dialIP = ipv4Loopback
}

View File

@ -953,6 +953,7 @@ func TestHandleLocalPackets(t *testing.T) {
impl.lb.SetIPServiceMappingsForTest(IPServiceMap)
t.Run("ShouldHandleServiceIP", func(t *testing.T) {
t.Parallel()
pkt := &packet.Parsed{
IPVersion: 4,
IPProto: ipproto.TCP,
@ -965,7 +966,94 @@ func TestHandleLocalPackets(t *testing.T) {
t.Errorf("got filter outcome %v, want filter.DropSilently", resp)
}
})
// Any port on the quad-100 service IP must be absorbed locally by
// netstack and never leak out to the WireGuard / peer-routing
// layers. Historically we only intercepted specific ports (UDP 53
// and TCP 53/80/8080), causing stray traffic to other ports such
// as 100.100.100.100:853 (DoT) to time out in wireguard-go and
// produce "open-conn-track: timeout opening ...; no associated
// peer node" log spam. See the handleLocalPackets comment.
quad100LeakCases := []struct {
name string
proto ipproto.Proto
dst string
}{
{"TCP-853-DoT-v4", ipproto.TCP, "100.100.100.100:853"},
{"TCP-443-DoH-v4", ipproto.TCP, "100.100.100.100:443"},
{"TCP-9000-stray-v4", ipproto.TCP, "100.100.100.100:9000"},
{"UDP-853-DoQ-v4", ipproto.UDP, "100.100.100.100:853"},
{"UDP-443-v4", ipproto.UDP, "100.100.100.100:443"},
{"TCP-853-DoT-v6", ipproto.TCP, "[fd7a:115c:a1e0::53]:853"},
{"UDP-443-v6", ipproto.UDP, "[fd7a:115c:a1e0::53]:443"},
}
for _, tc := range quad100LeakCases {
t.Run("ShouldNotLeakQuad100_"+tc.name, func(t *testing.T) {
t.Parallel()
dst := netip.MustParseAddrPort(tc.dst)
ipVersion := uint8(4)
if dst.Addr().Is6() {
ipVersion = 6
}
src := "127.0.0.1:9999"
if ipVersion == 6 {
src = "[::1]:9999"
}
pkt := &packet.Parsed{
IPVersion: ipVersion,
IPProto: tc.proto,
Src: netip.MustParseAddrPort(src),
Dst: dst,
}
if tc.proto == ipproto.TCP {
pkt.TCPFlags = packet.TCPSyn
}
resp, _ := impl.handleLocalPackets(pkt, impl.tundev, nil)
if resp != filter.DropSilently {
t.Errorf("quad-100 %s packet leaked: got filter outcome %v, want filter.DropSilently", tc.name, resp)
}
})
}
// Exhaustive sweep of all ports for both transport protocols and
// both IP versions, confirming no port leaks. The quad-100 branch
// of handleLocalPackets is port-independent by construction; this
// test serves as a regression guard against accidental port-based
// exemptions slipping back in.
t.Run("ShouldNotLeakQuad100_AllPorts", func(t *testing.T) {
t.Parallel()
protos := []ipproto.Proto{ipproto.TCP, ipproto.UDP}
dsts := []netip.Addr{
netip.MustParseAddr("100.100.100.100"),
netip.MustParseAddr("fd7a:115c:a1e0::53"),
}
for _, proto := range protos {
for _, dstAddr := range dsts {
ipVersion := uint8(4)
srcStr := "127.0.0.1:9999"
if dstAddr.Is6() {
ipVersion = 6
srcStr = "[::1]:9999"
}
src := netip.MustParseAddrPort(srcStr)
for port := 1; port <= 65535; port++ {
pkt := &packet.Parsed{
IPVersion: ipVersion,
IPProto: proto,
Src: src,
Dst: netip.AddrPortFrom(dstAddr, uint16(port)),
}
if proto == ipproto.TCP {
pkt.TCPFlags = packet.TCPSyn
}
resp, _ := impl.handleLocalPackets(pkt, impl.tundev, nil)
if resp != filter.DropSilently {
t.Fatalf("port=%d proto=%v dst=%v: got %v, want filter.DropSilently", port, proto, dstAddr, resp)
}
}
}
}
})
t.Run("ShouldHandle4via6", func(t *testing.T) {
t.Parallel()
pkt := &packet.Parsed{
IPVersion: 6,
IPProto: ipproto.TCP,
@ -988,6 +1076,7 @@ func TestHandleLocalPackets(t *testing.T) {
}
})
t.Run("ShouldHandleLocalTailscaleServices", func(t *testing.T) {
t.Parallel()
pkt := &packet.Parsed{
IPVersion: 4,
IPProto: ipproto.TCP,
@ -1001,6 +1090,7 @@ func TestHandleLocalPackets(t *testing.T) {
}
})
t.Run("OtherNonHandled", func(t *testing.T) {
t.Parallel()
pkt := &packet.Parsed{
IPVersion: 6,
IPProto: ipproto.TCP,
@ -1023,6 +1113,100 @@ func TestHandleLocalPackets(t *testing.T) {
})
}
// TestQuad100UnservedTCPPortDoesNotForward verifies that a TCP SYN to the
// Tailscale service IP (100.100.100.100) on a port we don't serve is
// absorbed by netstack and rejected cleanly, without triggering the
// outbound forwardTCP dialer.
//
// handleLocalPackets absorbs all quad-100 traffic regardless of
// port to prevent it leaking to WireGuard peers (which produced noisy
// "open-conn-track: timeout opening ...; no associated peer node" log
// lines). That leaves acceptTCP responsible for rejecting connections
// to ports we don't handle; without an explicit guard, execution would
// fall through to the isTailscaleIP case (quad-100 is in the tailscale
// range), rewriting the dial target to 127.0.0.1:<port> and forwarding
// the connection to whatever random service happened to be listening
// on the host's loopback at that port.
//
// This test asserts that the forward dialer is NOT invoked for quad-100
// SYNs on unserved ports; the guard in acceptTCP must RST instead.
func TestQuad100UnservedTCPPortDoesNotForward(t *testing.T) {
	impl := makeNetstack(t, func(impl *Impl) {
		impl.ProcessSubnets = false
		impl.ProcessLocalIPs = false
		impl.atomicIsLocalIPFunc.Store(looksLikeATailscaleSelfAddress)
	})
	dialFn, gotConn := makeHangDialer(t)
	impl.forwardDialFunc = dialFn
	// Use a client IP in the CGNAT range so shouldProcessInbound-adjacent
	// code paths treat this as plausibly-peer-sourced traffic, matching
	// what a real stray quad-100 probe from the host OS would look like.
	client := netip.MustParseAddr("100.101.102.103")
	quad100 := tsaddr.TailscaleServiceIP()
	// 853 is DoT, the specific case called out in the original bug
	// report ("conntrack error no peer found for 100.100.100.100:853").
	// Before the fix, port 853 (and any non-{53,80,8080} port) leaked
	// out to WireGuard; after the fix it is absorbed here and must NOT
	// trigger forwardTCP.
	pkt := tcp4syn(t, client, quad100, 1234, 853)
	var parsed packet.Parsed
	parsed.Decode(pkt)
	resp, _ := impl.handleLocalPackets(&parsed, impl.tundev, nil)
	if resp != filter.DropSilently {
		t.Fatalf("handleLocalPackets for quad-100:853: got %v, want filter.DropSilently", resp)
	}
	// acceptTCP runs asynchronously in the gVisor TCP dispatcher after
	// handleLocalPackets injects the packet into netstack. Use the
	// in-flight connection counter as a synchronization point:
	// wrapTCPProtocolHandler increments connsInFlightByClient when the
	// dispatcher hands the connection off to acceptTCP, and acceptTCP's
	// deferred decrementInFlightTCPForward decrements it on return.
	//
	// On the green path (RST guard fires), acceptTCP returns promptly
	// and the counter reaches 0. On the red path (fall-through to
	// forwardTCP), acceptTCP blocks inside the forwardDialFunc call —
	// makeHangDialer signals gotConn on entry (buffered, non-blocking)
	// and then blocks forever — so the counter never reaches 0 but
	// gotConn fires from the dispatcher goroutine.
	//
	// testing/synctest is not usable here: gVisor's sleep package calls
	// the runtime's gopark directly rather than via the standard
	// library, so synctest.Wait() cannot observe those goroutines
	// becoming durably blocked and hangs indefinitely.
	done := make(chan struct{})
	defer close(done) // stop the poller on every exit path, including t.Fatal*
	inFlightZero := make(chan struct{})
	go func() {
		for {
			impl.mu.Lock()
			n := impl.connsInFlightByClient[client]
			impl.mu.Unlock()
			if n == 0 {
				close(inFlightZero)
				return
			}
			select {
			case <-done:
				return
			case <-time.After(time.Millisecond):
			}
		}
	}()
	select {
	case <-gotConn:
		t.Fatalf("forwardDialFunc was called for quad-100:853; acceptTCP fell through to forwardTCP instead of sending RST. This means stray traffic to quad-100 on unserved ports is being redirected to the host's loopback at the same port.")
	case <-inFlightZero:
		// The counter starts at zero, so the poller can observe n == 0
		// before the gVisor dispatcher has incremented it for our SYN —
		// and select chooses pseudo-randomly among ready cases — which
		// would let the red path pass vacuously. On the red path
		// makeHangDialer signals gotConn as soon as acceptTCP reaches
		// the dialer, so give it a short grace window before declaring
		// success.
		select {
		case <-gotConn:
			t.Fatalf("forwardDialFunc was called for quad-100:853; acceptTCP fell through to forwardTCP instead of sending RST. This means stray traffic to quad-100 on unserved ports is being redirected to the host's loopback at the same port.")
		case <-time.After(200 * time.Millisecond):
			// acceptTCP returned cleanly; the RST guard fired.
		}
	case <-time.After(5 * time.Second):
		// Safety net so a regression in the in-flight counter plumbing
		// doesn't hang the whole test run; both outcomes above should
		// fire within milliseconds in practice.
		t.Fatal("timed out waiting for acceptTCP to dispatch quad-100:853 SYN")
	}
}
func TestShouldSendToHost(t *testing.T) {
var (
selfIP4 = netip.MustParseAddr("100.64.1.2")