diff --git a/client/tailscale/localclient.go b/client/tailscale/localclient.go index c4fff4b7d..610bdb12f 100644 --- a/client/tailscale/localclient.go +++ b/client/tailscale/localclient.go @@ -809,10 +809,12 @@ func (lc *LocalClient) ExpandSNIName(ctx context.Context, name string) (fqdn str // Ping sends a ping of the provided type to the provided IP and waits // for its response. -func (lc *LocalClient) Ping(ctx context.Context, ip netip.Addr, pingtype tailcfg.PingType) (*ipnstate.PingResult, error) { +func (lc *LocalClient) Ping(ctx context.Context, ip netip.Addr, pingtype tailcfg.PingType, mtu int) (*ipnstate.PingResult, error) { v := url.Values{} v.Set("ip", ip.String()) + v.Set("mtu", strconv.Itoa(mtu)) v.Set("type", string(pingtype)) + // XXX new api or whatnot body, err := lc.send(ctx, "POST", "/localapi/v0/ping?"+v.Encode(), 200, nil) if err != nil { return nil, fmt.Errorf("error %w: %s", err, body) diff --git a/cmd/tailscale/cli/ping.go b/cmd/tailscale/cli/ping.go index f6ca4183c..1ba0f1140 100644 --- a/cmd/tailscale/cli/ping.go +++ b/cmd/tailscale/cli/ping.go @@ -53,12 +53,14 @@ relay node. fs.BoolVar(&pingArgs.peerAPI, "peerapi", false, "try hitting the peer's peerapi HTTP server") fs.IntVar(&pingArgs.num, "c", 10, "max number of pings to send. 0 for infinity.") fs.DurationVar(&pingArgs.timeout, "timeout", 5*time.Second, "timeout before giving up on a ping") + fs.IntVar(&pingArgs.mtu, "mtu", 0, "send a packet with this many bytes total") return fs })(), } var pingArgs struct { num int + mtu int untilDirect bool verbose bool tsmp bool @@ -115,7 +117,7 @@ func runPing(ctx context.Context, args []string) error { for { n++ ctx, cancel := context.WithTimeout(ctx, pingArgs.timeout) - pr, err := localClient.Ping(ctx, netip.MustParseAddr(ip), pingType()) + pr, err := localClient.Ping(ctx, netip.MustParseAddr(ip), pingType(), pingArgs.mtu) cancel() if err != nil { if errors.Is(err, context.DeadlineExceeded) { diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index ffc0467f7..ca51aa012 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -170,7 +170,7 @@ type ControlDialPlanner interface { // Pinger is the LocalBackend.Ping method. type Pinger interface { // Ping is a request to do a ping with the peer handling the given IP. - Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType) (*ipnstate.PingResult, error) + Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, mtu int) (*ipnstate.PingResult, error) } type Decompressor interface { @@ -1670,7 +1670,7 @@ func doPingerPing(logf logger.Logf, c *http.Client, pr *tailcfg.PingRequest, pin ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - res, err := pinger.Ping(ctx, pr.IP, pingType) + res, err := pinger.Ping(ctx, pr.IP, pingType, 0) if err != nil { d := time.Since(start).Round(time.Millisecond) logf("doPingerPing: ping error of type %q to %v after %v: %v", pingType, pr.IP, d, err) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 813d01265..0d303aaad 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -2395,7 +2395,7 @@ func (b *LocalBackend) StartLoginInteractive() { } } -func (b *LocalBackend) Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType) (*ipnstate.PingResult, error) { +func (b *LocalBackend) Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, mtu int) (*ipnstate.PingResult, error) { if pingType == tailcfg.PingPeerAPI { t0 := time.Now() node, base, err := b.pingPeerAPI(ctx, ip) @@ -2423,7 +2423,7 @@ func (b *LocalBackend) Ping(ctx context.Context, ip netip.Addr, pingType tailcfg case ch <- pr: default: } - }) + }, mtu) select { case pr := <-ch: return pr, nil diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index 1161c330a..51a895ec6 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -1335,7 +1335,17 @@ func (h *Handler) servePing(w http.ResponseWriter, r *http.Request) { http.Error(w, "missing 'type' parameter", 400) return } - res, err := h.b.Ping(ctx, ip, tailcfg.PingType(pingTypeStr)) + mtuStr := r.FormValue("mtu") + if mtuStr == "" { + // XXX old api didn't include this arg + mtuStr = "0" + } + mtu, err := strconv.Atoi(mtuStr) + if err != nil { + http.Error(w, "invalid 'mtu' parameter", 400) + return + } + res, err := h.b.Ping(ctx, ip, tailcfg.PingType(pingTypeStr), mtu) if err != nil { writeErrorJSON(w, err) return diff --git a/net/interfaces/interfaces.go b/net/interfaces/interfaces.go index e9e21eabc..fa16d3311 100644 --- a/net/interfaces/interfaces.go +++ b/net/interfaces/interfaces.go @@ -289,6 +289,9 @@ type State struct { // PAC is the URL to the Proxy Autoconfig URL, if applicable. PAC string + + // MaxMTU is the largest MTU of the available usable interfaces + MaxMTU int } func (s *State) String() string { @@ -524,6 +527,9 @@ func GetState() (*State, error) { } s.HaveV6 = s.HaveV6 || isUsableV6(pfx.Addr()) s.HaveV4 = s.HaveV4 || isUsableV4(pfx.Addr()) + if ni.MTU > s.MaxMTU { + s.MaxMTU = ni.MTU + } } }); err != nil { return nil, err diff --git a/tsnet/tsnet_test.go b/tsnet/tsnet_test.go index 7053e811a..48423bd28 100644 --- a/tsnet/tsnet_test.go +++ b/tsnet/tsnet_test.go @@ -239,7 +239,7 @@ func TestConn(t *testing.T) { } // ping to make sure the connection is up. - res, err := lc2.Ping(ctx, s1ip, tailcfg.PingICMP) + res, err := lc2.Ping(ctx, s1ip, tailcfg.PingICMP, 0) if err != nil { t.Fatal(err) } diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index f9e28acad..614013949 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -1007,7 +1007,7 @@ func (c *Conn) LastRecvActivityOfNodeKey(nk key.NodePublic) string { } // Ping handles a "tailscale ping" CLI query. -func (c *Conn) Ping(peer *tailcfg.Node, res *ipnstate.PingResult, cb func(*ipnstate.PingResult)) { +func (c *Conn) Ping(peer *tailcfg.Node, res *ipnstate.PingResult, cb func(*ipnstate.PingResult), mtu int) { c.mu.Lock() defer c.mu.Unlock() if c.privateKey.IsZero() { @@ -1031,7 +1031,7 @@ func (c *Conn) Ping(peer *tailcfg.Node, res *ipnstate.PingResult, cb func(*ipnst cb(res) return } - ep.cliPing(res, cb) + ep.cliPing(res, cb, mtu) } // c.mu must be held @@ -2056,13 +2056,45 @@ const ( // speeds. var debugIPv4DiscoPingPenalty = envknob.RegisterDuration("TS_DISCO_PONG_IPV4_DELAY") +// Peer MTU notes: +// +// The general concept is to send ~5 different sizes of ping/pong +// packets between clients. The largest one that comes back defines +// our MTU for communicating with this peer via this endpoint. This is +// accomplished by teaching betterAddr() about pong packet length and +// having it prefer the larger mtu for the same addr/port +// endpoint. This prototype adds a wantMTU parameter to the relevant +// disco routines because it was easy. +// +// TODO: Actually use the MTU when sending packets to our peer via +// this endpoint. +// +// TODO: Do the math to convert MTU <-> the wantMTU parameter, if +// necessary. I think it is necessary? + +// usefulMtus are the set of likely on-the-wire MTUs (including all the +// layers of protocal headers above link layer) +// +// Each ping is kicked off with a separate go routine and they seem to +// be run in approximately LIFO order, so if we want the biggest ping +// to arrive first put it last? +var usefulMtus = [...]int{ + // 576, // Smallest MTU for IPv4, probably useless? + // 1124, // An observed max mtu in the wild, maybe 1100 instead? + 1280, // Smallest MTU for IPv6, current default + 1400, // A little less, for tunnels or such + 1500, // Most common real world MTU + // 8000, // Some jumbo frames are this size + 9000, // Most jumbo frames are this size or slightly larger +} + // sendDiscoMessage sends discovery message m to dstDisco at dst. // // If dst is a DERP IP:port, then dstKey must be non-zero. // // The dstKey should only be non-zero if the dstDisco key // unambiguously maps to exactly one peer. -func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDisco key.DiscoPublic, m disco.Message, logLevel discoLogLevel) (sent bool, err error) { +func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDisco key.DiscoPublic, m disco.Message, logLevel discoLogLevel, mtu int) (sent bool, err error) { isDERP := dst.Addr() == derpMagicIPAddr if _, isPong := m.(*disco.Pong); isPong && !isDERP && dst.Addr().Is4() { time.Sleep(debugIPv4DiscoPingPenalty()) @@ -2077,7 +2109,12 @@ func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDi if _, err := crand.Read(nonce[:]); err != nil { panic(err) // worth dying for } - pkt := make([]byte, 0, 512) // TODO: size it correctly? pool? if it matters. + + bufSize := 512 + if mtu != 0 { + bufSize = mtu + } + pkt := make([]byte, 0, bufSize) pkt = append(pkt, disco.Magic...) pkt = c.discoPublic.AppendTo(pkt) di := c.discoInfoLocked(dstDisco) @@ -2089,8 +2126,24 @@ func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDi metricSendDiscoUDP.Add(1) } - box := di.sharedKey.Seal(m.AppendMarshal(nil)) + // pm needs to be padded out to hit our mtu goal + // + // IP adds 20/40 bytes to the packet + // + // Disco header adds ??? bytes + // + // Seal adds 40 bytes to the input + pm := m.AppendMarshal(nil) + // Now extend it to the mtu minus all the headers + var pad int + if mtu != 0 { + pad = mtu - 40 - 40 - len(pkt) - len(pm) + pm = append(pm, make([]byte, pad, pad)...) + } + box := di.sharedKey.Seal(pm) + //c.logf("pkt %v mtu %v pad %v pm %v box %v", len(pkt), mtu, pad, len(pm), len(box)) pkt = append(pkt, box...) + //c.logf("pkt %v", len(pkt)) sent, err = c.sendAddr(dst, dstKey, pkt) if sent { if logLevel == discoLog || (logLevel == discoVerboseLog && debugDisco()) { @@ -2098,7 +2151,7 @@ func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDi if !dstKey.IsZero() { node = dstKey.ShortString() } - c.dlogf("[v1] magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m)) + c.logf("magicsock: disco: %v->%v (%v, %v) sent %v len %v\n", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m), len(pkt)) } if isDERP { metricSentDiscoDERP.Add(1) @@ -2168,6 +2221,7 @@ const ( // - senderDiscoPubKey [32]byte // - nonce [24]byte // - naclbox of payload (see tailscale.com/disco package for inner payload format) +// - optionally, zero padding to test out the MTU // // For messages received over DERP, the src.Addr() will be derpMagicIP (with // src.Port() being the region ID) and the derpNodeSrc will be the node key @@ -2175,10 +2229,10 @@ const ( // over UDP. func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc key.NodePublic, via discoRXPath) (isDiscoMsg bool) { const headerLen = len(disco.Magic) + key.DiscoPublicRawLen - if len(msg) < headerLen || string(msg[:len(disco.Magic)]) != disco.Magic { + msgLen := len(msg) + if msgLen < headerLen || string(msg[:len(disco.Magic)]) != disco.Magic { return false } - // If the first four parts are the prefix of disco.Magic // (0x5453f09f) then it's definitely not a valid WireGuard // packet (which starts with little-endian uint32 1, 2, 3, 4). @@ -2194,7 +2248,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke return } if debugDisco() { - c.logf("magicsock: disco: got disco-looking frame from %v via %s", sender.ShortString(), via) + c.logf("magicsock: disco: got disco-looking frame from %v via %s len %v", sender.ShortString(), via, msgLen) } if c.privateKey.IsZero() { // Ignore disco messages when we're stopped. @@ -2267,14 +2321,14 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke switch dm := dm.(type) { case *disco.Ping: metricRecvDiscoPing.Add(1) - c.handlePingLocked(dm, src, di, derpNodeSrc) + c.handlePingLocked(dm, src, di, derpNodeSrc, msgLen) case *disco.Pong: metricRecvDiscoPong.Add(1) // There might be multiple nodes for the sender's DiscoKey. // Ask each to handle it, stopping once one reports that // the Pong's TxID was theirs. c.peerMap.forEachEndpointWithDiscoKey(sender, func(ep *endpoint) (keepGoing bool) { - if ep.handlePongConnLocked(dm, di, src) { + if ep.handlePongConnLocked(dm, di, src, msgLen) { return false } return true @@ -2351,7 +2405,8 @@ func (c *Conn) unambiguousNodeKeyOfPingLocked(dm *disco.Ping, dk key.DiscoPublic // di is the discoInfo of the source of the ping. // derpNodeSrc is non-zero if the ping arrived via DERP. -func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInfo, derpNodeSrc key.NodePublic) { +// msgLen is the length of the disco message +func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInfo, derpNodeSrc key.NodePublic, msgLen int) { likelyHeartBeat := src == di.lastPingFrom && time.Since(di.lastPingTime) < 5*time.Second di.lastPingFrom = src di.lastPingTime = time.Now() @@ -2380,16 +2435,25 @@ func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInf // Remember this route if not present. var numNodes int var dup bool + // adjust mtu for ip header + // XXX assuming ipv6 + var mtu int + // If the msgLen is the size of the bare pong packet, ignore it + // XXX make this much less terrible + if msgLen > 124 { + c.logf("adding 40 bytes to msgLen %v", msgLen) + mtu = msgLen + 40 + } if isDerp { if ep, ok := c.peerMap.endpointForNodeKey(derpNodeSrc); ok { - if ep.addCandidateEndpoint(src, dm.TxID) { + if ep.addCandidateEndpoint(src, dm.TxID, 0) { return } numNodes = 1 } } else { c.peerMap.forEachEndpointWithDiscoKey(di.discoKey, func(ep *endpoint) (keepGoing bool) { - if ep.addCandidateEndpoint(src, dm.TxID) { + if ep.addCandidateEndpoint(src, dm.TxID, mtu) { dup = true return false } @@ -2427,7 +2491,7 @@ func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInf go c.sendDiscoMessage(ipDst, dstKey, discoDest, &disco.Pong{ TxID: dm.TxID, Src: src, - }, discoVerboseLog) + }, discoVerboseLog, mtu) } // enqueueCallMeMaybe schedules a send of disco.CallMeMaybe to de via derpAddr @@ -2469,12 +2533,12 @@ func (c *Conn) enqueueCallMeMaybe(derpAddr netip.AddrPort, de *endpoint) { for _, ep := range c.lastEndpoints { eps = append(eps, ep.Addr) } - go de.c.sendDiscoMessage(derpAddr, de.publicKey, epDisco.key, &disco.CallMeMaybe{MyNumber: eps}, discoLog) + go de.c.sendDiscoMessage(derpAddr, de.publicKey, epDisco.key, &disco.CallMeMaybe{MyNumber: eps}, discoLog, 0) if debugSendCallMeUnknownPeer() { // Send a callMeMaybe packet to a non-existent peer unknownKey := key.NewNode().Public() c.logf("magicsock: sending CallMeMaybe to unknown peer per TS_DEBUG_SEND_CALLME_UNKNOWN_PEER") - go de.c.sendDiscoMessage(derpAddr, unknownKey, epDisco.key, &disco.CallMeMaybe{MyNumber: eps}, discoLog) + go de.c.sendDiscoMessage(derpAddr, unknownKey, epDisco.key, &disco.CallMeMaybe{MyNumber: eps}, discoLog, 0) } } @@ -4141,7 +4205,7 @@ type endpoint struct { lastFullPing mono.Time // last time we pinged all disco endpoints derpAddr netip.AddrPort // fallback/bootstrap path, if non-zero (non-zero for well-behaved clients) - bestAddr addrLatency // best non-DERP path; zero if none + bestAddr addrQuality // best non-DERP path; zero if none bestAddrAt mono.Time // time best address re-confirmed trustBestAddrUntil mono.Time // time when bestAddr expires sentPing map[stun.TxID]sentPing @@ -4249,6 +4313,7 @@ type endpointState struct { recentPong uint16 // index into recentPongs of most recent; older before, wrapped index int16 // index in nodecfg.Node.Endpoints; meaningless if lastGotPing non-zero + mtu int // mtu of connection to this endpoint } // indexSentinelDeleted is the temporary value that endpointState.index takes while @@ -4291,11 +4356,13 @@ func (de *endpoint) deleteEndpointLocked(why string, ep netip.AddrPort) { What: "deleteEndpointLocked-bestAddr-" + why, From: de.bestAddr, }) - de.bestAddr = addrLatency{} + de.bestAddr = addrQuality{} } } // pongHistoryCount is how many pongReply values we keep per endpointState +// XXX should maybe make this bigger for mtu stuff +// const pongHistoryCount = 64 * len(usefulMtus) const pongHistoryCount = 64 type pongReply struct { @@ -4303,6 +4370,7 @@ type pongReply struct { pongAt mono.Time // when we received the pong from netip.AddrPort // the pong's src (usually same as endpoint map key) pongSrc netip.AddrPort // what they reported they heard + mtu int // total packet size including IP header } type sentPing struct { @@ -4310,6 +4378,7 @@ type sentPing struct { at mono.Time timer *time.Timer // timeout timer purpose discoPingPurpose + mtu int // total packet size including IP header } // initFakeUDPAddr populates fakeWGAddr with a globally unique fake UDPAddr. @@ -4510,7 +4579,7 @@ func (de *endpoint) noteActiveLocked() { // cliPing starts a ping for the "tailscale ping" command. res is value to call cb with, // already partially filled. -func (de *endpoint) cliPing(res *ipnstate.PingResult, cb func(*ipnstate.PingResult)) { +func (de *endpoint) cliPing(res *ipnstate.PingResult, cb func(*ipnstate.PingResult), mtu int) { de.mu.Lock() defer de.mu.Unlock() @@ -4532,10 +4601,16 @@ func (de *endpoint) cliPing(res *ipnstate.PingResult, cb func(*ipnstate.PingResu // Otherwise "tailscale ping" results to a node on the local network // can look like they're bouncing between, say 10.0.0.0/9 and the peer's // IPv6 address, both 1ms away, and it's random who replies first. - de.startDiscoPingLocked(udpAddr, now, pingCLI) + de.startDiscoPingLockedMTU(udpAddr, now, pingCLI, mtu) } else { for ep := range de.endpointState { - de.startDiscoPingLocked(ep, now, pingCLI) + if mtu == 0 { + for _, testMtu := range usefulMtus { + de.startDiscoPingLockedMTU(ep, now, pingDiscovery, testMtu) + } + } else { + de.startDiscoPingLockedMTU(ep, now, pingCLI, mtu) + } } } de.noteActiveLocked() @@ -4647,11 +4722,11 @@ func (de *endpoint) removeSentDiscoPingLocked(txid stun.TxID, sp sentPing) { // // The caller should use de.discoKey as the discoKey argument. // It is passed in so that sendDiscoPing doesn't need to lock de.mu. -func (de *endpoint) sendDiscoPing(ep netip.AddrPort, discoKey key.DiscoPublic, txid stun.TxID, logLevel discoLogLevel) { +func (de *endpoint) sendDiscoPing(ep netip.AddrPort, discoKey key.DiscoPublic, txid stun.TxID, logLevel discoLogLevel, mtu int) { sent, _ := de.c.sendDiscoMessage(ep, de.publicKey, discoKey, &disco.Ping{ TxID: [12]byte(txid), NodeKey: de.c.publicKeyAtomic.Load(), - }, logLevel) + }, logLevel, mtu) if !sent { de.forgetDiscoPing(txid) } @@ -4676,6 +4751,10 @@ const ( ) func (de *endpoint) startDiscoPingLocked(ep netip.AddrPort, now mono.Time, purpose discoPingPurpose) { + de.startDiscoPingLockedMTU(ep, now, purpose, 0) +} + +func (de *endpoint) startDiscoPingLockedMTU(ep netip.AddrPort, now mono.Time, purpose discoPingPurpose, mtu int) { if runtime.GOOS == "js" { return } @@ -4700,12 +4779,13 @@ func (de *endpoint) startDiscoPingLocked(ep netip.AddrPort, now mono.Time, purpo at: now, timer: time.AfterFunc(pingTimeoutDuration, func() { de.discoPingTimeout(txid) }), purpose: purpose, + mtu: mtu, } logLevel := discoLog if purpose == pingHeartbeat { logLevel = discoVerboseLog } - go de.sendDiscoPing(ep, epDisco.key, txid, logLevel) + go de.sendDiscoPing(ep, epDisco.key, txid, logLevel, mtu) } func (de *endpoint) sendDiscoPingsLocked(now mono.Time, sendCallMeMaybe bool) { @@ -4729,8 +4809,20 @@ func (de *endpoint) sendDiscoPingsLocked(now mono.Time, sendCallMeMaybe bool) { if firstPing && sendCallMeMaybe { de.c.dlogf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort()) } - - de.startDiscoPingLocked(ep, now, pingDiscovery) + // Send a bouquet of pings in different sizes to probe + // peer mtu, but only if this is not a derp addr. + if de.derpAddr.IsValid() { + de.startDiscoPingLocked(ep, now, pingDiscovery) + } else { + for _, mtu := range usefulMtus { + // XXX only send pings less than the mtu of the interface used to reach this endpoint + de.startDiscoPingLockedMTU(ep, now, pingDiscovery, mtu) + // XXX Would be nice to pause a bit to + // give the largest mtu an advantage + // in returning first, but we're + // holding a lock here. + } + } } derpAddr := de.derpAddr if sentAny && sendCallMeMaybe && derpAddr.IsValid() { @@ -4932,15 +5024,23 @@ func (de *endpoint) updateFromNode(n *tailcfg.Node, heartbeatDisabled bool) { // // This is called once we've already verified that we got a valid // discovery message from de via ep. -func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.TxID) (duplicatePing bool) { +func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.TxID, mtu int) (duplicatePing bool) { de.mu.Lock() defer de.mu.Unlock() + // lookup the current end point state if st, ok := de.endpointState[ep]; ok { + // oh, we already have this endpoint in our list! + // check to see if this is a resend of the same ping as the one we got most recently duplicatePing = forRxPingTxID == st.lastGotPingTxID if !duplicatePing { st.lastGotPingTxID = forRxPingTxID } + // If this ping has a higher mtu, update it in the endpoint + if mtu > st.mtu { + de.c.logf("UPDATING mtu to %v", mtu) + st.mtu = mtu + } if st.lastGotPing.IsZero() { // Already-known endpoint from the network map. return duplicatePing @@ -4950,10 +5050,12 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.T } // Newly discovered endpoint. Exciting! - de.c.dlogf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort(), de.publicKey.ShortString()) + de.c.dlogf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s) mtu %v", ep, de.discoShort(), de.publicKey.ShortString(), mtu) + // Currently this adds a separate endpoint for each test MTU, would be better to update in place de.endpointState[ep] = &endpointState{ lastGotPing: time.Now(), lastGotPingTxID: forRxPingTxID, + mtu: mtu, } // If for some reason this gets very large, do some cleanup. @@ -4983,7 +5085,7 @@ func (de *endpoint) noteConnectivityChange() { // It should be called with the Conn.mu held. // // It reports whether m.TxID corresponds to a ping that this endpoint sent. -func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip.AddrPort) (knownTxID bool) { +func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip.AddrPort, msgLen int) (knownTxID bool) { de.mu.Lock() defer de.mu.Unlock() @@ -4995,6 +5097,19 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip return false } knownTxID = true // for naked returns below + // adjust mtu for ip header + // XXX assuming ipv6 + var mtu int + // If the msgLen is the size of the bare pong packet, ignore it + // XXX make this much less terrible + if msgLen > 124 { + de.c.logf("adding 40 bytes to msgLen %v", msgLen) + mtu = msgLen + 40 + } + // The mtu of the returned pong should be the same as the sent ping + if sp.mtu != mtu { + de.c.logf("error! pong mtu %v does not match ping mtu %v", mtu, sp.mtu) + } de.removeSentDiscoPingLocked(m.TxID, sp) now := mono.Now() @@ -5014,15 +5129,16 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip pongAt: now, from: src, pongSrc: m.Src, + mtu: mtu, }) } if sp.purpose != pingHeartbeat { - de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort(), de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) { + de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v mtu %v", de.c.discoShort, de.discoShort(), de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) { if sp.to != src { fmt.Fprintf(bw, " ping.to=%v", sp.to) } - })) + }), mtu) } for _, pp := range de.pendingCLIPings { @@ -5034,9 +5150,9 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip // Promote this pong response to our current best address if it's lower latency. // TODO(bradfitz): decide how latency vs. preference order affects decision if !isDerp { - thisPong := addrLatency{sp.to, latency} + thisPong := addrQuality{sp.to, latency, mtu} if betterAddr(thisPong, de.bestAddr) { - de.c.logf("magicsock: disco: node %v %v now using %v", de.publicKey.ShortString(), de.discoShort(), sp.to) + de.c.logf("UPDATING MTU magicsock: disco: node %v %v now using %v mtu %v", de.publicKey.ShortString(), de.discoShort(), sp.to, mtu) de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "handlePingLocked-bestAddr-update", @@ -5074,19 +5190,23 @@ func portableTrySetSocketBuffer(pconn nettype.PacketConn, logf logger.Logf) { } } -// addrLatency is an IPPort with an associated latency. -type addrLatency struct { +// addrQuality is an IPPort with an associated latency and MTU. +type addrQuality struct { netip.AddrPort latency time.Duration + mtu int } -func (a addrLatency) String() string { - return a.AddrPort.String() + "@" + a.latency.String() +func (a addrQuality) String() string { + return a.AddrPort.String() + "@" + a.latency.String() + "+" + strconv.Itoa(a.mtu) } // betterAddr reports whether a is a better addr to use than b. -func betterAddr(a, b addrLatency) bool { +func betterAddr(a, b addrQuality) bool { if a.AddrPort == b.AddrPort { + if a.mtu > b.mtu { + return true + } return false } if !b.IsValid() { @@ -5282,7 +5402,7 @@ func (de *endpoint) stopAndReset() { func (de *endpoint) resetLocked() { de.lastSend = 0 de.lastFullPing = 0 - de.bestAddr = addrLatency{} + de.bestAddr = addrQuality{} de.bestAddrAt = 0 de.trustBestAddrUntil = 0 for _, es := range de.endpointState { diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index ff3a25ab4..2e66c696b 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -1627,10 +1627,13 @@ func TestEndpointSetsEqual(t *testing.T) { func TestBetterAddr(t *testing.T) { const ms = time.Millisecond - al := func(ipps string, d time.Duration) addrLatency { - return addrLatency{netip.MustParseAddrPort(ipps), d} + al := func(ipps string, d time.Duration) addrQuality { + return addrQuality{AddrPort: netip.MustParseAddrPort(ipps), latency: d, mtu: 0} } - zero := addrLatency{} + almtu := func(ipps string, d time.Duration, mtu int) addrQuality { + return addrQuality{AddrPort: netip.MustParseAddrPort(ipps), latency: d, mtu: mtu} + } + zero := addrQuality{} const ( publicV4 = "1.2.3.4:555" @@ -1641,7 +1644,7 @@ func TestBetterAddr(t *testing.T) { ) tests := []struct { - a, b addrLatency + a, b addrQuality want bool // whether a is better than b }{ {a: zero, b: zero, want: false}, @@ -1703,7 +1706,12 @@ func TestBetterAddr(t *testing.T) { b: al(publicV6, 100*ms), want: true, }, - + // If addresses are equal, prefer larger MTU + { + a: almtu(publicV4, 30*ms, 1500), + b: almtu(publicV4, 30*ms, 0), + want: true, + }, // Private IPs are preferred over public IPs even if the public // IP is IPv6. { diff --git a/wgengine/netstack/netstack.go b/wgengine/netstack/netstack.go index 54e991801..0a27f97f2 100644 --- a/wgengine/netstack/netstack.go +++ b/wgengine/netstack/netstack.go @@ -150,6 +150,7 @@ const nicID = 1 // maxUDPPacketSize is the maximum size of a UDP packet we copy in startPacketCopy // when relaying UDP packets. We don't use the 'mtu' const in anticipation of // one day making the MTU more dynamic. +// TODO: make this bigger const maxUDPPacketSize = 1500 // Create creates and populates a new Impl. diff --git a/wgengine/userspace.go b/wgengine/userspace.go index e29e20dae..fac0df8f6 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -1211,7 +1211,7 @@ func (e *userspaceEngine) UpdateStatus(sb *ipnstate.StatusBuilder) { e.magicConn.UpdateStatus(sb) } -func (e *userspaceEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult)) { +func (e *userspaceEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult), mtu int) { res := &ipnstate.PingResult{IP: ip.String()} pip, ok := e.PeerForIP(ip) if !ok { @@ -1227,11 +1227,10 @@ func (e *userspaceEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func return } peer := pip.Node - - e.logf("ping(%v): sending %v ping to %v %v ...", ip, pingType, peer.Key.ShortString(), peer.ComputedName) + e.logf("ping(%v): sending %v ping mtu %v to %v %v...", ip, pingType, mtu, peer.Key.ShortString(), peer.ComputedName) switch pingType { case "disco": - e.magicConn.Ping(peer, res, cb) + e.magicConn.Ping(peer, res, cb, mtu) case "TSMP": e.sendTSMPPing(ip, peer, res, cb) case "ICMP": diff --git a/wgengine/watchdog.go b/wgengine/watchdog.go index 19505be89..e75ebc3e3 100644 --- a/wgengine/watchdog.go +++ b/wgengine/watchdog.go @@ -158,8 +158,8 @@ func (e *watchdogEngine) DiscoPublicKey() (k key.DiscoPublic) { e.watchdog("DiscoPublicKey", func() { k = e.wrap.DiscoPublicKey() }) return k } -func (e *watchdogEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult)) { - e.watchdog("Ping", func() { e.wrap.Ping(ip, pingType, cb) }) +func (e *watchdogEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult), mtu int) { + e.watchdog("Ping", func() { e.wrap.Ping(ip, pingType, cb, mtu) }) } func (e *watchdogEngine) RegisterIPPortIdentity(ipp netip.AddrPort, tsIP netip.Addr) { e.watchdog("RegisterIPPortIdentity", func() { e.wrap.RegisterIPPortIdentity(ipp, tsIP) }) diff --git a/wgengine/wgengine.go b/wgengine/wgengine.go index df591c9e0..48b460b49 100644 --- a/wgengine/wgengine.go +++ b/wgengine/wgengine.go @@ -152,7 +152,7 @@ type Engine interface { // Ping is a request to start a ping with the peer handling the given IP and // then call cb with its ping latency & method. - Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult)) + Ping(ip netip.Addr, pingType tailcfg.PingType, cb func(*ipnstate.PingResult), mtu int) // RegisterIPPortIdentity registers a given node (identified by its // Tailscale IP) as temporarily having the given IP:port for whois lookups.