diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt index 8c3b404b1..69d054ea4 100644 --- a/cmd/tailscale/depaware.txt +++ b/cmd/tailscale/depaware.txt @@ -85,7 +85,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep tailscale.com/control/controlhttp from tailscale.com/cmd/tailscale/cli tailscale.com/control/controlhttp/controlhttpcommon from tailscale.com/control/controlhttp tailscale.com/control/controlknobs from tailscale.com/net/portmapper - tailscale.com/derp from tailscale.com/derp/derphttp + tailscale.com/derp from tailscale.com/derp/derphttp+ tailscale.com/derp/derpconst from tailscale.com/derp+ tailscale.com/derp/derphttp from tailscale.com/net/netcheck tailscale.com/disco from tailscale.com/derp diff --git a/derp/derp.go b/derp/derp.go index 65acd4321..24c1ca65c 100644 --- a/derp/derp.go +++ b/derp/derp.go @@ -36,9 +36,13 @@ frameHeaderLen = 1 + 4 // frameType byte + 4 byte length keyLen = 32 maxInfoLen = 1 << 20 - keepAlive = 60 * time.Second ) +// KeepAlive is the minimum frequency at which the DERP server sends +// keep alive frames. The server adds some jitter, so this timing is not +// exact, but 2x this value can be considered a missed keep alive. +const KeepAlive = 60 * time.Second + // ProtocolVersion is bumped whenever there's a wire-incompatible change. // - version 1 (zero on wire): consistent box headers, in use by employee dev nodes a bit // - version 2: received packets have src addrs in frameRecvPacket at beginning diff --git a/derp/derp_server.go b/derp/derp_server.go index c6a749485..bd67e7eec 100644 --- a/derp/derp_server.go +++ b/derp/derp_server.go @@ -1789,7 +1789,7 @@ func (c *sclient) sendLoop(ctx context.Context) error { defer c.onSendLoopDone() jitter := rand.N(5 * time.Second) - keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(keepAlive + jitter) + keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(KeepAlive + jitter) defer keepAliveTick.Stop() var werr error // last write error diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 54627f713..cb622a339 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -23,6 +23,7 @@ "syscall" "time" + "tailscale.com/derp" "tailscale.com/derp/derphttp" "tailscale.com/envknob" "tailscale.com/hostinfo" @@ -449,7 +450,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, pre // restoration back to the home DERP on the next full netcheck ~5 minutes later // - which is highly disruptive when it causes shifts in geo routed subnet // routers. By always including the home DERP in the incremental netcheck, we - // ensure that the home DERP is always probed, even if it observed a recenet + // ensure that the home DERP is always probed, even if it observed a recent // poor latency sample. This inclusion enables the latency history checks in // home DERP selection to still take effect. // planContainsHome indicates whether the home DERP has been added to the probePlan, @@ -989,7 +990,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe var wg sync.WaitGroup var need []*tailcfg.DERPRegion for rid, reg := range dm.Regions { - if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) { + if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) && !reg.Avoid && !reg.NoMeasureNoHome { need = append(need, reg) } } @@ -1371,6 +1372,15 @@ func (c *Client) timeNow() time.Time { // even without receiving a STUN response. // Note: must remain higher than the derp package frameReceiveRecordRate PreferredDERPFrameTime = 8 * time.Second + // PreferredDERPKeepAliveTimeout is 2x the DERP Keep Alive timeout. If there + // is no latency data to make judgements from, but we have heard from our + // current DERP region inside of 2x the KeepAlive window, don't switch DERP + // regions yet, keep the current region. This prevents region flapping / + // home DERP removal during short periods of packet loss where the DERP TCP + // connection may itself naturally recover. + // TODO(raggi): expose shared time bounds from the DERP package rather than + // duplicating them here. + PreferredDERPKeepAliveTimeout = 2 * derp.KeepAlive ) // addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports @@ -1455,13 +1465,10 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, // the STUN probe) since we started the netcheck, or in the past 2s, as // another signal for "this region is still working". heardFromOldRegionRecently := false + prevRegionLastHeard := rs.opts.getLastDERPActivity(prevDERP) if changingPreferred { - if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() { - now := c.timeNow() - - heardFromOldRegionRecently = lastHeard.After(rs.start) - heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-PreferredDERPFrameTime)) - } + heardFromOldRegionRecently = prevRegionLastHeard.After(rs.start) + heardFromOldRegionRecently = heardFromOldRegionRecently || prevRegionLastHeard.After(now.Add(-PreferredDERPFrameTime)) } // The old region is accessible if we've heard from it via a non-STUN @@ -1488,17 +1495,20 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, // If the forced DERP region probed successfully, or has recent traffic, // use it. _, haveLatencySample := r.RegionLatency[c.ForcePreferredDERP] - var recentActivity bool - if lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP); !lastHeard.IsZero() { - now := c.timeNow() - recentActivity = lastHeard.After(rs.start) - recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime)) - } + lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP) + recentActivity := lastHeard.After(rs.start) + recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime)) if haveLatencySample || recentActivity { r.PreferredDERP = c.ForcePreferredDERP } } + // If there was no latency data to make judgements on, but there is an + // active DERP connection that has at least been doing KeepAlive recently, + // keep it, rather than dropping it. + if r.PreferredDERP == 0 && prevRegionLastHeard.After(now.Add(-PreferredDERPKeepAliveTimeout)) { + r.PreferredDERP = prevDERP + } } func updateLatency(m map[int]time.Duration, regionID int, d time.Duration) { diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 3affa614d..6830e7f27 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -18,6 +18,7 @@ "testing" "time" + "tailscale.com/derp" "tailscale.com/net/netmon" "tailscale.com/net/stun/stuntest" "tailscale.com/tailcfg" @@ -419,6 +420,39 @@ type step struct { wantPrevLen: 2, wantDERP: 1, }, + { + name: "no_data_keep_home", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, + {30 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime, + }), + }, + wantPrevLen: 6, + wantDERP: 1, + }, + { + name: "no_data_home_expires", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, + {30 * time.Second, report()}, + {2 * derp.KeepAlive, report()}, + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime, + }), + }, + wantPrevLen: 3, + wantDERP: 0, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {