diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 38f98f8fb..199ee7248 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -7240,6 +7240,10 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta // the lowest latency to this device. For peers without a DERP home, we look for // geographic proximity to this device's DERP home. func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) { + // TODO(sfllaw): Context needs to be plumbed down here to support + // reachability testing. + ctx := context.TODO() + netMap := nb.NetMap() if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil { return res, ErrNoPreferredDERP @@ -7248,7 +7252,7 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg // since the netmap doesn't include delta updates (e.g., home DERP or Online // status changes) from the control plane since the last full update. candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool { - if !peer.Valid() || !peer.Online().Get() { + if !peer.Valid() || !nb.PeerIsReachable(ctx, peer) { return false } if allowList != nil && !allowList.Contains(peer.StableID()) { @@ -7367,6 +7371,10 @@ var ErrNoNetMap = errors.New("no network map, try again later") // the node’s [tailcfg.Location]. To be eligible for consideration, the node // must have NodeAttrSuggestExitNode in its CapMap. func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) { + // TODO(sfllaw): Context needs to be plumbed down here to support + // reachability testing. + ctx := context.TODO() + nm := nb.NetMap() if nm == nil { return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap @@ -7386,7 +7394,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcf if !p.Valid() { return false } - if !p.Online().Get() { + if !nb.PeerIsReachable(ctx, p) { return false } if allowed != nil && !allowed.Contains(p.StableID()) { diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go index 22e965fa6..3408d4cbb 100644 --- a/ipn/ipnlocal/node_backend.go +++ b/ipn/ipnlocal/node_backend.go @@ -362,6 +362,40 @@ func (nb *nodeBackend) PeerAPIBase(p tailcfg.NodeView) string { return peerAPIBase(nm, p) } +// PeerIsReachable reports whether the current node can reach p. If the ctx is +// done, this function may return a result based on stale reachability data. +func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView) bool { + if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachability) { + // Legacy behavior is to always trust the control plane, which + // isn’t always correct because the peer could be slow to check + // in so that control marks it as offline. + // See tailscale/corp#32686. + return p.Online().Get() + } + + nb.mu.Lock() + nm := nb.netMap + nb.mu.Unlock() + + if self := nm.SelfNode; self.Valid() && self.ID() == p.ID() { + // This node can always reach itself. + return true + } + return nb.peerIsReachable(ctx, p) +} + +func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool { + // TODO(sfllaw): The following does not actually test for client-side + // reachability. This would require a mechanism that tracks whether the + // current node can actually reach this peer, either because they are + // already communicating or because they can ping each other. + // + // Instead, it makes the client ignore p.Online completely. + // + // See tailscale/corp#32686. + return true +} + func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr { for _, pfx := range n.Addresses().All() { if pfx.IsSingleIP() && pred(pfx.Addr()) { diff --git a/ipn/ipnlocal/node_backend_test.go b/ipn/ipnlocal/node_backend_test.go index b305837fd..f6698bd4b 100644 --- a/ipn/ipnlocal/node_backend_test.go +++ b/ipn/ipnlocal/node_backend_test.go @@ -9,7 +9,10 @@ import ( "testing" "time" + "tailscale.com/tailcfg" "tailscale.com/tstest" + "tailscale.com/types/netmap" + "tailscale.com/types/ptr" "tailscale.com/util/eventbus" ) @@ -122,3 +125,68 @@ func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) { nb.Wait(context.Background()) } + +func TestNodeBackendReachability(t *testing.T) { + for _, tc := range []struct { + name string + + // Cap sets [tailcfg.NodeAttrClientSideReachability] on the self + // node. + // + // When disabled, the client relies on the control plane sending + // an accurate peer.Online flag. When enabled, the client + // ignores peer.Online and determines whether it can reach the + // peer node. + cap bool + + peer tailcfg.Node + want bool + }{ + { + name: "disabled/offline", + cap: false, + peer: tailcfg.Node{ + Online: ptr.To(false), + }, + want: false, + }, + { + name: "disabled/online", + cap: false, + peer: tailcfg.Node{ + Online: ptr.To(true), + }, + want: true, + }, + { + name: "enabled/offline", + cap: true, + peer: tailcfg.Node{ + Online: ptr.To(false), + }, + want: true, + }, + { + name: "enabled/online", + cap: true, + peer: tailcfg.Node{ + Online: ptr.To(true), + }, + want: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New()) + nb.netMap = &netmap.NetworkMap{} + if tc.cap { + nb.netMap.AllCaps.Make() + nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability) + } + + got := nb.PeerIsReachable(t.Context(), tc.peer.View()) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 01ecc96b3..96e7fbbd9 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -2692,6 +2692,12 @@ const ( // numbers, apostrophe, spaces, and hyphens. This may not be true for the default. // Values can look like "foo.com" or "Foo's Test Tailnet - Staging". NodeAttrTailnetDisplayName NodeCapability = "tailnet-display-name" + + // NodeAttrClientSideReachability configures the node to determine + // reachability itself when choosing connectors. When absent, the + // default behavior is to trust the control plane when it claims that a + // node is no longer online, but that is not a reliable signal. + NodeAttrClientSideReachability = "client-side-reachability" ) // SetDNSRequest is a request to add a DNS record.