From cd523eae52c220ed8731cee349efd77c1aa4a5fe Mon Sep 17 00:00:00 2001 From: Simon Law Date: Thu, 2 Oct 2025 16:01:55 -0700 Subject: [PATCH] ipn/ipnlocal: introduce the concept of client-side-reachability (#17367) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The control plane will sometimes determine that a node is not online, while the node is still able to connect to its peers. This patch doesn’t solve this problem, but it does mitigate it. This PR introduces the `client-side-reachability` node attribute that switches the node to completely ignore the online signal from control. In the future, the client itself should collect reachability data from active Wireguard flows and Tailscale pings. Updates #17366 Updates tailscale/corp#30379 Updates tailscale/corp#32686 Signed-off-by: Simon Law --- ipn/ipnlocal/local.go | 12 +++++- ipn/ipnlocal/node_backend.go | 34 ++++++++++++++++ ipn/ipnlocal/node_backend_test.go | 68 +++++++++++++++++++++++++++++++ tailcfg/tailcfg.go | 6 +++ 4 files changed, 118 insertions(+), 2 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 38f98f8fb..199ee7248 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -7240,6 +7240,10 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta // the lowest latency to this device. For peers without a DERP home, we look for // geographic proximity to this device's DERP home. func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) { + // TODO(sfllaw): Context needs to be plumbed down here to support + // reachability testing. + ctx := context.TODO() + netMap := nb.NetMap() if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil { return res, ErrNoPreferredDERP @@ -7248,7 +7252,7 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg // since the netmap doesn't include delta updates (e.g., home DERP or Online // status changes) from the control plane since the last full update. candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool { - if !peer.Valid() || !peer.Online().Get() { + if !peer.Valid() || !nb.PeerIsReachable(ctx, peer) { return false } if allowList != nil && !allowList.Contains(peer.StableID()) { @@ -7367,6 +7371,10 @@ var ErrNoNetMap = errors.New("no network map, try again later") // the node’s [tailcfg.Location]. To be eligible for consideration, the node // must have NodeAttrSuggestExitNode in its CapMap. func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) { + // TODO(sfllaw): Context needs to be plumbed down here to support + // reachability testing. + ctx := context.TODO() + nm := nb.NetMap() if nm == nil { return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap @@ -7386,7 +7394,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcf if !p.Valid() { return false } - if !p.Online().Get() { + if !nb.PeerIsReachable(ctx, p) { return false } if allowed != nil && !allowed.Contains(p.StableID()) { diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go index 22e965fa6..3408d4cbb 100644 --- a/ipn/ipnlocal/node_backend.go +++ b/ipn/ipnlocal/node_backend.go @@ -362,6 +362,40 @@ func (nb *nodeBackend) PeerAPIBase(p tailcfg.NodeView) string { return peerAPIBase(nm, p) } +// PeerIsReachable reports whether the current node can reach p. If the ctx is +// done, this function may return a result based on stale reachability data. +func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView) bool { + if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachability) { + // Legacy behavior is to always trust the control plane, which + // isn’t always correct because the peer could be slow to check + // in so that control marks it as offline. + // See tailscale/corp#32686. + return p.Online().Get() + } + + nb.mu.Lock() + nm := nb.netMap + nb.mu.Unlock() + + if self := nm.SelfNode; self.Valid() && self.ID() == p.ID() { + // This node can always reach itself. + return true + } + return nb.peerIsReachable(ctx, p) +} + +func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool { + // TODO(sfllaw): The following does not actually test for client-side + // reachability. This would require a mechanism that tracks whether the + // current node can actually reach this peer, either because they are + // already communicating or because they can ping each other. + // + // Instead, it makes the client ignore p.Online completely. + // + // See tailscale/corp#32686. + return true +} + func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr { for _, pfx := range n.Addresses().All() { if pfx.IsSingleIP() && pred(pfx.Addr()) { diff --git a/ipn/ipnlocal/node_backend_test.go b/ipn/ipnlocal/node_backend_test.go index b305837fd..f6698bd4b 100644 --- a/ipn/ipnlocal/node_backend_test.go +++ b/ipn/ipnlocal/node_backend_test.go @@ -9,7 +9,10 @@ import ( "testing" "time" + "tailscale.com/tailcfg" "tailscale.com/tstest" + "tailscale.com/types/netmap" + "tailscale.com/types/ptr" "tailscale.com/util/eventbus" ) @@ -122,3 +125,68 @@ func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) { nb.Wait(context.Background()) } + +func TestNodeBackendReachability(t *testing.T) { + for _, tc := range []struct { + name string + + // Cap sets [tailcfg.NodeAttrClientSideReachability] on the self + // node. + // + // When disabled, the client relies on the control plane sending + // an accurate peer.Online flag. When enabled, the client + // ignores peer.Online and determines whether it can reach the + // peer node. + cap bool + + peer tailcfg.Node + want bool + }{ + { + name: "disabled/offline", + cap: false, + peer: tailcfg.Node{ + Online: ptr.To(false), + }, + want: false, + }, + { + name: "disabled/online", + cap: false, + peer: tailcfg.Node{ + Online: ptr.To(true), + }, + want: true, + }, + { + name: "enabled/offline", + cap: true, + peer: tailcfg.Node{ + Online: ptr.To(false), + }, + want: true, + }, + { + name: "enabled/online", + cap: true, + peer: tailcfg.Node{ + Online: ptr.To(true), + }, + want: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New()) + nb.netMap = &netmap.NetworkMap{} + if tc.cap { + nb.netMap.AllCaps.Make() + nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability) + } + + got := nb.PeerIsReachable(t.Context(), tc.peer.View()) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 01ecc96b3..96e7fbbd9 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -2692,6 +2692,12 @@ const ( // numbers, apostrophe, spaces, and hyphens. This may not be true for the default. // Values can look like "foo.com" or "Foo's Test Tailnet - Staging". NodeAttrTailnetDisplayName NodeCapability = "tailnet-display-name" + + // NodeAttrClientSideReachability configures the node to determine + // reachability itself when choosing connectors. When absent, the + // default behavior is to trust the control plane when it claims that a + // node is no longer online, but that is not a reliable signal. + NodeAttrClientSideReachability = "client-side-reachability" ) // SetDNSRequest is a request to add a DNS record.