ipn/ipnlocal: introduce the concept of client-side-reachability (#17367)

The control plane will sometimes determine that a node is not online,
while the node is still able to connect to its peers. This patch
doesn’t solve this problem, but it does mitigate it.

This PR introduces the `client-side-reachability` node attribute that
switches the node to completely ignore the online signal from control.

In the future, the client itself should collect reachability data from
active Wireguard flows and Tailscale pings.

Updates #17366
Updates tailscale/corp#30379
Updates tailscale/corp#32686

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law 2025-10-02 16:01:55 -07:00 committed by GitHub
parent 24e38eb729
commit cd523eae52
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 118 additions and 2 deletions

View File

@ -7240,6 +7240,10 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta
// the lowest latency to this device. For peers without a DERP home, we look for
// geographic proximity to this device's DERP home.
func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
// TODO(sfllaw): Context needs to be plumbed down here to support
// reachability testing.
ctx := context.TODO()
netMap := nb.NetMap()
if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil {
return res, ErrNoPreferredDERP
@ -7248,7 +7252,7 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg
// since the netmap doesn't include delta updates (e.g., home DERP or Online
// status changes) from the control plane since the last full update.
candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool {
if !peer.Valid() || !peer.Online().Get() {
if !peer.Valid() || !nb.PeerIsReachable(ctx, peer) {
return false
}
if allowList != nil && !allowList.Contains(peer.StableID()) {
@ -7367,6 +7371,10 @@ var ErrNoNetMap = errors.New("no network map, try again later")
// the nodes [tailcfg.Location]. To be eligible for consideration, the node
// must have NodeAttrSuggestExitNode in its CapMap.
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
// TODO(sfllaw): Context needs to be plumbed down here to support
// reachability testing.
ctx := context.TODO()
nm := nb.NetMap()
if nm == nil {
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
@ -7386,7 +7394,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcf
if !p.Valid() {
return false
}
if !p.Online().Get() {
if !nb.PeerIsReachable(ctx, p) {
return false
}
if allowed != nil && !allowed.Contains(p.StableID()) {

View File

@ -362,6 +362,40 @@ func (nb *nodeBackend) PeerAPIBase(p tailcfg.NodeView) string {
return peerAPIBase(nm, p)
}
// PeerIsReachable reports whether the current node can reach p. If the ctx is
// done, this function may return a result based on stale reachability data.
func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachability) {
// Legacy behavior is to always trust the control plane, which
// isnt always correct because the peer could be slow to check
// in so that control marks it as offline.
// See tailscale/corp#32686.
return p.Online().Get()
}
nb.mu.Lock()
nm := nb.netMap
nb.mu.Unlock()
if self := nm.SelfNode; self.Valid() && self.ID() == p.ID() {
// This node can always reach itself.
return true
}
return nb.peerIsReachable(ctx, p)
}
func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
// TODO(sfllaw): The following does not actually test for client-side
// reachability. This would require a mechanism that tracks whether the
// current node can actually reach this peer, either because they are
// already communicating or because they can ping each other.
//
// Instead, it makes the client ignore p.Online completely.
//
// See tailscale/corp#32686.
return true
}
func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr {
for _, pfx := range n.Addresses().All() {
if pfx.IsSingleIP() && pred(pfx.Addr()) {

View File

@ -9,7 +9,10 @@ import (
"testing"
"time"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/netmap"
"tailscale.com/types/ptr"
"tailscale.com/util/eventbus"
)
@ -122,3 +125,68 @@ func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) {
nb.Wait(context.Background())
}
func TestNodeBackendReachability(t *testing.T) {
for _, tc := range []struct {
name string
// Cap sets [tailcfg.NodeAttrClientSideReachability] on the self
// node.
//
// When disabled, the client relies on the control plane sending
// an accurate peer.Online flag. When enabled, the client
// ignores peer.Online and determines whether it can reach the
// peer node.
cap bool
peer tailcfg.Node
want bool
}{
{
name: "disabled/offline",
cap: false,
peer: tailcfg.Node{
Online: ptr.To(false),
},
want: false,
},
{
name: "disabled/online",
cap: false,
peer: tailcfg.Node{
Online: ptr.To(true),
},
want: true,
},
{
name: "enabled/offline",
cap: true,
peer: tailcfg.Node{
Online: ptr.To(false),
},
want: true,
},
{
name: "enabled/online",
cap: true,
peer: tailcfg.Node{
Online: ptr.To(true),
},
want: true,
},
} {
t.Run(tc.name, func(t *testing.T) {
nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
nb.netMap = &netmap.NetworkMap{}
if tc.cap {
nb.netMap.AllCaps.Make()
nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability)
}
got := nb.PeerIsReachable(t.Context(), tc.peer.View())
if got != tc.want {
t.Errorf("got %v, want %v", got, tc.want)
}
})
}
}

View File

@ -2692,6 +2692,12 @@ const (
// numbers, apostrophe, spaces, and hyphens. This may not be true for the default.
// Values can look like "foo.com" or "Foo's Test Tailnet - Staging".
NodeAttrTailnetDisplayName NodeCapability = "tailnet-display-name"
// NodeAttrClientSideReachability configures the node to determine
// reachability itself when choosing connectors. When absent, the
// default behavior is to trust the control plane when it claims that a
// node is no longer online, but that is not a reliable signal.
NodeAttrClientSideReachability = "client-side-reachability"
)
// SetDNSRequest is a request to add a DNS record.