ipn/ipnlocal: use routecheck reports to make exit node suggestions

Now that the routecheck subsystem is continuously collecting
reachability reports in the background, we can add a hook to
LocalBackend for fetching its report. That allows
suggestExitNodeUsingTrafficSteering to consult that report when
disqualifying candidates.

Exit node suggestions will only consult the report when the
`client-side-reachability` and `client-side-reachability:routecheck`
node attributes are both set on the current node.

Updates #17366
Updates tailscale/corp#33033

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law 2026-04-24 10:25:10 -07:00
parent 905141e4ca
commit 73f52284f6
No known key found for this signature in database
GPG Key ID: B83D1EE07548341D
6 changed files with 64 additions and 30 deletions

View File

@ -12,6 +12,7 @@ import (
"fmt"
"tailscale.com/ipn/ipnext"
"tailscale.com/ipn/ipnlocal"
"tailscale.com/ipn/routecheck"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
@ -28,6 +29,8 @@ func init() {
backend: b,
}, nil
})
ipnlocal.HookRouteCheckReport.Set(routeCheckReport)
}
// Extension implements the [ipnext.Extension] interface.
@ -91,3 +94,7 @@ func (e *Extension) needsRefresh() {
}
e.Client.NeedsRefresh()
}
func routeCheckReport(b *ipnlocal.LocalBackend) ipnlocal.RouteCheckReport {
return ClientFor(b).Report()
}

View File

@ -7573,6 +7573,9 @@ func allowedAutoRoute(ipp netip.Prefix) bool {
return true
}
// HookRouteCheckReport is the hook that returns the latest routecheck Report for this LocalBackend.
var HookRouteCheckReport feature.Hook[func(*LocalBackend) RouteCheckReport]
var ErrNoPreferredDERP = errors.New("no preferred DERP, try again later")
// suggestExitNodeLocked computes a suggestion based on the current netmap and
@ -7588,7 +7591,12 @@ func (b *LocalBackend) suggestExitNodeLocked() (response apitype.ExitNodeSuggest
lastReport := b.MagicConn().GetLastNetcheckReport(b.ctx)
prevSuggestion := b.lastSuggestedExitNode
res, err := suggestExitNode(lastReport, b.currentNode(), prevSuggestion, randomRegion, randomNode, b.getAllowedSuggestions())
var rp RouteCheckReport
if HookRouteCheckReport.IsSet() {
hook := HookRouteCheckReport.Get()
rp = hook(b)
}
res, err := suggestExitNode(lastReport, rp, b.currentNode(), prevSuggestion, randomRegion, randomNode, b.getAllowedSuggestions())
if err != nil {
return res, err
}
@ -7660,11 +7668,11 @@ func fillAllowedSuggestions(polc policyclient.Client) (set.Set[tailcfg.StableNod
// suggestExitNode returns a suggestion for reasonably good exit node based on
// the current netmap and the previous suggestion.
func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
func suggestExitNode(report *netcheck.Report, rp RouteCheckReport, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
switch {
case nb.SelfHasCap(tailcfg.NodeAttrTrafficSteering):
// The traffic-steering feature flag is enabled on this tailnet.
res, err = suggestExitNodeUsingTrafficSteering(nb, allowList)
res, err = suggestExitNodeUsingTrafficSteering(rp, nb, allowList)
default:
// The control plane will always strip the `traffic-steering`
// node attribute if it isnt enabled for this tailnet, even if
@ -7694,10 +7702,6 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta
// the lowest latency to this device. For peers without a DERP home, we look for
// geographic proximity to this device's DERP home.
func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
// TODO(sfllaw): Context needs to be plumbed down here to support
// reachability testing.
ctx := context.TODO()
netMap := nb.NetMap()
if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil {
return res, ErrNoPreferredDERP
@ -7706,7 +7710,7 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg
// since the netmap doesn't include delta updates (e.g., home DERP or Online
// status changes) from the control plane since the last full update.
candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool {
if !peer.Valid() || !nb.PeerIsReachable(ctx, peer) {
if !peer.Valid() || !nb.PeerIsReachable(nil, peer) {
return false
}
if allowList != nil && !allowList.Contains(peer.StableID()) {
@ -7834,11 +7838,7 @@ var ErrNoNetMap = errors.New("no network map, try again later")
// pick one of the best exit nodes. These priorities are provided by Control in
// the nodes [tailcfg.Location]. To be eligible for consideration, the node
// must have NodeAttrSuggestExitNode in its CapMap.
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
// TODO(sfllaw): Context needs to be plumbed down here to support
// reachability testing.
ctx := context.TODO()
func suggestExitNodeUsingTrafficSteering(rp RouteCheckReport, nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
nm := nb.NetMap()
if nm == nil {
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
@ -7857,7 +7857,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcf
if !p.Valid() {
return false
}
if !nb.PeerIsReachable(ctx, p) {
if !nb.PeerIsReachable(rp, p) {
return false
}
if allowed != nil && !allowed.Contains(p.StableID()) {

View File

@ -5391,7 +5391,7 @@ func TestSuggestExitNode(t *testing.T) {
defer nb.shutdown(errShutdown)
nb.SetNetMap(tt.netMap)
got, err := suggestExitNode(tt.lastReport, nb, tt.lastSuggestion, selectRegion, selectNode, allowList)
got, err := suggestExitNode(tt.lastReport, nil, nb, tt.lastSuggestion, selectRegion, selectNode, allowList)
if got.Name != tt.wantName {
t.Errorf("name=%v, want %v", got.Name, tt.wantName)
}
@ -5865,7 +5865,7 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) {
defer nb.shutdown(errShutdown)
nb.SetNetMap(tt.netMap)
got, err := suggestExitNodeUsingTrafficSteering(nb, allowList)
got, err := suggestExitNodeUsingTrafficSteering(nil, nb, allowList)
if tt.wantErr == nil && err != nil {
t.Fatalf("err=%v, want nil", err)
}

View File

@ -352,9 +352,16 @@ func (nb *nodeBackend) PeerAPIBase(p tailcfg.NodeView) string {
return peerAPIBase(nm, p)
}
// RouteCheckReport is an interface that reports whether a peer is reachable by the current node.
type RouteCheckReport interface {
// IsReachable reports whether a peer is reachable by the current node
// or if it is unknown because it has yet to be probed.
IsReachable(tailcfg.NodeID) (ok, known bool)
}
// PeerIsReachable reports whether the current node can reach p. If the ctx is
// done, this function may return a result based on stale reachability data.
func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
func (nb *nodeBackend) PeerIsReachable(rp RouteCheckReport, p tailcfg.NodeView) bool {
if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachability) {
// Legacy behavior is to always trust the control plane, which
// isnt always correct because the peer could be slow to check
@ -371,19 +378,28 @@ func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView)
// This node can always reach itself.
return true
}
return nb.peerIsReachable(ctx, p)
}
func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
// TODO(sfllaw): The following does not actually test for client-side
// reachability. This would require a mechanism that tracks whether the
// current node can actually reach this peer, either because they are
// already communicating or because they can ping each other.
//
// Instead, it makes the client ignore p.Online completely.
//
// See tailscale/corp#32686.
return true
if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachabilityRouteCheck) {
// TODO(sfllaw): The following does not actually test for client-side
// reachability. This would require a mechanism that tracks whether the
// current node can actually reach this peer, either because they are
// already communicating or because they can ping each other.
//
// Instead, it makes the client ignore p.Online completely.
//
// See tailscale/corp#32686.
return true
}
if rp == nil {
return p.Online().Get() // Fallback
}
ok, known := rp.IsReachable(p.ID())
if !known {
// Reachability is unknown, because the node is a new router, so fall back.
return p.Online().Get()
}
return ok
}
func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr {

View File

@ -182,7 +182,7 @@ func TestNodeBackendReachability(t *testing.T) {
nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability)
}
got := nb.PeerIsReachable(t.Context(), tc.peer.View())
got := nb.PeerIsReachable(nil, tc.peer.View())
if got != tc.want {
t.Errorf("got %v, want %v", got, tc.want)
}

View File

@ -30,6 +30,17 @@ type Report struct {
Reachable nodeset `json:"reachable"`
}
// IsReachable reports whether a peer is reachable by the current node
// or if it is unknown because it has yet to be probed.
func (rp Report) IsReachable(id tailcfg.NodeID) (ok, known bool) {
// TODO(sfllaw): We should actually track all routers and consider the
// absence of a router in the report as it being recently added for
// consideration, so it is unknown. Then we should positively track
// whether a node was reachable or not.
_, k := rp.Reachable[id]
return k, k
}
// RoutablePrefixes returns a [RoutingTable] mapping routable network prefixes
// with the associated routers that were reachable by the current host,
// at the time the report was finished.