mirror of
https://github.com/tailscale/tailscale.git
synced 2025-10-04 20:12:16 +02:00
ipn/ipnlocal: introduce the concept of client-side-reachability (#17367)
The control plane will sometimes determine that a node is not online, while the node is still able to connect to its peers. This patch doesn’t solve this problem, but it does mitigate it. This PR introduces the `client-side-reachability` node attribute that switches the node to completely ignore the online signal from control. In the future, the client itself should collect reachability data from active Wireguard flows and Tailscale pings. Updates #17366 Updates tailscale/corp#30379 Updates tailscale/corp#32686 Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
parent
24e38eb729
commit
cd523eae52
@ -7240,6 +7240,10 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta
|
|||||||
// the lowest latency to this device. For peers without a DERP home, we look for
|
// the lowest latency to this device. For peers without a DERP home, we look for
|
||||||
// geographic proximity to this device's DERP home.
|
// geographic proximity to this device's DERP home.
|
||||||
func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
|
func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSuggestion tailcfg.StableNodeID, selectRegion selectRegionFunc, selectNode selectNodeFunc, allowList set.Set[tailcfg.StableNodeID]) (res apitype.ExitNodeSuggestionResponse, err error) {
|
||||||
|
// TODO(sfllaw): Context needs to be plumbed down here to support
|
||||||
|
// reachability testing.
|
||||||
|
ctx := context.TODO()
|
||||||
|
|
||||||
netMap := nb.NetMap()
|
netMap := nb.NetMap()
|
||||||
if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil {
|
if report == nil || report.PreferredDERP == 0 || netMap == nil || netMap.DERPMap == nil {
|
||||||
return res, ErrNoPreferredDERP
|
return res, ErrNoPreferredDERP
|
||||||
@ -7248,7 +7252,7 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg
|
|||||||
// since the netmap doesn't include delta updates (e.g., home DERP or Online
|
// since the netmap doesn't include delta updates (e.g., home DERP or Online
|
||||||
// status changes) from the control plane since the last full update.
|
// status changes) from the control plane since the last full update.
|
||||||
candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool {
|
candidates := nb.AppendMatchingPeers(nil, func(peer tailcfg.NodeView) bool {
|
||||||
if !peer.Valid() || !peer.Online().Get() {
|
if !peer.Valid() || !nb.PeerIsReachable(ctx, peer) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
if allowList != nil && !allowList.Contains(peer.StableID()) {
|
if allowList != nil && !allowList.Contains(peer.StableID()) {
|
||||||
@ -7367,6 +7371,10 @@ var ErrNoNetMap = errors.New("no network map, try again later")
|
|||||||
// the node’s [tailcfg.Location]. To be eligible for consideration, the node
|
// the node’s [tailcfg.Location]. To be eligible for consideration, the node
|
||||||
// must have NodeAttrSuggestExitNode in its CapMap.
|
// must have NodeAttrSuggestExitNode in its CapMap.
|
||||||
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
|
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
|
||||||
|
// TODO(sfllaw): Context needs to be plumbed down here to support
|
||||||
|
// reachability testing.
|
||||||
|
ctx := context.TODO()
|
||||||
|
|
||||||
nm := nb.NetMap()
|
nm := nb.NetMap()
|
||||||
if nm == nil {
|
if nm == nil {
|
||||||
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
|
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
|
||||||
@ -7386,7 +7394,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcf
|
|||||||
if !p.Valid() {
|
if !p.Valid() {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
if !p.Online().Get() {
|
if !nb.PeerIsReachable(ctx, p) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
if allowed != nil && !allowed.Contains(p.StableID()) {
|
if allowed != nil && !allowed.Contains(p.StableID()) {
|
||||||
|
@ -362,6 +362,40 @@ func (nb *nodeBackend) PeerAPIBase(p tailcfg.NodeView) string {
|
|||||||
return peerAPIBase(nm, p)
|
return peerAPIBase(nm, p)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PeerIsReachable reports whether the current node can reach p. If the ctx is
|
||||||
|
// done, this function may return a result based on stale reachability data.
|
||||||
|
func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
|
||||||
|
if !nb.SelfHasCap(tailcfg.NodeAttrClientSideReachability) {
|
||||||
|
// Legacy behavior is to always trust the control plane, which
|
||||||
|
// isn’t always correct because the peer could be slow to check
|
||||||
|
// in so that control marks it as offline.
|
||||||
|
// See tailscale/corp#32686.
|
||||||
|
return p.Online().Get()
|
||||||
|
}
|
||||||
|
|
||||||
|
nb.mu.Lock()
|
||||||
|
nm := nb.netMap
|
||||||
|
nb.mu.Unlock()
|
||||||
|
|
||||||
|
if self := nm.SelfNode; self.Valid() && self.ID() == p.ID() {
|
||||||
|
// This node can always reach itself.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return nb.peerIsReachable(ctx, p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
|
||||||
|
// TODO(sfllaw): The following does not actually test for client-side
|
||||||
|
// reachability. This would require a mechanism that tracks whether the
|
||||||
|
// current node can actually reach this peer, either because they are
|
||||||
|
// already communicating or because they can ping each other.
|
||||||
|
//
|
||||||
|
// Instead, it makes the client ignore p.Online completely.
|
||||||
|
//
|
||||||
|
// See tailscale/corp#32686.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr {
|
func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr {
|
||||||
for _, pfx := range n.Addresses().All() {
|
for _, pfx := range n.Addresses().All() {
|
||||||
if pfx.IsSingleIP() && pred(pfx.Addr()) {
|
if pfx.IsSingleIP() && pred(pfx.Addr()) {
|
||||||
|
@ -9,7 +9,10 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"tailscale.com/tailcfg"
|
||||||
"tailscale.com/tstest"
|
"tailscale.com/tstest"
|
||||||
|
"tailscale.com/types/netmap"
|
||||||
|
"tailscale.com/types/ptr"
|
||||||
"tailscale.com/util/eventbus"
|
"tailscale.com/util/eventbus"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -122,3 +125,68 @@ func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) {
|
|||||||
|
|
||||||
nb.Wait(context.Background())
|
nb.Wait(context.Background())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNodeBackendReachability(t *testing.T) {
|
||||||
|
for _, tc := range []struct {
|
||||||
|
name string
|
||||||
|
|
||||||
|
// Cap sets [tailcfg.NodeAttrClientSideReachability] on the self
|
||||||
|
// node.
|
||||||
|
//
|
||||||
|
// When disabled, the client relies on the control plane sending
|
||||||
|
// an accurate peer.Online flag. When enabled, the client
|
||||||
|
// ignores peer.Online and determines whether it can reach the
|
||||||
|
// peer node.
|
||||||
|
cap bool
|
||||||
|
|
||||||
|
peer tailcfg.Node
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "disabled/offline",
|
||||||
|
cap: false,
|
||||||
|
peer: tailcfg.Node{
|
||||||
|
Online: ptr.To(false),
|
||||||
|
},
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "disabled/online",
|
||||||
|
cap: false,
|
||||||
|
peer: tailcfg.Node{
|
||||||
|
Online: ptr.To(true),
|
||||||
|
},
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled/offline",
|
||||||
|
cap: true,
|
||||||
|
peer: tailcfg.Node{
|
||||||
|
Online: ptr.To(false),
|
||||||
|
},
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled/online",
|
||||||
|
cap: true,
|
||||||
|
peer: tailcfg.Node{
|
||||||
|
Online: ptr.To(true),
|
||||||
|
},
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
|
||||||
|
nb.netMap = &netmap.NetworkMap{}
|
||||||
|
if tc.cap {
|
||||||
|
nb.netMap.AllCaps.Make()
|
||||||
|
nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability)
|
||||||
|
}
|
||||||
|
|
||||||
|
got := nb.PeerIsReachable(t.Context(), tc.peer.View())
|
||||||
|
if got != tc.want {
|
||||||
|
t.Errorf("got %v, want %v", got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -2692,6 +2692,12 @@ const (
|
|||||||
// numbers, apostrophe, spaces, and hyphens. This may not be true for the default.
|
// numbers, apostrophe, spaces, and hyphens. This may not be true for the default.
|
||||||
// Values can look like "foo.com" or "Foo's Test Tailnet - Staging".
|
// Values can look like "foo.com" or "Foo's Test Tailnet - Staging".
|
||||||
NodeAttrTailnetDisplayName NodeCapability = "tailnet-display-name"
|
NodeAttrTailnetDisplayName NodeCapability = "tailnet-display-name"
|
||||||
|
|
||||||
|
// NodeAttrClientSideReachability configures the node to determine
|
||||||
|
// reachability itself when choosing connectors. When absent, the
|
||||||
|
// default behavior is to trust the control plane when it claims that a
|
||||||
|
// node is no longer online, but that is not a reliable signal.
|
||||||
|
NodeAttrClientSideReachability = "client-side-reachability"
|
||||||
)
|
)
|
||||||
|
|
||||||
// SetDNSRequest is a request to add a DNS record.
|
// SetDNSRequest is a request to add a DNS record.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user