mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-05 20:26:47 +02:00
Adds an opt-in, in-memory aggregator of recent connection-rejection events (TSMP rejects received from peers, outbound TSMP rejects we emit on ACL-blocked inbound flows, and pendopen timeouts) keyed by (direction, proto, peer-address, reason). The aggregated data is exposed over a new debug-rejects LocalAPI endpoint and a GET /debug/rejects c2n endpoint, intended for future GUI/CLI consumption when diagnosing why a connection failed. Architecture: - net/connreject holds the data types and a per-LocalBackend Aggregator (LRU-bounded, default 256 entries on desktop / 32 on mobile, per direction). - feature/connreject is a self-registering ipnext.Extension that owns one Aggregator per LocalBackend, installs note callbacks on the tundev and engine, subscribes to OnSelfChange to flip the runtime gate, and serves the LocalAPI/c2n endpoints. - wgengine.Engine and *tstun.Wrapper each gain a SetConnRejectNote setter; data-plane sites use a single atomic.Pointer load + nil check, so the cost when no consumer is installed is one MOV. Gating: - Compile-time: ts_omit_connreject build tag (standard feature/buildfeatures + condregister plumbing). Trims ~41 KB. - Runtime: tailcfg.NodeAttrConnReject node attribute, off by default at the control plane. May be removed once the feature is enabled by default. Updates CapabilityVersion to 139 (clients understand NodeAttrConnReject and can serve GET /debug/rejects). Adds Proto/Src/Dst accessors on flowtrack.Tuple (used by pendopen to construct events without exposing the tuple's internals to the aggregator). Updates #1094 Updates #14802 Signed-off-by: James Tucker <james@tailscale.com>
380 lines
11 KiB
Go
380 lines
11 KiB
Go
// Copyright (c) Tailscale Inc & contributors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build !ts_omit_debug
|
|
|
|
package wgengine
|
|
|
|
import (
|
|
"fmt"
|
|
"net/netip"
|
|
"runtime"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gaissmai/bart"
|
|
"tailscale.com/net/connreject"
|
|
"tailscale.com/net/flowtrack"
|
|
"tailscale.com/net/packet"
|
|
"tailscale.com/net/tstun"
|
|
"tailscale.com/types/ipproto"
|
|
"tailscale.com/util/mak"
|
|
"tailscale.com/wgengine/filter"
|
|
)
|
|
|
|
// flowtrackTuple is a local alias for [flowtrack.Tuple], the key type
// used to identify tracked pending-open flows in this file.
type flowtrackTuple = flowtrack.Tuple
|
|
|
|
// tcpTimeoutBeforeDebug is how long a tracked outbound TCP SYN may go
// unanswered before we give up waiting and diagnose/log why the
// connection isn't opening.
const tcpTimeoutBeforeDebug = 5 * time.Second
|
|
|
|
// pendingOpenFlow tracks an in-flight outbound TCP connection attempt
// (a SYN we sent) that has not yet been answered, rejected, or timed
// out.
type pendingOpenFlow struct {
	timer *time.Timer // until giving up on the flow

	// guarded by userspaceEngine.mu:

	// problem is non-zero if we got a MaybeBroken (non-terminal)
	// TSMP "reject" header.
	problem packet.TailscaleRejectReason
}
|
|
|
|
func (e *userspaceEngine) removeFlow(f flowtrack.Tuple) (removed bool) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
of, ok := e.pendOpen[f]
|
|
if !ok {
|
|
// Not a tracked flow (likely already removed)
|
|
return false
|
|
}
|
|
of.timer.Stop()
|
|
delete(e.pendOpen, f)
|
|
return true
|
|
}
|
|
|
|
func (e *userspaceEngine) noteFlowProblemFromPeer(f flowtrack.Tuple, problem packet.TailscaleRejectReason) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
of, ok := e.pendOpen[f]
|
|
if !ok {
|
|
// Not a tracked flow (likely already removed)
|
|
return
|
|
}
|
|
of.problem = problem
|
|
}
|
|
|
|
func tsRejectFlow(rh packet.TailscaleRejectedHeader) flowtrack.Tuple {
|
|
return flowtrack.MakeTuple(rh.Proto, rh.Src, rh.Dst)
|
|
}
|
|
|
|
// trackOpenPreFilterIn inspects inbound packets before filtering:
// TSMP reject headers from peers are consumed (dropped silently) and
// used to annotate or resolve tracked flows; returning TCP SYN/RST
// packets clear the matching pending-open entry. All other packets
// are accepted unchanged.
//
// NOTE(review): presumably installed as a tstun pre-filter-in hook,
// per the name — confirm at the registration site.
func (e *userspaceEngine) trackOpenPreFilterIn(pp *packet.Parsed, t *tstun.Wrapper) (res filter.Response) {
	res = filter.Accept // always

	if pp.IPProto == ipproto.TSMP {
		// TSMP is Tailscale-internal signaling; never deliver it up
		// the stack, even if we can't parse it.
		res = filter.DropSilently
		rh, ok := pp.AsTailscaleRejectedHeader()
		if !ok {
			return
		}
		if rh.MaybeBroken {
			// Non-terminal reject: remember the problem but keep
			// tracking the flow in case it still succeeds.
			e.noteFlowProblemFromPeer(tsRejectFlow(rh), rh.Reason)
		} else if f := tsRejectFlow(rh); e.removeFlow(f) {
			// Terminal reject for a flow we were tracking.
			e.logf("open-conn-track: flow %v %v > %v rejected due to %v", rh.Proto, rh.Src, rh.Dst, rh.Reason)
		}
		// Report the reject to the connreject consumer, if one is
		// installed (single atomic load; nil when the feature is off).
		if fnp := e.connRejectNote.Load(); fnp != nil {
			(*fnp)(connreject.Event{
				Direction:   connreject.Outgoing,
				Proto:       rh.Proto,
				Src:         rh.Src,
				Dst:         rh.Dst,
				Reason:      rejectReasonToReason(rh.Reason),
				Source:      connreject.SourceTSMPRecv,
				MaybeBroken: rh.MaybeBroken,
			})
		}
		return
	}

	// Below, only parsed TCP packets with SYN or RST set matter.
	if pp.IPVersion == 0 ||
		pp.IPProto != ipproto.TCP ||
		pp.TCPFlags&(packet.TCPSyn|packet.TCPRst) == 0 {
		return
	}

	// Either a SYN or a RST came back. Remove it in either case.

	f := flowtrack.MakeTuple(pp.IPProto, pp.Dst, pp.Src) // src/dst reversed
	removed := e.removeFlow(f)
	if removed && pp.TCPFlags&packet.TCPRst != 0 {
		e.logf("open-conn-track: flow TCP %v got RST by peer", f)
	}
	return
}
|
|
|
|
var (
|
|
appleIPRange = netip.MustParsePrefix("17.0.0.0/8")
|
|
canonicalIPs = sync.OnceValue(func() (checkIPFunc func(netip.Addr) bool) {
|
|
// https://bgp.he.net/AS41231#_prefixes
|
|
t := &bart.Lite{}
|
|
for s := range strings.FieldsSeq(`
|
|
91.189.89.0/24
|
|
91.189.91.0/24
|
|
91.189.92.0/24
|
|
91.189.93.0/24
|
|
91.189.94.0/24
|
|
91.189.95.0/24
|
|
162.213.32.0/24
|
|
162.213.34.0/24
|
|
162.213.35.0/24
|
|
185.125.188.0/23
|
|
185.125.190.0/24
|
|
194.169.254.0/24`) {
|
|
t.Insert(netip.MustParsePrefix(s))
|
|
}
|
|
return t.Contains
|
|
})
|
|
)
|
|
|
|
// isOSNetworkProbe reports whether the target is likely a network
|
|
// connectivity probe target from e.g. iOS or Ubuntu network-manager.
|
|
//
|
|
// iOS likes to probe Apple IPs on all interfaces to check for connectivity.
|
|
// Don't start timers tracking those. They won't succeed anyway. Avoids log
|
|
// spam like:
|
|
func (e *userspaceEngine) isOSNetworkProbe(dst netip.AddrPort) bool {
|
|
// iOS had log spam like:
|
|
// open-conn-track: timeout opening (100.115.73.60:52501 => 17.125.252.5:443); no associated peer node
|
|
if runtime.GOOS == "ios" && dst.Port() == 443 && appleIPRange.Contains(dst.Addr()) {
|
|
if _, ok := e.PeerForIP(dst.Addr()); !ok {
|
|
return true
|
|
}
|
|
}
|
|
// NetworkManager; https://github.com/tailscale/tailscale/issues/13687
|
|
// open-conn-track: timeout opening (TCP 100.96.229.119:42798 => 185.125.190.49:80); no associated peer node
|
|
if runtime.GOOS == "linux" && dst.Port() == 80 && canonicalIPs()(dst.Addr()) {
|
|
if _, ok := e.PeerForIP(dst.Addr()); !ok {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (e *userspaceEngine) trackOpenPostFilterOut(pp *packet.Parsed, t *tstun.Wrapper) (res filter.Response) {
|
|
res = filter.Accept // always
|
|
|
|
if pp.IPVersion == 0 ||
|
|
pp.IPProto != ipproto.TCP ||
|
|
pp.TCPFlags&packet.TCPAck != 0 ||
|
|
pp.TCPFlags&packet.TCPSyn == 0 {
|
|
return
|
|
}
|
|
if e.isOSNetworkProbe(pp.Dst) {
|
|
return
|
|
}
|
|
|
|
flow := flowtrack.MakeTuple(pp.IPProto, pp.Src, pp.Dst)
|
|
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
if _, dup := e.pendOpen[flow]; dup {
|
|
// Duplicates are expected when the OS retransmits. Ignore.
|
|
return
|
|
}
|
|
|
|
timer := time.AfterFunc(tcpTimeoutBeforeDebug, func() {
|
|
e.onOpenTimeout(flow)
|
|
})
|
|
mak.Set(&e.pendOpen, flow, &pendingOpenFlow{timer: timer})
|
|
|
|
return filter.Accept
|
|
}
|
|
|
|
// openTimeoutDiag captures what [userspaceEngine.diagnoseOpenTimeout]
// inferred about a pendopen flow that timed out. It is the input to
// [classifyOpenTimeout].
//
// At most one of the boolean fields is set by diagnoseOpenTimeout;
// all false means the peer was found and known to WireGuard.
type openTimeoutDiag struct {
	// problem is the TSMP reject reason previously reported by the peer
	// via a MaybeBroken TSMP header, or zero if none.
	problem packet.TailscaleRejectReason

	// noPeer is true if PeerForIP did not find a peer for the flow.
	noPeer bool

	// peerUnreachable is true when we identified a Tailscale peer for
	// the flow but it cannot currently be reached (pre-0.100 client,
	// no HomeDERP, in-netmap-but-unknown-to-WireGuard, etc.).
	peerUnreachable bool

	// onlyZeroRoute is true if PeerForIP matched the peer only via a
	// /0 route (likely a non-selected exit node). The outer code
	// deliberately stays silent in this case; classification would be
	// misleading.
	onlyZeroRoute bool
}
|
|
|
|
// diagnoseOpenTimeout examines the engine state and logs a description
// of why a pendopen flow timed out. It returns the populated
// [openTimeoutDiag] so the caller can classify and (optionally) emit a
// connreject event.
func (e *userspaceEngine) diagnoseOpenTimeout(flow flowtrack.Tuple, problem packet.TailscaleRejectReason) openTimeoutDiag {
	d := openTimeoutDiag{problem: problem}

	if !problem.IsZero() {
		// The peer already told us (via a non-terminal TSMP reject)
		// what it thinks went wrong; log that alongside our diagnosis.
		e.logf("open-conn-track: timeout opening %v; peer reported problem: %v", flow, problem)
	}

	pip, ok := e.PeerForIP(flow.DstAddr())
	if !ok {
		e.logf("open-conn-track: timeout opening %v; no associated peer node", flow)
		d.noPeer = true
		return d
	}
	n := pip.Node
	if !n.IsWireGuardOnly() {
		// Basic reachability prerequisites for Tailscale peers: a
		// disco key (pre-0.100 clients have none) and a home DERP.
		if n.DiscoKey().IsZero() {
			e.logf("open-conn-track: timeout opening %v; peer node %v running pre-0.100", flow, n.Key().ShortString())
			d.peerUnreachable = true
			return d
		}
		if n.HomeDERP() == 0 {
			e.logf("open-conn-track: timeout opening %v; peer node %v not connected to any DERP relay", flow, n.Key().ShortString())
			d.peerUnreachable = true
			return d
		}
	}

	ps, found := e.getPeerStatusLite(n.Key())
	if !found {
		onlyZeroRoute := true // whether peerForIP returned n only because its /0 route matched
		for _, r := range n.AllowedIPs().All() {
			if r.Bits() != 0 && r.Contains(flow.DstAddr()) {
				onlyZeroRoute = false
				break
			}
		}
		if onlyZeroRoute {
			// This node was returned by peerForIP because its exit
			// node /0 route(s) matched, but this might not be the
			// exit node that's currently selected. Rather than log
			// misleading errors, just don't log at all for now.
			// TODO(bradfitz): update this code to be exit-node-aware
			// and make peerForIP return the node of the currently
			// selected exit node.
			d.onlyZeroRoute = true
			return d
		}
		e.logf("open-conn-track: timeout opening %v; target node %v in netmap but unknown to WireGuard", flow, n.Key().ShortString())
		d.peerUnreachable = true
		return d
	}

	// TODO(bradfitz): figure out what PeerStatus.LastHandshake
	// is. It appears to be the last time we sent a handshake,
	// which isn't what I expected. I thought it was when a
	// handshake completed, which is what I want.
	_ = ps.LastHandshake

	// Peer was found and is known to WireGuard; summarize its
	// last-known liveness for the log line.
	online := "?"
	if n.IsWireGuardOnly() {
		// Pure WireGuard peers have no Tailscale online status.
		online = "wg"
	} else {
		if v, ok := n.Online().GetOk(); ok {
			if v {
				online = "yes"
			} else {
				online = "no"
			}
		}
		if lastSeen, ok := n.LastSeen().GetOk(); ok && online != "yes" {
			online += fmt.Sprintf(", lastseen=%v", durFmt(lastSeen))
		}
	}
	e.logf("open-conn-track: timeout opening %v to node %v; online=%v, lastRecv=%v",
		flow, n.Key().ShortString(),
		online,
		e.magicConn.LastRecvActivityOfNodeKey(n.Key()))
	return d
}
|
|
|
|
// classifyOpenTimeout maps an [openTimeoutDiag] to a
|
|
// [connreject.Reason] and [connreject.Source] for emission. A returned
|
|
// reason of "" means the caller should not emit any event (used for
|
|
// the deliberately-silent onlyZeroRoute case).
|
|
//
|
|
// If d.problem is set, the peer's TSMP-reported reason supersedes our
|
|
// own diagnosis and the event is tagged SourceTSMPRecv (the timeout
|
|
// confirms the previously non-terminal reject was actually terminal).
|
|
func classifyOpenTimeout(d openTimeoutDiag) (connreject.Reason, connreject.Source) {
|
|
if d.onlyZeroRoute {
|
|
return "", connreject.SourceUnknown
|
|
}
|
|
if !d.problem.IsZero() {
|
|
return rejectReasonToReason(d.problem), connreject.SourceTSMPRecv
|
|
}
|
|
switch {
|
|
case d.noPeer:
|
|
return connreject.ReasonNoPeer, connreject.SourcePendOpenTimeout
|
|
case d.peerUnreachable:
|
|
return connreject.ReasonPeerUnreachable, connreject.SourcePendOpenTimeout
|
|
}
|
|
return connreject.ReasonTimeout, connreject.SourcePendOpenTimeout
|
|
}
|
|
|
|
func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) {
|
|
e.mu.Lock()
|
|
of, ok := e.pendOpen[flow]
|
|
if !ok {
|
|
// Not a tracked flow, or already handled & deleted.
|
|
e.mu.Unlock()
|
|
return
|
|
}
|
|
delete(e.pendOpen, flow)
|
|
problem := of.problem
|
|
e.mu.Unlock()
|
|
|
|
d := e.diagnoseOpenTimeout(flow, problem)
|
|
reason, source := classifyOpenTimeout(d)
|
|
if reason == "" {
|
|
return
|
|
}
|
|
fnp := e.connRejectNote.Load()
|
|
if fnp == nil {
|
|
return
|
|
}
|
|
(*fnp)(connreject.Event{
|
|
Direction: connreject.Outgoing,
|
|
Proto: flow.Proto(),
|
|
Src: flow.Src(),
|
|
Dst: flow.Dst(),
|
|
Reason: reason,
|
|
Source: source,
|
|
})
|
|
}
|
|
|
|
// rejectReasonToReason maps a [packet.TailscaleRejectReason] to its
|
|
// corresponding [connreject.Reason] tag.
|
|
func rejectReasonToReason(r packet.TailscaleRejectReason) connreject.Reason {
|
|
switch r {
|
|
case packet.RejectedDueToACLs:
|
|
return connreject.ReasonACL
|
|
case packet.RejectedDueToShieldsUp:
|
|
return connreject.ReasonShields
|
|
case packet.RejectedDueToIPForwarding:
|
|
return connreject.ReasonHostIPForwarding
|
|
case packet.RejectedDueToHostFirewall:
|
|
return connreject.ReasonHostFirewall
|
|
}
|
|
return connreject.ReasonUnknown
|
|
}
|
|
|
|
func durFmt(t time.Time) string {
|
|
if t.IsZero() {
|
|
return "never"
|
|
}
|
|
d := time.Since(t).Round(time.Second)
|
|
if d < 10*time.Minute {
|
|
// node.LastSeen times are rounded very coarsely, and
|
|
// we compare times from different clocks (server vs
|
|
// local), so negative is common when close. Format as
|
|
// "recent" if negative or actually recent.
|
|
return "recent"
|
|
}
|
|
return d.String()
|
|
}
|