diff --git a/client/local/local.go b/client/local/local.go index 5c75c0487..5a3571851 100644 --- a/client/local/local.go +++ b/client/local/local.go @@ -34,6 +34,7 @@ import ( "tailscale.com/feature/buildfeatures" "tailscale.com/ipn" "tailscale.com/ipn/ipnstate" + "tailscale.com/net/connreject" "tailscale.com/net/netutil" "tailscale.com/net/udprelay/status" "tailscale.com/paths" @@ -451,6 +452,23 @@ func (lc *Client) EventBusQueues(ctx context.Context) ([]byte, error) { return lc.get200(ctx, "/localapi/v0/debug-bus-queues") } +// DebugRejects returns the node's aggregated connection-rejection +// diagnostics. The feature is gated at runtime by +// [tailcfg.NodeAttrConnReject]; when that attribute is not set on the +// node, the response is returned with Enabled=false and empty +// Outgoing/Incoming slices. +func (lc *Client) DebugRejects(ctx context.Context) (*connreject.DebugRejectsResponse, error) { + body, err := lc.get200(ctx, "/localapi/v0/debug-rejects") + if err != nil { + return nil, err + } + resp, err := decodeJSON[connreject.DebugRejectsResponse](body) + if err != nil { + return nil, err + } + return &resp, nil +} + // StreamBusEvents returns an iterator of Tailscale bus events as they arrive. // Each pair is a valid event and a nil error, or a zero event a non-nil error. // In case of error, the iterator ends after the pair reporting the error. diff --git a/feature/buildfeatures/feature_connreject_disabled.go b/feature/buildfeatures/feature_connreject_disabled.go new file mode 100644 index 000000000..cd80fd82f --- /dev/null +++ b/feature/buildfeatures/feature_connreject_disabled.go @@ -0,0 +1,13 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Code generated by gen.go; DO NOT EDIT. 
+ +//go:build ts_omit_connreject + +package buildfeatures + +// HasConnReject is whether the binary was built with support for modular feature "Connection-rejection diagnostics (TSMP rejects, pendopen timeouts) exposed over debug-rejects LocalAPI and c2n endpoints". +// Specifically, it's whether the binary was NOT built with the "ts_omit_connreject" build tag. +// It's a const so it can be used for dead code elimination. +const HasConnReject = false diff --git a/feature/buildfeatures/feature_connreject_enabled.go b/feature/buildfeatures/feature_connreject_enabled.go new file mode 100644 index 000000000..8b3af638f --- /dev/null +++ b/feature/buildfeatures/feature_connreject_enabled.go @@ -0,0 +1,13 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Code generated by gen.go; DO NOT EDIT. + +//go:build !ts_omit_connreject + +package buildfeatures + +// HasConnReject is whether the binary was built with support for modular feature "Connection-rejection diagnostics (TSMP rejects, pendopen timeouts) exposed over debug-rejects LocalAPI and c2n endpoints". +// Specifically, it's whether the binary was NOT built with the "ts_omit_connreject" build tag. +// It's a const so it can be used for dead code elimination. 
+const HasConnReject = true diff --git a/feature/condregister/maybe_connreject.go b/feature/condregister/maybe_connreject.go new file mode 100644 index 000000000..08b326a7b --- /dev/null +++ b/feature/condregister/maybe_connreject.go @@ -0,0 +1,8 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !ts_omit_connreject + +package condregister + +import _ "tailscale.com/feature/connreject" diff --git a/feature/connreject/connreject.go b/feature/connreject/connreject.go new file mode 100644 index 000000000..329133d53 --- /dev/null +++ b/feature/connreject/connreject.go @@ -0,0 +1,163 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Package connreject registers the connection-rejection diagnostics +// feature. It owns a [*connreject.Aggregator] that records rejection +// events and exposes them via the debug-rejects LocalAPI endpoint and +// the GET /debug/rejects c2n endpoint. +// +// The feature wires itself into the local backend's TUN wrapper and +// engine via callbacks installed at extension Init. Recording is gated +// at runtime by the [tailcfg.NodeAttrConnReject] node attribute; when +// the attribute is not set, recorded events are silently dropped by the +// aggregator. +package connreject + +import ( + "encoding/json" + "net/http" + + "tailscale.com/feature" + "tailscale.com/ipn/ipnext" + "tailscale.com/ipn/ipnlocal" + "tailscale.com/ipn/localapi" + "tailscale.com/net/connreject" + "tailscale.com/tailcfg" + "tailscale.com/types/logger" + "tailscale.com/util/httpm" +) + +// featureName is the feature tag used for feature.Register and the +// ipnext extension name. 
+const featureName = "connreject" + +func init() { + feature.Register(featureName) + ipnext.RegisterExtension(featureName, newExtension) + localapi.Register("debug-rejects", serveDebugRejects) + ipnlocal.RegisterC2N("GET /debug/rejects", handleC2NDebugRejects) +} + +// extension owns a per-LocalBackend [connreject.Aggregator] and installs note +// callbacks on the engine and tundev that feed it. +type extension struct { + logf logger.Logf + sb ipnext.SafeBackend + agg *connreject.Aggregator +} + +func newExtension(logf logger.Logf, sb ipnext.SafeBackend) (ipnext.Extension, error) { + return &extension{ + logf: logger.WithPrefix(logf, featureName+": "), + sb: sb, + agg: connreject.NewAggregator(connreject.DefaultMax()), + }, nil +} + +// Name implements [ipnext.Extension]. +func (e *extension) Name() string { return featureName } + +// Init implements [ipnext.Extension]. It installs note callbacks on the +// tundev and engine so they can deliver rejection events to the +// aggregator, and subscribes to self-node changes to flip the runtime +// enable bit. +// +// Init tolerates a nil SafeBackend (e.g. in unit tests that construct +// the extension directly without going through newExtension); in that +// case the note callbacks aren't installed but OnSelfChange still works. +func (e *extension) Init(host ipnext.Host) error { + if e.sb != nil { + if tun := e.sb.Sys().Tun.Get(); tun != nil { + tun.SetConnRejectNote(e.note) + } + if eng := e.sb.Sys().Engine.Get(); eng != nil { + eng.SetConnRejectNote(e.note) + } + } + host.Hooks().OnSelfChange.Add(e.onSelfChange) + return nil +} + +// note delivers a rejection event to the aggregator. The aggregator +// applies its own enable gate and direction dispatch. +func (e *extension) note(evt connreject.Event) { + e.agg.Note(evt) +} + +// Shutdown implements [ipnext.Extension]. It uninstalls the note +// callbacks and disables the aggregator. 
+func (e *extension) Shutdown() error { + e.uninstallNotes() + e.agg.SetEnabled(false) + return nil +} + +// uninstallNotes clears the note callbacks installed during Init. It is +// safe to call when no SafeBackend was wired (e.g. unit tests). +func (e *extension) uninstallNotes() { + if e.sb == nil { + return + } + if tun := e.sb.Sys().Tun.Get(); tun != nil { + tun.SetConnRejectNote(nil) + } + if eng := e.sb.Sys().Engine.Get(); eng != nil { + eng.SetConnRejectNote(nil) + } +} + +func (e *extension) onSelfChange(self tailcfg.NodeView) { + enabled := self.HasCap(tailcfg.NodeAttrConnReject) + if prev := e.agg.SetEnabled(enabled); prev != enabled { + if enabled { + e.logf("enabled via NodeAttrConnReject") + } else { + e.logf("disabled via NodeAttrConnReject") + } + } +} + +func buildResponse(a *connreject.Aggregator) connreject.DebugRejectsResponse { + return connreject.DebugRejectsResponse{ + Enabled: a.Enabled(), + Outgoing: a.Outgoing(), + Incoming: a.Incoming(), + } +} + +func serveDebugRejects(h *localapi.Handler, w http.ResponseWriter, r *http.Request) { + if !h.PermitRead { + http.Error(w, "debug-rejects access denied", http.StatusForbidden) + return + } + if r.Method != httpm.GET { + http.Error(w, "GET required", http.StatusMethodNotAllowed) + return + } + var e *extension + if !h.LocalBackend().FindMatchingExtension(&e) { + http.Error(w, "connreject extension unavailable", http.StatusInternalServerError) + return + } + writeJSON(w, buildResponse(e.agg)) +} + +func handleC2NDebugRejects(lb *ipnlocal.LocalBackend, w http.ResponseWriter, r *http.Request) { + if r.Method != httpm.GET { + http.Error(w, "GET required", http.StatusMethodNotAllowed) + return + } + var e *extension + if !lb.FindMatchingExtension(&e) { + http.Error(w, "connreject extension unavailable", http.StatusInternalServerError) + return + } + writeJSON(w, buildResponse(e.agg)) +} + +func writeJSON(w http.ResponseWriter, v any) { + w.Header().Set("Content-Type", "application/json") + 
json.NewEncoder(w).Encode(v) +} + + diff --git a/feature/connreject/connreject_test.go b/feature/connreject/connreject_test.go new file mode 100644 index 000000000..25a10951b --- /dev/null +++ b/feature/connreject/connreject_test.go @@ -0,0 +1,153 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package connreject + +import ( + "net/netip" + "testing" + + "tailscale.com/net/connreject" + "tailscale.com/tailcfg" + "tailscale.com/types/ipproto" + "tailscale.com/types/logger" +) + +// newTestExtension returns a fresh extension backed by an isolated +// Aggregator (no SafeBackend wiring), suitable for per-test use in +// parallel. +func newTestExtension() *extension { + return &extension{ + logf: logger.Discard, + agg: connreject.NewAggregator(8), + } +} + +func sampleOutgoingEvent() connreject.Event { + return connreject.Event{ + Direction: connreject.Outgoing, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort("100.0.0.1:1001"), + Dst: netip.MustParseAddrPort("100.0.0.2:443"), + Reason: connreject.ReasonACL, + Source: connreject.SourceTSMPRecv, + } +} + +func sampleIncomingEvent() connreject.Event { + return connreject.Event{ + Direction: connreject.Incoming, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort("100.0.0.2:5555"), + Dst: netip.MustParseAddrPort("100.0.0.1:22"), + Reason: connreject.ReasonACL, + Source: connreject.SourceTSMPSent, + } +} + +// TestOnEventDeliversToAggregator verifies that events reaching the +// extension's note callback are forwarded to the aggregator, which then +// partitions by direction into its own buffers. 
+func TestOnEventDeliversToAggregator(t *testing.T) { + t.Parallel() + e := newTestExtension() + e.agg.SetEnabled(true) + + e.note(sampleOutgoingEvent()) + e.note(sampleIncomingEvent()) + + if got := len(e.agg.Outgoing()); got != 1 { + t.Errorf("outgoing len = %d, want 1", got) + } + if got := len(e.agg.Incoming()); got != 1 { + t.Errorf("incoming len = %d, want 1", got) + } +} + +func TestOnEventDroppedWhenDisabled(t *testing.T) { + t.Parallel() + e := newTestExtension() + + e.note(sampleOutgoingEvent()) + e.note(sampleIncomingEvent()) + + if got := len(e.agg.Outgoing()); got != 0 { + t.Errorf("outgoing len = %d, want 0 when disabled", got) + } + if got := len(e.agg.Incoming()); got != 0 { + t.Errorf("incoming len = %d, want 0 when disabled", got) + } +} + +func TestExtensionOnSelfChange(t *testing.T) { + t.Parallel() + e := newTestExtension() + + // NodeView without the cap → stays off. + noCap := (&tailcfg.Node{}).View() + e.onSelfChange(noCap) + if e.agg.Enabled() { + t.Error("enabled = true after self-change with no cap, want false") + } + + // NodeView with the cap → turns on. + withCap := (&tailcfg.Node{ + CapMap: tailcfg.NodeCapMap{ + tailcfg.NodeAttrConnReject: nil, + }, + }).View() + e.onSelfChange(withCap) + if !e.agg.Enabled() { + t.Error("enabled = false after self-change with cap, want true") + } + + // Flipping back off works. + e.onSelfChange(noCap) + if e.agg.Enabled() { + t.Error("enabled = true after flip back, want false") + } +} + +func TestExtensionShutdownDisables(t *testing.T) { + t.Parallel() + e := newTestExtension() + e.agg.SetEnabled(true) + + if err := e.Shutdown(); err != nil { + t.Fatalf("Shutdown error: %v", err) + } + if e.agg.Enabled() { + t.Error("enabled still true after Shutdown") + } +} + +// TestBuildResponse covers the JSON-payload shape served by both the +// LocalAPI and c2n handlers. 
The handlers themselves are pure routing +// boilerplate (method check, extension lookup, JSON marshal) and are +// not exercised here; verifying buildResponse plus the +// FindMatchingExtension contract on LocalBackend is sufficient. +func TestBuildResponse(t *testing.T) { + t.Parallel() + e := newTestExtension() + e.agg.SetEnabled(true) + e.note(sampleOutgoingEvent()) + e.note(sampleIncomingEvent()) + + resp := buildResponse(e.agg) + if !resp.Enabled { + t.Error("Enabled = false, want true") + } + if got := len(resp.Outgoing); got != 1 { + t.Errorf("len(Outgoing) = %d, want 1", got) + } + if got := len(resp.Incoming); got != 1 { + t.Errorf("len(Incoming) = %d, want 1", got) + } + + // Disabled aggregator yields empty slices and Enabled=false. + e.agg.SetEnabled(false) + resp = buildResponse(e.agg) + if resp.Enabled { + t.Error("Enabled = true after SetEnabled(false), want false") + } +} diff --git a/feature/featuretags/featuretags.go b/feature/featuretags/featuretags.go index e44a4f592..4f1bbdfbb 100644 --- a/feature/featuretags/featuretags.go +++ b/feature/featuretags/featuretags.go @@ -140,6 +140,10 @@ var Features = map[FeatureTag]FeatureMeta{ }, "completion": {Sym: "Completion", Desc: "CLI shell completion"}, "conn25": {Sym: "Conn25", Desc: "Route traffic for configured domains through connector devices"}, + "connreject": { + Sym: "ConnReject", + Desc: "Connection-rejection diagnostics (TSMP rejects, pendopen timeouts) exposed over debug-rejects LocalAPI and c2n endpoints", + }, "completion_scripts": { Sym: "CompletionScripts", Desc: "embed CLI shell completion scripts", Deps: []FeatureTag{"completion"}, diff --git a/ipn/ipnlocal/state_test.go b/ipn/ipnlocal/state_test.go index 104c29a3f..ffe6af5f8 100644 --- a/ipn/ipnlocal/state_test.go +++ b/ipn/ipnlocal/state_test.go @@ -27,6 +27,7 @@ import ( "tailscale.com/ipn/store/mem" "tailscale.com/net/dns" "tailscale.com/net/netmon" + "tailscale.com/net/connreject" "tailscale.com/net/packet" 
"tailscale.com/net/tsdial" "tailscale.com/tailcfg" @@ -1940,6 +1941,8 @@ func (e *mockEngine) InstallCaptureHook(packet.CaptureCallback) {} func (e *mockEngine) SetPeerByIPPacketFunc(func(netip.Addr) (_ key.NodePublic, ok bool)) {} +func (e *mockEngine) SetConnRejectNote(func(connreject.Event)) {} + func (e *mockEngine) Close() { e.mu.Lock() defer e.mu.Unlock() diff --git a/net/connreject/connreject.go b/net/connreject/connreject.go new file mode 100644 index 000000000..d0598d227 --- /dev/null +++ b/net/connreject/connreject.go @@ -0,0 +1,320 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Package connreject provides a bounded, in-memory, per-direction +// aggregator of recent connection-rejection events observed by the +// local node. +// +// A rejection event is any evidence that a connection attempt was +// blocked or dropped: a peer's TSMP reject message, an outbound TSMP +// reject we emitted because a peer's inbound connection violated our +// ACLs, a pendopen timeout where we never saw a reply, etc. +// +// The aggregator is split by [Direction]: +// - [Outgoing]: connections initiated by this node that were rejected +// or otherwise failed (keyed on the peer's address — "who we failed +// to reach"). +// - [Incoming]: connections from a peer that this node rejected +// (keyed on the peer's address — "who we blocked"). +// +// Repeated observations for the same (direction, proto, peer-address, +// reason) are aggregated into a single [Event], bumping Count and Last. +// LRU-by-count eviction keeps each direction's buffer bounded. +// +// This package contains only the data types and aggregator implementation; +// wiring into the rest of the system is done by feature/connreject. 
+package connreject + +import ( + "container/list" + "net/netip" + "runtime" + "sync" + "sync/atomic" + "time" + + "tailscale.com/types/ipproto" +) + +// Direction describes whether a rejection applied to a connection this +// node initiated (Outgoing) or a connection a peer initiated toward us +// (Incoming). +// +// Direction is a string type so it serializes directly to human-readable +// JSON without requiring MarshalText/UnmarshalText boilerplate. +type Direction string + +// Direction values. +const ( + DirectionUnknown Direction = "" + Outgoing Direction = "outgoing" + Incoming Direction = "incoming" +) + +// Source describes how this node learned of a rejection. +type Source string + +// Source values. +const ( + SourceUnknown Source = "" + // SourceTSMPRecv is a TSMP reject message received from a peer. + SourceTSMPRecv Source = "tsmp_recv" + // SourceTSMPSent is a TSMP reject message this node emitted to a peer. + SourceTSMPSent Source = "tsmp_sent" + // SourcePendOpenTimeout is a pendopen-timer expiry with no response. + SourcePendOpenTimeout Source = "pendopen_timeout" +) + +// Reason is a short, stable, machine-readable tag describing why a +// connection was rejected. Reason is part of the aggregation key in +// [Aggregator]. +// +// These values are part of the wire format exposed over the debug +// endpoints; do not rename or repurpose them without bumping the +// capability version. +type Reason string + +// Reason values. +const ( + ReasonUnknown Reason = "unknown" + ReasonACL Reason = "acl" + ReasonShields Reason = "shields" + ReasonHostIPForwarding Reason = "host-ip-forwarding" + ReasonHostFirewall Reason = "host-firewall" + ReasonNoPeer Reason = "no-peer" + ReasonPeerUnreachable Reason = "peer-unreachable" + ReasonTimeout Reason = "timeout" +) + +// Event is a single aggregated rejection observation. +// +// Events are aggregated by [Aggregator] on +// (Direction, Proto, peer-address, Reason). 
Src and Dst carry the ports +// observed on the most recent occurrence but do not participate in the +// aggregation key. +type Event struct { + // First is when this (key) was first observed. If unset on a Note, + // the aggregator fills it with the current time. + First time.Time + // Last is the most recent observation of this (key). If unset on a + // Note, the aggregator fills it with the current time. + Last time.Time + // Count is the number of observations merged into this Event. If + // unset on a Note, the aggregator treats it as 1. + Count uint32 + // Direction indicates whether the event applies to an outgoing or + // incoming connection attempt. + Direction Direction + // Proto is the transport protocol of the rejected flow. + Proto ipproto.Proto + // Src is a representative source endpoint from the most recent + // observation. For Outgoing events this is our side; for Incoming + // events this is the peer side. + Src netip.AddrPort + // Dst is a representative destination endpoint from the most recent + // observation. For Outgoing events this is the peer side; for + // Incoming events this is our side. + Dst netip.AddrPort + // Reason is the machine-readable reason tag. + Reason Reason + // Source describes how we learned about this event. + Source Source + // MaybeBroken is true for the non-terminal form of an inbound TSMP + // reject (see packet.TailscaleRejectedHeader.MaybeBroken). It is + // informational and only meaningful when Source == SourceTSMPRecv. + MaybeBroken bool +} + +// peerAddr returns the peer-side address for aggregation keying, based on +// the event's direction. For Outgoing, the peer is Dst; for Incoming, Src. +func (e Event) peerAddr() netip.Addr { + switch e.Direction { + case Outgoing: + return e.Dst.Addr() + case Incoming: + return e.Src.Addr() + } + return netip.Addr{} +} + +// aggKey is the aggregation key for an entry in an [Aggregator]. 
+type aggKey struct { + dir Direction + proto ipproto.Proto + addr netip.Addr + reason Reason +} + +// entry is an internal aggregator entry. +type entry struct { + k aggKey + e Event +} + +// dirBuf is an internal LRU+map for a single direction. +type dirBuf struct { + m map[aggKey]*list.Element // elements hold *entry + order *list.List // oldest (LRU) at Front, newest at Back +} + +func newDirBuf() dirBuf { + return dirBuf{ + m: make(map[aggKey]*list.Element), + order: list.New(), + } +} + +// Aggregator owns bounded, per-direction LRU buffers for rejection +// [Event]s and a runtime enable flag. It is safe for concurrent use. +// +// An Aggregator is per-[LocalBackend] (or per-test); there is no +// package-level state. +type Aggregator struct { + max int // per-direction LRU capacity; <= 0 disables recording. + + enabled atomic.Bool + + mu sync.Mutex + outgoing dirBuf + incoming dirBuf +} + +// NewAggregator returns an [Aggregator] with per-direction LRU buffers +// sized to max entries each. It starts disabled (calls to Note are +// silently dropped until SetEnabled(true) is called). +// +// A max of 0 or negative disables recording entirely. +func NewAggregator(max int) *Aggregator { + return &Aggregator{ + max: max, + outgoing: newDirBuf(), + incoming: newDirBuf(), + } +} + +// SetEnabled sets the runtime enable flag. When false, Note is a silent +// no-op. SetEnabled returns the previous value. +func (a *Aggregator) SetEnabled(v bool) (prev bool) { + return a.enabled.Swap(v) +} + +// Enabled reports whether the aggregator is currently enabled. +func (a *Aggregator) Enabled() bool { return a.enabled.Load() } + +// Note records a rejection observation. It is a no-op if the aggregator +// is disabled, if max is non-positive, or if e.Direction is not a +// routable value (Outgoing or Incoming). 
+// +// If an existing entry matches the aggregation key +// (Direction, Proto, peer-address, Reason), its Last, Count, Src, Dst, +// Source, and MaybeBroken are updated with the new observation, and it +// is moved to the "most recent" end of its buffer. Otherwise a new entry +// is appended, evicting the oldest entry if the buffer is over capacity. +func (a *Aggregator) Note(e Event) { + if !a.enabled.Load() || a.max <= 0 { + return + } + buf := a.bufFor(e.Direction) + if buf == nil { + return + } + if e.Last.IsZero() { + e.Last = time.Now() + } + if e.First.IsZero() { + e.First = e.Last + } + if e.Count == 0 { + e.Count = 1 + } + k := aggKey{ + dir: e.Direction, + proto: e.Proto, + addr: e.peerAddr(), + reason: e.Reason, + } + + a.mu.Lock() + defer a.mu.Unlock() + + if el, ok := buf.m[k]; ok { + ent := el.Value.(*entry) + ent.e.Last = e.Last + ent.e.Count += e.Count + ent.e.Src = e.Src + ent.e.Dst = e.Dst + ent.e.Source = e.Source + ent.e.MaybeBroken = e.MaybeBroken + buf.order.MoveToBack(el) + return + } + + buf.m[k] = buf.order.PushBack(&entry{k: k, e: e}) + + // Evict oldest if over capacity. + for buf.order.Len() > a.max { + old := buf.order.Front() + delete(buf.m, old.Value.(*entry).k) + buf.order.Remove(old) + } +} + +// Outgoing returns a snapshot of the outgoing-rejection events in +// oldest-to-newest order. +func (a *Aggregator) Outgoing() []Event { return a.snapshot(&a.outgoing) } + +// Incoming returns a snapshot of the incoming-rejection events in +// oldest-to-newest order. +func (a *Aggregator) Incoming() []Event { return a.snapshot(&a.incoming) } + +func (a *Aggregator) snapshot(buf *dirBuf) []Event { + a.mu.Lock() + defer a.mu.Unlock() + out := make([]Event, 0, buf.order.Len()) + for el := buf.order.Front(); el != nil; el = el.Next() { + out = append(out, el.Value.(*entry).e) + } + return out +} + +// bufFor returns the internal buffer for a direction, or nil if the +// direction is not a recognized routable value. 
+//
+// The returned pointer is stable for the lifetime of a; safe to use
+// before acquiring a.mu.
+func (a *Aggregator) bufFor(d Direction) *dirBuf {
+	switch d {
+	case Outgoing:
+		return &a.outgoing
+	case Incoming:
+		return &a.incoming
+	}
+	return nil
+}
+
+// DefaultMax returns the default maximum number of entries per
+// direction. On mobile platforms the default is smaller to respect tight
+// memory budgets.
+func DefaultMax() int {
+	switch runtime.GOOS {
+	case "ios", "android", "tvos":
+		return 32
+	}
+	return 256
+}
+
+// DebugRejectsResponse is the JSON response body shared by the
+// debug-rejects LocalAPI endpoint, the GET /debug/rejects c2n endpoint,
+// and the [client/local.Client.DebugRejects] method.
+type DebugRejectsResponse struct {
+	// Enabled is whether [tailcfg.NodeAttrConnReject] is currently set
+	// on the node. When false, recording is paused; entries recorded
+	// while it was previously enabled may still be returned.
+	Enabled bool
+	// Outgoing is the list of rejections observed for connections this
+	// node initiated.
+	Outgoing []Event
+	// Incoming is the list of rejections this node has emitted for
+	// connections initiated by peers.
+ Incoming []Event +} diff --git a/net/connreject/connreject_test.go b/net/connreject/connreject_test.go new file mode 100644 index 000000000..09c99eee7 --- /dev/null +++ b/net/connreject/connreject_test.go @@ -0,0 +1,325 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package connreject + +import ( + "encoding/json" + "net/netip" + "strings" + "testing" + "time" + + "tailscale.com/types/ipproto" +) + +func outEvent(src, dst string, reason Reason, t time.Time) Event { + return Event{ + Last: t, + Direction: Outgoing, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort(src), + Dst: netip.MustParseAddrPort(dst), + Reason: reason, + Source: SourceTSMPRecv, + } +} + +func inEvent(src, dst string, reason Reason, t time.Time) Event { + return Event{ + Last: t, + Direction: Incoming, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort(src), + Dst: netip.MustParseAddrPort(dst), + Reason: reason, + Source: SourceTSMPSent, + } +} + +// newEnabled returns an Aggregator with the given max, with +// SetEnabled(true) already called. +func newEnabled(max int) *Aggregator { + a := NewAggregator(max) + a.SetEnabled(true) + return a +} + +func TestAggregatorDisabledByDefault(t *testing.T) { + t.Parallel() + a := NewAggregator(8) + if a.Enabled() { + t.Error("new Aggregator is enabled; want disabled by default") + } + a.Note(outEvent("100.0.0.1:1", "100.0.0.2:2", ReasonACL, time.Now())) + a.Note(inEvent("100.0.0.2:2", "100.0.0.1:1", ReasonACL, time.Now())) + if got := len(a.Outgoing()); got != 0 { + t.Errorf("outgoing len = %d, want 0", got) + } + if got := len(a.Incoming()); got != 0 { + t.Errorf("incoming len = %d, want 0", got) + } +} + +func TestAggregatorRejectsUnknownDirection(t *testing.T) { + t.Parallel() + a := newEnabled(8) + // Event with no Direction set: silently dropped. 
+ a.Note(Event{ + Last: time.Now(), + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort("100.0.0.1:1"), + Dst: netip.MustParseAddrPort("100.0.0.2:2"), + Reason: ReasonACL, + }) + if got := len(a.Outgoing()) + len(a.Incoming()); got != 0 { + t.Errorf("aggregator recorded event with unknown direction: %d entries", got) + } +} + +func TestAggregatorAggregatesByKey(t *testing.T) { + t.Parallel() + a := newEnabled(8) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1002", "100.0.0.2:443", ReasonACL, now.Add(1*time.Second))) + a.Note(outEvent("100.0.0.1:1003", "100.0.0.2:443", ReasonACL, now.Add(2*time.Second))) + + all := a.Outgoing() + if len(all) != 1 { + t.Fatalf("got %d entries, want 1 (aggregated)", len(all)) + } + e := all[0] + if e.Count != 3 { + t.Errorf("Count = %d, want 3", e.Count) + } + if !e.First.Equal(now) { + t.Errorf("First = %v, want %v", e.First, now) + } + if !e.Last.Equal(now.Add(2 * time.Second)) { + t.Errorf("Last = %v, want %v", e.Last, now.Add(2*time.Second)) + } + if got, want := e.Src, netip.MustParseAddrPort("100.0.0.1:1003"); got != want { + t.Errorf("Src = %v, want %v", got, want) + } +} + +func TestAggregatorDistinctByReason(t *testing.T) { + t.Parallel() + a := newEnabled(8) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1002", "100.0.0.2:443", ReasonShields, now.Add(1*time.Second))) + + if got := len(a.Outgoing()); got != 2 { + t.Errorf("outgoing len = %d, want 2 (distinct reasons)", got) + } +} + +func TestAggregatorDistinctByProto(t *testing.T) { + t.Parallel() + a := newEnabled(8) + now := time.Now() + e1 := outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now) + e2 := outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now.Add(time.Second)) + e2.Proto = ipproto.UDP + + a.Note(e1) + a.Note(e2) + + if got := len(a.Outgoing()); got != 2 { + t.Errorf("outgoing len = %d, want 2 (distinct 
protos)", got) + } +} + +func TestAggregatorDistinctByPeerAddr(t *testing.T) { + t.Parallel() + a := newEnabled(8) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.3:443", ReasonACL, now.Add(time.Second))) + + if got := len(a.Outgoing()); got != 2 { + t.Errorf("outgoing len = %d, want 2 (distinct destinations)", got) + } +} + +func TestAggregatorPortNotInKey(t *testing.T) { + t.Parallel() + // For Outgoing, the peer address is Dst.Addr, so Dst port + // differences should aggregate into one entry. + a := newEnabled(8) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:8443", ReasonACL, now.Add(time.Second))) + + if got := len(a.Outgoing()); got != 1 { + t.Errorf("outgoing len = %d, want 1 (dst port is not part of key)", got) + } +} + +func TestAggregatorIncomingKeyedBySrcAddr(t *testing.T) { + t.Parallel() + a := newEnabled(8) + now := time.Now() + // Same peer (100.0.0.2), different dst ports on us: should aggregate. + a.Note(inEvent("100.0.0.2:5555", "100.0.0.1:22", ReasonACL, now)) + a.Note(inEvent("100.0.0.2:5556", "100.0.0.1:443", ReasonACL, now.Add(time.Second))) + if got := len(a.Incoming()); got != 1 { + t.Errorf("incoming len = %d, want 1 (src addr is the peer; dst port ignored)", got) + } + + // Different peer: a second entry. + a.Note(inEvent("100.0.0.3:5555", "100.0.0.1:22", ReasonACL, now.Add(2*time.Second))) + if got := len(a.Incoming()); got != 2 { + t.Errorf("incoming len = %d, want 2 (distinct source peers)", got) + } +} + +func TestAggregatorOutgoingAndIncomingIndependent(t *testing.T) { + t.Parallel() + // A single Aggregator's buffers are independent: a hit in one + // direction does not aggregate with the other. 
+ a := newEnabled(8) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(inEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + + if got := len(a.Outgoing()); got != 1 { + t.Errorf("outgoing len = %d, want 1", got) + } + if got := len(a.Incoming()); got != 1 { + t.Errorf("incoming len = %d, want 1", got) + } +} + +func TestAggregatorLRUEviction(t *testing.T) { + t.Parallel() + a := newEnabled(2) + now := time.Now() + + // Three distinct reasons → three distinct keys; cap is 2. + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonShields, now.Add(1*time.Second))) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonHostFirewall, now.Add(2*time.Second))) + + all := a.Outgoing() + if len(all) != 2 { + t.Fatalf("len = %d, want 2", len(all)) + } + // Oldest (ReasonACL) should have been evicted. + want := []Reason{ReasonShields, ReasonHostFirewall} + for i := range all { + if all[i].Reason != want[i] { + t.Errorf("entry[%d].Reason = %q, want %q", i, all[i].Reason, want[i]) + } + } +} + +func TestAggregatorLRUMovesOnHit(t *testing.T) { + t.Parallel() + a := newEnabled(2) + now := time.Now() + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, now)) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonShields, now.Add(1*time.Second))) + // Hit the oldest entry; it should move to the back. + a.Note(outEvent("100.0.0.1:1002", "100.0.0.2:443", ReasonACL, now.Add(2*time.Second))) + // Now add a new one; ReasonShields should be evicted, not ReasonACL. 
+ a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonHostFirewall, now.Add(3*time.Second))) + + all := a.Outgoing() + if len(all) != 2 { + t.Fatalf("len = %d, want 2", len(all)) + } + want := []Reason{ReasonACL, ReasonHostFirewall} + for i := range all { + if all[i].Reason != want[i] { + t.Errorf("entry[%d].Reason = %q, want %q", i, all[i].Reason, want[i]) + } + } +} + +func TestAggregatorZeroMaxDisables(t *testing.T) { + t.Parallel() + a := newEnabled(0) + a.Note(outEvent("100.0.0.1:1001", "100.0.0.2:443", ReasonACL, time.Now())) + if got := len(a.Outgoing()); got != 0 { + t.Errorf("outgoing = %d, want 0 (max<=0 disables)", got) + } +} + +func TestEventAutoTimestamps(t *testing.T) { + t.Parallel() + a := newEnabled(8) + e := Event{ + Direction: Outgoing, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort("100.0.0.1:1001"), + Dst: netip.MustParseAddrPort("100.0.0.2:443"), + Reason: ReasonACL, + } + before := time.Now() + a.Note(e) + after := time.Now() + + all := a.Outgoing() + if len(all) != 1 { + t.Fatalf("len = %d, want 1", len(all)) + } + got := all[0] + if got.First.Before(before) || got.First.After(after) { + t.Errorf("First = %v, not in [%v, %v]", got.First, before, after) + } + if got.Last.Before(got.First) { + t.Errorf("Last %v before First %v", got.Last, got.First) + } + if got.Count != 1 { + t.Errorf("Count = %d, want 1", got.Count) + } +} + +func TestEventJSONRoundTrip(t *testing.T) { + t.Parallel() + // Because Direction/Source/Reason are typed strings, encoding/json + // handles them natively with no MarshalText boilerplate. 
+ want := Event{ + First: time.Unix(1700000000, 0).UTC(), + Last: time.Unix(1700000060, 0).UTC(), + Count: 3, + Direction: Outgoing, + Proto: ipproto.TCP, + Src: netip.MustParseAddrPort("100.0.0.1:1001"), + Dst: netip.MustParseAddrPort("100.0.0.2:443"), + Reason: ReasonACL, + Source: SourceTSMPRecv, + } + b, err := json.Marshal(want) + if err != nil { + t.Fatal(err) + } + s := string(b) + for _, fragment := range []string{ + `"Direction":"outgoing"`, + `"Source":"tsmp_recv"`, + `"Reason":"acl"`, + `"Count":3`, + } { + if !strings.Contains(s, fragment) { + t.Errorf("JSON missing %q; got %s", fragment, s) + } + } + var got Event + if err := json.Unmarshal(b, &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got.Direction != want.Direction || + got.Source != want.Source || + got.Reason != want.Reason || + got.Count != want.Count || + got.Src != want.Src || + got.Dst != want.Dst { + t.Errorf("roundtrip mismatch\n got=%+v\nwant=%+v", got, want) + } +} + + diff --git a/net/flowtrack/flowtrack.go b/net/flowtrack/flowtrack.go index 5df34a509..8c5ad331f 100644 --- a/net/flowtrack/flowtrack.go +++ b/net/flowtrack/flowtrack.go @@ -52,8 +52,19 @@ func (t Tuple) DstAddr() netip.Addr { return netip.AddrFrom16(t.dst).Unmap() } -func (t Tuple) SrcPort() uint16 { return t.srcPort } -func (t Tuple) DstPort() uint16 { return t.dstPort } +func (t Tuple) SrcPort() uint16 { return t.srcPort } +func (t Tuple) DstPort() uint16 { return t.dstPort } +func (t Tuple) Proto() ipproto.Proto { return t.proto } + +// Src returns the tuple's source endpoint as a netip.AddrPort. +func (t Tuple) Src() netip.AddrPort { + return netip.AddrPortFrom(t.SrcAddr(), t.srcPort) +} + +// Dst returns the tuple's destination endpoint as a netip.AddrPort. 
+func (t Tuple) Dst() netip.AddrPort { + return netip.AddrPortFrom(t.DstAddr(), t.dstPort) +} func (t Tuple) String() string { return fmt.Sprintf("(%v %v => %v)", t.proto, diff --git a/net/tstun/connreject_test.go b/net/tstun/connreject_test.go new file mode 100644 index 000000000..3f866bba1 --- /dev/null +++ b/net/tstun/connreject_test.go @@ -0,0 +1,107 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +package tstun + +import ( + "testing" + + "go4.org/netipx" + "tailscale.com/net/connreject" + "tailscale.com/net/packet" + "tailscale.com/types/ipproto" + "tailscale.com/types/logger" + "tailscale.com/util/eventbus/eventbustest" + "tailscale.com/wgengine/filter" +) + +// TestOutboundTSMPRecordsRejection verifies that when an inbound TCP4 SYN +// is dropped by the packet filter, the TSMP reject we inject outbound is +// also delivered to the connreject note callback. +func TestOutboundTSMPRecordsRejection(t *testing.T) { + t.Parallel() + bus := eventbustest.NewBus(t) + _, w := newFakeTUN(t.Logf, bus, false) + defer w.Close() + + // The note callback fires synchronously from + // filterPacketInboundFromWireGuard on this test's goroutine, so a + // plain slice (no mutex) is sufficient. + var events []connreject.Event + w.SetConnRejectNote(func(e connreject.Event) { events = append(events, e) }) + + // Drain the outbound queue so InjectOutbound doesn't block. + go func() { + var buf [MaxPacketSize]byte + for { + n, err := w.Read([][]byte{buf[:]}, []int{0}, 0) + _ = n + if err != nil { + return + } + } + }() + + // Install an allow-none filter so the inbound SYN is dropped, + // triggering the TSMP reject emission. 
+ w.disableFilter = false + w.SetFilter(filter.NewAllowNone(logger.Discard, new(netipx.IPSet))) + + pkt := tcp4syn("1.2.3.4", "100.64.1.2", 1234, 60000) + p := new(packet.Parsed) + p.Decode(pkt) + + if res, _ := w.filterPacketInboundFromWireGuard(p, nil, nil, nil); !res.IsDrop() { + t.Fatalf("expected drop, got %v", res) + } + + if len(events) != 1 { + t.Fatalf("got %d events, want 1", len(events)) + } + got := events[0] + if got.Direction != connreject.Incoming { + t.Errorf("Direction = %v, want Incoming", got.Direction) + } + if got.Source != connreject.SourceTSMPSent { + t.Errorf("Source = %v, want tsmp_sent", got.Source) + } + if got.Reason != connreject.ReasonACL { + t.Errorf("Reason = %q, want %q", got.Reason, connreject.ReasonACL) + } + if got.Proto != ipproto.TCP { + t.Errorf("Proto = %v, want TCP", got.Proto) + } + // Src is the remote peer-side of the incoming flow. + if got.Src.Port() != 1234 { + t.Errorf("Src.Port = %d, want 1234", got.Src.Port()) + } + if got.Dst.Port() != 60000 { + t.Errorf("Dst.Port = %d, want 60000", got.Dst.Port()) + } +} + +// TestOutboundTSMPNotRecordedWhenDisabled verifies we skip the note +// callback when disableTSMPRejected is set. 
+func TestOutboundTSMPNotRecordedWhenDisabled(t *testing.T) { + t.Parallel() + bus := eventbustest.NewBus(t) + _, w := newFakeTUN(t.Logf, bus, false) + defer w.Close() + + var events []connreject.Event + w.SetConnRejectNote(func(e connreject.Event) { events = append(events, e) }) + + w.disableFilter = false + w.disableTSMPRejected = true + w.SetFilter(filter.NewAllowNone(logger.Discard, new(netipx.IPSet))) + + pkt := tcp4syn("1.2.3.4", "100.64.1.2", 1234, 60000) + p := new(packet.Parsed) + p.Decode(pkt) + + w.filterPacketInboundFromWireGuard(p, nil, nil, nil) + + if len(events) != 0 { + t.Errorf("got %d events, want 0 when disableTSMPRejected was set", len(events)) + } +} diff --git a/net/tstun/wrap.go b/net/tstun/wrap.go index cd75aff5c..03e8e1812 100644 --- a/net/tstun/wrap.go +++ b/net/tstun/wrap.go @@ -26,6 +26,7 @@ import ( "tailscale.com/envknob" "tailscale.com/feature" "tailscale.com/feature/buildfeatures" + "tailscale.com/net/connreject" "tailscale.com/net/packet" "tailscale.com/net/packet/checksum" "tailscale.com/net/tsaddr" @@ -223,6 +224,14 @@ type Wrapper struct { eventClient *eventbus.Client discoKeyAdvertisementPub *eventbus.Publisher[events.DiscoKeyAdvertisement] + // connRejectNote is an optional callback invoked when this Wrapper + // emits a TSMP reject. It is nil until [SetConnRejectNote] is called + // (typically by the connreject feature extension at startup). + // + // Stored as an atomic so it can be installed safely after the + // Wrapper is in use; once set, callers do not change it again. + connRejectNote atomic.Pointer[func(connreject.Event)] + // tunDevStatsCloser closes TUN device stats polling. It may be nil if // [HookPollTUNDevStats] is unset, or the hook func returned an error. 
tunDevStatsCloser io.Closer @@ -347,6 +356,21 @@ func (t *Wrapper) SetDiscoKey(k key.DiscoPublic) { t.discoKey.Store(k) } +// SetConnRejectNote installs a callback that is invoked when the Wrapper +// emits an outbound TSMP reject for an inbound peer connection that was +// dropped by the packet filter. The callback receives a fully populated +// [connreject.Event] of [connreject.Incoming] direction. +// +// SetConnRejectNote may be called at any time; a nil fn uninstalls any +// previously installed callback. +func (t *Wrapper) SetConnRejectNote(fn func(connreject.Event)) { + if fn == nil { + t.connRejectNote.Store(nil) + return + } + t.connRejectNote.Store(&fn) +} + // isSelfDisco reports whether packet p // looks like a Disco packet from ourselves. // See Issue 1526. @@ -1238,6 +1262,21 @@ func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook pa t.InjectOutbound(pkt) // TODO(bradfitz): also send a TCP RST, after the TSMP message. + + if fnp := t.connRejectNote.Load(); fnp != nil { + reason := connreject.ReasonACL + if rj.Reason == packet.RejectedDueToShieldsUp { + reason = connreject.ReasonShields + } + (*fnp)(connreject.Event{ + Direction: connreject.Incoming, + Proto: rj.Proto, + Src: rj.Src, + Dst: rj.Dst, + Reason: reason, + Source: connreject.SourceTSMPSent, + }) + } } return filter.Drop, gro diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 0cb7597c3..8eee06cc2 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -185,7 +185,8 @@ type CapabilityVersion int // - 136: 2026-04-09: Client understands [NodeAttrDisableLinuxCGNATDropRule] // - 137: 2026-04-15: Client handles 429 responses to /machine/register. // - 138: 2026-03-31: can handle C2N /debug/tka. -const CurrentCapabilityVersion CapabilityVersion = 138 +// - 139: 2026-04-24: Client understands [NodeAttrConnReject]; can handle C2N /debug/rejects. 
+const CurrentCapabilityVersion CapabilityVersion = 139 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2547,6 +2548,15 @@ const ( // STUN queries regardless of inactivity. NodeAttrDebugForceBackgroundSTUN NodeCapability = "debug-always-stun" + // NodeAttrConnReject enables the node's aggregated connection-rejection + // diagnostic buffers (ACL blocks, TSMP rejects, pendopen timeouts) and + // makes them available via the debug-rejects LocalAPI endpoint and the + // GET /debug/rejects c2n endpoint. + // + // This attribute may be removed in a future release once the feature + // is enabled by default. + NodeAttrConnReject NodeCapability = "debug-conn-reject" + // NodeAttrDebugDisableWGTrim disables the lazy WireGuard configuration, // always giving WireGuard the full netmap, even for idle peers. NodeAttrDebugDisableWGTrim NodeCapability = "debug-no-wg-trim" diff --git a/wgengine/pendopen.go b/wgengine/pendopen.go index e816506de..e507400dd 100644 --- a/wgengine/pendopen.go +++ b/wgengine/pendopen.go @@ -14,6 +14,7 @@ import ( "time" "github.com/gaissmai/bart" + "tailscale.com/net/connreject" "tailscale.com/net/flowtrack" "tailscale.com/net/packet" "tailscale.com/net/tstun" @@ -78,6 +79,17 @@ func (e *userspaceEngine) trackOpenPreFilterIn(pp *packet.Parsed, t *tstun.Wrapp } else if f := tsRejectFlow(rh); e.removeFlow(f) { e.logf("open-conn-track: flow %v %v > %v rejected due to %v", rh.Proto, rh.Src, rh.Dst, rh.Reason) } + if fnp := e.connRejectNote.Load(); fnp != nil { + (*fnp)(connreject.Event{ + Direction: connreject.Outgoing, + Proto: rh.Proto, + Src: rh.Src, + Dst: rh.Dst, + Reason: rejectReasonToReason(rh.Reason), + Source: connreject.SourceTSMPRecv, + MaybeBroken: rh.MaybeBroken, + }) + } return } @@ -175,37 +187,55 @@ func (e *userspaceEngine) trackOpenPostFilterOut(pp *packet.Parsed, t *tstun.Wra return filter.Accept } -func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) { - e.mu.Lock() - of, ok := 
e.pendOpen[flow] - if !ok { - // Not a tracked flow, or already handled & deleted. - e.mu.Unlock() - return - } - delete(e.pendOpen, flow) - problem := of.problem - e.mu.Unlock() +// openTimeoutDiag captures what [userspaceEngine.diagnoseOpenTimeout] +// inferred about a pendopen flow that timed out. It is the input to +// [classifyOpenTimeout]. +type openTimeoutDiag struct { + // problem is the TSMP reject reason previously reported by the peer + // via a MaybeBroken TSMP header, or zero if none. + problem packet.TailscaleRejectReason + + // noPeer is true if PeerForIP did not find a peer for the flow. + noPeer bool + // peerUnreachable is true when we identified a Tailscale peer for + // the flow but it cannot currently be reached (pre-0.100 client, + // no HomeDERP, in-netmap-but-unknown-to-WireGuard, etc.). + peerUnreachable bool + // onlyZeroRoute is true if PeerForIP matched the peer only via a + // /0 route (likely a non-selected exit node). The outer code + // deliberately stays silent in this case; classification would be + // misleading. + onlyZeroRoute bool +} + +// diagnoseOpenTimeout examines the engine state and logs a description +// of why a pendopen flow timed out. It returns the populated +// [openTimeoutDiag] so the caller can classify and (optionally) emit a +// connreject event. +func (e *userspaceEngine) diagnoseOpenTimeout(flow flowtrack.Tuple, problem packet.TailscaleRejectReason) openTimeoutDiag { + d := openTimeoutDiag{problem: problem} if !problem.IsZero() { e.logf("open-conn-track: timeout opening %v; peer reported problem: %v", flow, problem) } - // Diagnose why it might've timed out. 
pip, ok := e.PeerForIP(flow.DstAddr()) if !ok { e.logf("open-conn-track: timeout opening %v; no associated peer node", flow) - return + d.noPeer = true + return d } n := pip.Node if !n.IsWireGuardOnly() { if n.DiscoKey().IsZero() { e.logf("open-conn-track: timeout opening %v; peer node %v running pre-0.100", flow, n.Key().ShortString()) - return + d.peerUnreachable = true + return d } if n.HomeDERP() == 0 { e.logf("open-conn-track: timeout opening %v; peer node %v not connected to any DERP relay", flow, n.Key().ShortString()) - return + d.peerUnreachable = true + return d } } @@ -219,19 +249,19 @@ func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) { } } if onlyZeroRoute { - // This node was returned by peerForIP because - // its exit node /0 route(s) matched, but this - // might not be the exit node that's currently - // selected. Rather than log misleading - // errors, just don't log at all for now. - // TODO(bradfitz): update this code to be - // exit-node-aware and make peerForIP return - // the node of the currently selected exit - // node. - return + // This node was returned by peerForIP because its exit + // node /0 route(s) matched, but this might not be the + // exit node that's currently selected. Rather than log + // misleading errors, just don't log at all for now. + // TODO(bradfitz): update this code to be exit-node-aware + // and make peerForIP return the node of the currently + // selected exit node. 
+ d.onlyZeroRoute = true + return d } e.logf("open-conn-track: timeout opening %v; target node %v in netmap but unknown to WireGuard", flow, n.Key().ShortString()) - return + d.peerUnreachable = true + return d } // TODO(bradfitz): figure out what PeerStatus.LastHandshake @@ -259,6 +289,78 @@ func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) { flow, n.Key().ShortString(), online, e.magicConn.LastRecvActivityOfNodeKey(n.Key())) + return d +} + +// classifyOpenTimeout maps an [openTimeoutDiag] to a +// [connreject.Reason] and [connreject.Source] for emission. A returned +// reason of "" means the caller should not emit any event (used for +// the deliberately-silent onlyZeroRoute case). +// +// If d.problem is set, the peer's TSMP-reported reason supersedes our +// own diagnosis and the event is tagged SourceTSMPRecv (the timeout +// confirms the previously non-terminal reject was actually terminal). +func classifyOpenTimeout(d openTimeoutDiag) (connreject.Reason, connreject.Source) { + if d.onlyZeroRoute { + return "", connreject.SourceUnknown + } + if !d.problem.IsZero() { + return rejectReasonToReason(d.problem), connreject.SourceTSMPRecv + } + switch { + case d.noPeer: + return connreject.ReasonNoPeer, connreject.SourcePendOpenTimeout + case d.peerUnreachable: + return connreject.ReasonPeerUnreachable, connreject.SourcePendOpenTimeout + } + return connreject.ReasonTimeout, connreject.SourcePendOpenTimeout +} + +func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) { + e.mu.Lock() + of, ok := e.pendOpen[flow] + if !ok { + // Not a tracked flow, or already handled & deleted. 
+ e.mu.Unlock() + return + } + delete(e.pendOpen, flow) + problem := of.problem + e.mu.Unlock() + + d := e.diagnoseOpenTimeout(flow, problem) + reason, source := classifyOpenTimeout(d) + if reason == "" { + return + } + fnp := e.connRejectNote.Load() + if fnp == nil { + return + } + (*fnp)(connreject.Event{ + Direction: connreject.Outgoing, + Proto: flow.Proto(), + Src: flow.Src(), + Dst: flow.Dst(), + Reason: reason, + Source: source, + }) +} + +// rejectReasonToReason maps a [packet.TailscaleRejectReason] to its +// corresponding [connreject.Reason] tag. +func rejectReasonToReason(r packet.TailscaleRejectReason) connreject.Reason { + switch r { + case packet.RejectedDueToACLs: + return connreject.ReasonACL + case packet.RejectedDueToShieldsUp: + return connreject.ReasonShields + case packet.RejectedDueToIPForwarding: + return connreject.ReasonHostIPForwarding + case packet.RejectedDueToHostFirewall: + return connreject.ReasonHostFirewall + } + return connreject.ReasonUnknown } func durFmt(t time.Time) string { diff --git a/wgengine/pendopen_test.go b/wgengine/pendopen_test.go new file mode 100644 index 000000000..7fc69ef8e --- /dev/null +++ b/wgengine/pendopen_test.go @@ -0,0 +1,177 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !ts_omit_debug + +package wgengine + +import ( + "net/netip" + "testing" + + "tailscale.com/net/connreject" + "tailscale.com/net/packet" + "tailscale.com/types/ipproto" +) + +// TestInboundTSMPRecordsRejection asserts that a parsed inbound TSMP reject +// packet is delivered as an outgoing-direction [connreject.Event] to the +// installed note callback when trackOpenPreFilterIn processes it. 
+func TestInboundTSMPRecordsRejection(t *testing.T) { + t.Parallel() + var events []connreject.Event + e := &userspaceEngine{} + e.SetConnRejectNote(func(evt connreject.Event) { events = append(events, evt) }) + + rh := packet.TailscaleRejectedHeader{ + IPSrc: netip.MustParseAddr("100.0.0.2"), + IPDst: netip.MustParseAddr("100.0.0.1"), + Src: netip.MustParseAddrPort("100.0.0.1:12345"), + Dst: netip.MustParseAddrPort("100.0.0.2:443"), + Proto: ipproto.TCP, + Reason: packet.RejectedDueToACLs, + } + raw := packet.Generate(rh, nil) + var pp packet.Parsed + pp.Decode(raw) + + _ = e.trackOpenPreFilterIn(&pp, nil) + + if len(events) != 1 { + t.Fatalf("got %d events, want 1", len(events)) + } + got := events[0] + if got.Direction != connreject.Outgoing { + t.Errorf("Direction = %v, want Outgoing", got.Direction) + } + if got.Source != connreject.SourceTSMPRecv { + t.Errorf("Source = %v, want tsmp_recv", got.Source) + } + if got.Reason != connreject.ReasonACL { + t.Errorf("Reason = %q, want %q", got.Reason, connreject.ReasonACL) + } + if got.Proto != ipproto.TCP { + t.Errorf("Proto = %v, want TCP", got.Proto) + } + if got.Dst.Addr() != netip.MustParseAddr("100.0.0.2") { + t.Errorf("Dst.Addr = %v, want 100.0.0.2", got.Dst.Addr()) + } +} + +// TestInboundTSMPNonTerminalRecordsMaybeBroken asserts that a MaybeBroken +// TSMP reject is published with MaybeBroken=true. 
+func TestInboundTSMPNonTerminalRecordsMaybeBroken(t *testing.T) { + t.Parallel() + var events []connreject.Event + e := &userspaceEngine{} + e.SetConnRejectNote(func(evt connreject.Event) { events = append(events, evt) }) + + rh := packet.TailscaleRejectedHeader{ + IPSrc: netip.MustParseAddr("100.0.0.2"), + IPDst: netip.MustParseAddr("100.0.0.1"), + Src: netip.MustParseAddrPort("100.0.0.1:12345"), + Dst: netip.MustParseAddrPort("100.0.0.2:443"), + Proto: ipproto.TCP, + Reason: packet.RejectedDueToHostFirewall, + MaybeBroken: true, + } + raw := packet.Generate(rh, nil) + var pp packet.Parsed + pp.Decode(raw) + + _ = e.trackOpenPreFilterIn(&pp, nil) + + if len(events) != 1 { + t.Fatalf("got %d events, want 1", len(events)) + } + got := events[0] + if !got.MaybeBroken { + t.Error("MaybeBroken = false, want true") + } + if got.Reason != connreject.ReasonHostFirewall { + t.Errorf("Reason = %q, want %q", got.Reason, connreject.ReasonHostFirewall) + } +} + +func TestClassifyOpenTimeout(t *testing.T) { + t.Parallel() + tests := []struct { + name string + in openTimeoutDiag + wantReason connreject.Reason + wantSource connreject.Source + }{ + { + name: "onlyZeroRoute suppresses recording", + in: openTimeoutDiag{onlyZeroRoute: true}, + wantReason: "", + wantSource: connreject.SourceUnknown, + }, + { + name: "onlyZeroRoute suppresses even with problem", + in: openTimeoutDiag{onlyZeroRoute: true, problem: packet.RejectedDueToACLs}, + wantReason: "", + wantSource: connreject.SourceUnknown, + }, + { + name: "problem supersedes diagnosis", + in: openTimeoutDiag{problem: packet.RejectedDueToACLs, noPeer: true}, + wantReason: connreject.ReasonACL, + wantSource: connreject.SourceTSMPRecv, + }, + { + name: "problem host-firewall", + in: openTimeoutDiag{problem: packet.RejectedDueToHostFirewall}, + wantReason: connreject.ReasonHostFirewall, + wantSource: connreject.SourceTSMPRecv, + }, + { + name: "noPeer", + in: openTimeoutDiag{noPeer: true}, + wantReason: connreject.ReasonNoPeer, + 
wantSource: connreject.SourcePendOpenTimeout, + }, + { + name: "peerUnreachable", + in: openTimeoutDiag{peerUnreachable: true}, + wantReason: connreject.ReasonPeerUnreachable, + wantSource: connreject.SourcePendOpenTimeout, + }, + { + name: "plain timeout", + in: openTimeoutDiag{}, + wantReason: connreject.ReasonTimeout, + wantSource: connreject.SourcePendOpenTimeout, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + gotReason, gotSource := classifyOpenTimeout(tc.in) + if gotReason != tc.wantReason { + t.Errorf("reason = %q, want %q", gotReason, tc.wantReason) + } + if gotSource != tc.wantSource { + t.Errorf("source = %q, want %q", gotSource, tc.wantSource) + } + }) + } +} + +func TestRejectReasonToReason(t *testing.T) { + t.Parallel() + tests := []struct { + in packet.TailscaleRejectReason + want connreject.Reason + }{ + {packet.RejectedDueToACLs, connreject.ReasonACL}, + {packet.RejectedDueToShieldsUp, connreject.ReasonShields}, + {packet.RejectedDueToIPForwarding, connreject.ReasonHostIPForwarding}, + {packet.RejectedDueToHostFirewall, connreject.ReasonHostFirewall}, + {packet.TailscaleRejectReasonNone, connreject.ReasonUnknown}, + } + for _, tc := range tests { + if got := rejectReasonToReason(tc.in); got != tc.want { + t.Errorf("rejectReasonToReason(%v) = %q, want %q", tc.in, got, tc.want) + } + } +} diff --git a/wgengine/userspace.go b/wgengine/userspace.go index 23edf30b3..6d59992a6 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -28,6 +28,7 @@ import ( "tailscale.com/feature/buildfeatures" "tailscale.com/health" "tailscale.com/ipn/ipnstate" + "tailscale.com/net/connreject" "tailscale.com/net/dns" "tailscale.com/net/dns/resolver" "tailscale.com/net/ipset" @@ -77,6 +78,13 @@ type userspaceEngine struct { eventBus *eventbus.Bus eventClient *eventbus.Client + // connRejectNote is an optional callback invoked when the engine + // observes an outbound-direction rejection (an inbound TSMP reject + // from a peer or a 
pendopen timeout). Installed via + // [SetConnRejectNote], typically by the connreject feature + // extension at startup. + connRejectNote atomic.Pointer[func(connreject.Event)] + linkChangeQueue execqueue.ExecQueue logf logger.Logf @@ -1662,6 +1670,21 @@ func (e *userspaceEngine) InstallCaptureHook(cb packet.CaptureCallback) { e.magicConn.InstallCaptureHook(cb) } +// SetConnRejectNote installs a callback invoked when the engine +// observes an outbound-direction connection rejection (an inbound TSMP +// reject from a peer or a pendopen timeout). Passing nil uninstalls a +// previously installed callback. +// +// SetConnRejectNote is typically called once at startup by the +// [tailscale.com/feature/connreject] extension. +func (e *userspaceEngine) SetConnRejectNote(fn func(connreject.Event)) { + if fn == nil { + e.connRejectNote.Store(nil) + return + } + e.connRejectNote.Store(&fn) +} + func (e *userspaceEngine) reconfigureVPNIfNecessary() error { if e.reconfigureVPN == nil { return nil diff --git a/wgengine/watchdog.go b/wgengine/watchdog.go index 6aa1c1bd6..cc2f1015d 100644 --- a/wgengine/watchdog.go +++ b/wgengine/watchdog.go @@ -17,6 +17,7 @@ import ( "tailscale.com/envknob" "tailscale.com/feature/buildfeatures" "tailscale.com/ipn/ipnstate" + "tailscale.com/net/connreject" "tailscale.com/net/dns" "tailscale.com/net/packet" "tailscale.com/tailcfg" @@ -243,6 +244,10 @@ func (e *watchdogEngine) InstallCaptureHook(cb packet.CaptureCallback) { e.wrap.InstallCaptureHook(cb) } +func (e *watchdogEngine) SetConnRejectNote(fn func(connreject.Event)) { + e.wrap.SetConnRejectNote(fn) +} + func (e *watchdogEngine) PeerByKey(pubKey key.NodePublic) (_ wgint.Peer, ok bool) { return e.wrap.PeerByKey(pubKey) } diff --git a/wgengine/wgengine.go b/wgengine/wgengine.go index 5ca4b75cf..f4ab0b896 100644 --- a/wgengine/wgengine.go +++ b/wgengine/wgengine.go @@ -10,6 +10,7 @@ import ( "time" "tailscale.com/ipn/ipnstate" + "tailscale.com/net/connreject" "tailscale.com/net/dns" 
"tailscale.com/net/packet" "tailscale.com/tailcfg" @@ -141,4 +142,14 @@ type Engine interface { // SetPeerByIPPacketFunc installs a callback used by wireguard-go to // look up which peer should handle an outbound packet by destination IP. SetPeerByIPPacketFunc(func(netip.Addr) (_ key.NodePublic, ok bool)) + + // SetConnRejectNote installs a callback invoked when the engine + // observes an outbound-direction connection rejection (an inbound + // TSMP reject from a peer or a pendopen timeout). Passing nil + // uninstalls a previously installed callback. + // + // The callback is invoked synchronously on the data-plane + // goroutine; implementations should treat it as if it were a + // metric-emit callback and return promptly. + SetConnRejectNote(func(connreject.Event)) }