From 9f48567bf1933273b023c2f60e04babbecb68ae5 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Tue, 12 May 2026 12:01:05 -0700 Subject: [PATCH] ipn/ipnlocal,wgengine/magicsock: add basic counters for cached peer connectivity (#19699) Add new clientmetric counters for establishing contact with peers while using cached network map data. To do this, instrument the magicsock.Conn with a bit to indicate whether its peer data came from a cached netmap. If so, there are two conditions we will count as establishing connectivity to a peer: - Receipt of a CallMeMaybe from a peer via disco. - Establishing a valid endpoint address for a peer. In vmtest, add Env.ClientMetrics to scrape metrics from the specified node. Use this to check that counters were updated in caching tests. Updates https://github.com/tailscale/projects/issues/13 Updates #12639 Change-Id: Ie8cf3244ac8af4f5bcfe4d0d944078da2ba08990 Signed-off-by: M. J. Fromberger --- ipn/ipnlocal/local.go | 6 ++- tstest/natlab/vmtest/vmtest.go | 64 +++++++++++++++++++++++++++++ tstest/natlab/vmtest/vmtest_test.go | 39 ++++++++++++++++-- wgengine/magicsock/endpoint.go | 7 ++++ wgengine/magicsock/magicsock.go | 59 +++++++++++++++++++------- 5 files changed, 155 insertions(+), 20 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 03863648a..cba23b6a6 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -6587,7 +6587,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { b.currentNode().SetNetMap(nm) if ms, ok := b.sys.MagicSock.GetOK(); ok { if nm != nil { - ms.SetNetworkMap(nm.SelfNode, nm.Peers) + if nm.Cached { + ms.SetNetworkMapCached(nm.SelfNode, nm.Peers) + } else { + ms.SetNetworkMap(nm.SelfNode, nm.Peers) + } } else { ms.SetNetworkMap(tailcfg.NodeView{}, nil) } diff --git a/tstest/natlab/vmtest/vmtest.go b/tstest/natlab/vmtest/vmtest.go index 679a3e45f..206ffde78 100644 --- a/tstest/natlab/vmtest/vmtest.go +++ b/tstest/natlab/vmtest/vmtest.go @@ -38,6 +38,8 
@@ import ( "time" "github.com/google/gopacket/layers" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" "go4.org/mem" "golang.org/x/sync/errgroup" "tailscale.com/client/local" @@ -847,6 +849,68 @@ func (e *Env) Status(n *Node) *ipnstate.Status { return st } +// ClientMetrics returns the client metrics exported by the given node. +func (e *Env) ClientMetrics(n *Node) ClientMetrics { + e.t.Helper() + raw, err := n.Agent().DaemonMetrics(e.t.Context()) + if err != nil { + e.t.Fatalf("Node %q DaemonMetrics: %v", n.Name(), err) + } + + // Metrics are reported in Prometheus exposition format. + var parser expfmt.TextParser + mfs, err := parser.TextToMetricFamilies(bytes.NewReader(raw)) + if err != nil { + e.t.Fatalf("Node %q parse client metrics: %v", n.Name(), err) + } + + // Tailscale client metrics are all unlabelled integer-valued counters and + // gauges, so we don't need to handle the full generality of the Prometheus + // representation. If we see anything else, we'll log and skip it. + out := make(ClientMetrics) + for _, mf := range mfs { + name := mf.GetName() + if _, ok := out[name]; ok { + e.t.Logf("Node %q: duplicate client metric %q (ignored)", n.Name(), name) + continue + } else if len(mf.Metric) != 1 { + e.t.Logf("Node %q: got %d values for client metric %q, want 1 (ignored)", n.Name(), len(mf.Metric), name) + continue + } + + var mtype string + var value int64 + switch mf.GetType() { + case dto.MetricType_COUNTER: + mtype = "counter" + value = int64(mf.Metric[0].GetCounter().GetValue()) + case dto.MetricType_GAUGE: + mtype = "gauge" + value = int64(mf.Metric[0].GetGauge().GetValue()) + default: + e.t.Logf("Node %q unexpected client metric %q type %q (ignored)", n.Name(), name, mf.GetType().String()) + continue + } + out[name] = ClientMetric{ + Name: name, + Type: mtype, + Value: value, + } + } + return out +} + +// ClientMetrics is a view of the client metrics exported by a node. 
+// The keys of the map are the metric names. +type ClientMetrics map[string]ClientMetric + +// ClientMetric is a view of a node client metric. +type ClientMetric struct { + Name string // as published to the clientmetrics package + Type string // either "gauge" or "counter" + Value int64 // the gauge or counter value +} + // SetAcceptRoutes toggles the node's RouteAll preference (the // --accept-routes flag), controlling whether it installs subnet routes // advertised by peers. diff --git a/tstest/natlab/vmtest/vmtest_test.go b/tstest/natlab/vmtest/vmtest_test.go index fe8d00bef..c317ed712 100644 --- a/tstest/natlab/vmtest/vmtest_test.go +++ b/tstest/natlab/vmtest/vmtest_test.go @@ -923,6 +923,20 @@ func TestMullvadExitNode(t *testing.T) { check(checkOff2Step, "exit-off (again)", clientWAN) } +// checkClientMetrics verifies that each entry in want exists and has the given +// value in metrics. +func checkClientMetrics(t *testing.T, label string, metrics vmtest.ClientMetrics, want map[string]int64) { + t.Helper() + for name, wantValue := range want { + got, ok := metrics[name] + if !ok { + t.Errorf("%s: required metric %q not found", label, name) + } else if got.Value != wantValue { + t.Errorf("%s: metric %q: got %v, want %v", label, name, got.Value, wantValue) + } + } +} + // TestCachedNetmapAfterRestart verifies that two nodes with netmap // caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct // WireGuard tunnel after both are restarted while the control server is @@ -1020,13 +1034,23 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) { vmtest.OS(vmtest.Gokrazy), tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil}) + checkInitialMetrics := env.AddStep("Check initial client metrics") cutControlStep := env.AddStep("Cut control server access") restartStep := env.AddStep("Restart tailscaled on a") tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)") - DiscoPingStep := env.AddStep("Ping a → b Disco (want 
Direct)") + discoPingStep := env.AddStep("Ping a → b Disco (want Direct)") + checkFinalMetrics := env.AddStep("Check final client metrics") env.Start() + // Before: Verify that we have not recorded any cached contacts. + checkInitialMetrics.Begin() + checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{ + "magicsock_cached_peer_contact_derp": 0, + "magicsock_cached_peer_contact_direct": 0, + }) + checkInitialMetrics.End(nil) + cutControlStep.Begin() a.DropControlTraffic() env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) { @@ -1047,10 +1071,17 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) { } tsmpPingStep.End(nil) - DiscoPingStep.Begin() + discoPingStep.Begin() if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil { - DiscoPingStep.End(err) + discoPingStep.End(err) t.Fatal(err) } - DiscoPingStep.End(nil) + discoPingStep.End(nil) + + // After: Verify that we recorded a direct contact on the disconnected node. + checkFinalMetrics.Begin() + checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{ + "magicsock_cached_peer_contact_direct": 1, + }) + checkFinalMetrics.End(nil) } diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go index 71edfe9a1..d831a9032 100644 --- a/wgengine/magicsock/endpoint.go +++ b/wgengine/magicsock/endpoint.go @@ -138,6 +138,13 @@ func (de *endpoint) udpRelayEndpointReady(maybeBest addrQuality) { func (de *endpoint) setBestAddrLocked(v addrQuality) { if v.epAddr != de.bestAddr.epAddr { de.probeUDPLifetime.resetCycleEndpointLocked() + + // Reaching here, if we are using data from a cached netmap and we are + // upgrading from an invalid (missing) address to a valid one, increment + // the counter for peers established. 
+ if !de.bestAddr.ap.IsValid() && v.ap.IsValid() && de.c.usingCachedNetmap.Load() { + metricCachedPeerContactDirect.Add(1) + } } de.bestAddr = v } diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 9720f57cd..6461c552e 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -272,6 +272,9 @@ type Conn struct { // discoAtomic is the current disco private and public keypair for this conn. discoAtomic discoAtomic + // usingCachedNetmap is whether the latest update to self and peersByID is from a cached network map. + usingCachedNetmap atomic.Bool + // ============================================================ // mu guards all following fields; see userspaceEngine lock // ordering rules against the engine. For derphttp, mu must @@ -355,18 +358,19 @@ type Conn struct { // magicsock could do with any complexity reduction it can get. netInfoLast *tailcfg.NetInfo - derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled - self tailcfg.NodeView // from last SetNetworkMap - peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots. 
- filt *filter.Filter // from last SetFilter - relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers - lastFlags debugFlags // at time of last SetNetworkMap - privateKey key.NodePrivate // WireGuard private key for this node - everHadKey bool // whether we ever had a non-zero private key - myDerp int // nearest DERP region ID; 0 means none/unknown - homeless bool // if true, don't try to find & stay conneted to a DERP home (myDerp will stay 0) - derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close - activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region + derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled + self tailcfg.NodeView // from last SetNetworkMap + peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots. 
+ + filt *filter.Filter // from last SetFilter + relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers + lastFlags debugFlags // at time of last SetNetworkMap + privateKey key.NodePrivate // WireGuard private key for this node + everHadKey bool // whether we ever had a non-zero private key + myDerp int // nearest DERP region ID; 0 means none/unknown + homeless bool // if true, don't try to find & stay connected to a DERP home (myDerp will stay 0) + derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close + activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region prevDerp map[int]*syncs.WaitGroupChan // derpRoute contains optional alternate routes to use as an @@ -2334,6 +2338,13 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake c.logf("[unexpected] %s from peer via DERP whose netmap discokey != disco source", msgType) return } + + // Reaching here, if we are using data from a cached network map, the + // receipt of a CallMeMaybe from a peer indicates we have a sufficiently + // viable connection to that peer to count it as active while cached. + if c.usingCachedNetmap.Load() { + metricCachedPeerContactDERP.Add(1) + } if isVia { c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints", c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(), @@ -2954,9 +2965,10 @@ func (c *candidatePeerRelay) isValid() bool { return !c.nodeKey.IsZero() && !c.discoKey.IsZero() } -// SetNetworkMap updates the network map with the given self node and peers. -// It must be called synchronously from the caller's goroutine to ensure -// magicsock has the current state before subsequent operations proceed. +// SetNetworkMap updates the network map with the given self node and peers +// reported by the control plane (rather than cached). 
It must be called +// synchronously from the caller's goroutine to ensure magicsock has the +// current state before subsequent operations proceed. // // self may be invalid if there's no network map. // @@ -2966,6 +2978,18 @@ func (c *candidatePeerRelay) isValid() bool { // initial netmap and for changes to self or to global state (filter, DERP, // etc.) that aren't covered by the per-peer methods. func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) { + c.setNetworkMapInternal(self, peers, false) +} + +// SetNetworkMapCached behaves as SetNetworkMap, but indicates to c that the +// data provided are from a cache rather than the control plane. The same +// constraints otherwise apply. +func (c *Conn) SetNetworkMapCached(self tailcfg.NodeView, peers []tailcfg.NodeView) { + c.setNetworkMapInternal(self, peers, true) +} + +// setNetworkMapInternal is the shared implementation of SetNetworkMap and SetNetworkMapCached. +func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.NodeView, isCached bool) { peersChanged := c.updateNodes(self, peers) relayClientEnabled := self.Valid() && @@ -2979,6 +3003,7 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) { selfView := c.self peersSnap := c.peerSnapshotLocked() isClosed := c.closed + c.usingCachedNetmap.Store(isCached) c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n) if isClosed { @@ -4198,6 +4223,10 @@ var ( metricTSMPDiscoKeyAdvertisementReceived = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_received") metricTSMPDiscoKeyAdvertisementApplied = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_applied") metricTSMPDiscoKeyAdvertisementUnchanged = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_unchanged") + + // Counters for peer contacts established using cached network map data. 
+ metricCachedPeerContactDERP = clientmetric.NewCounter("magicsock_cached_peer_contact_derp") + metricCachedPeerContactDirect = clientmetric.NewCounter("magicsock_cached_peer_contact_direct") ) // newUDPLifetimeCounter returns a new *clientmetric.Metric with the provided