mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-06 04:36:15 +02:00
Add a Go benchmark that exercises a single tailnet client (a [tsnet.Server]
running in the test process) against a synthetic large initial netmap and
a stream of caller-driven peer add/remove deltas, all in-process.
The harness is split in two parts:
- tstest/largetailnet, a reusable package containing a [Streamer]
that hijacks the map long-poll on a [testcontrol.Server] via the new
AltMapStream hook, sends one initial MapResponse with N synthetic
peers, and forwards caller-supplied delta MapResponses on the same
stream. Helpers like MakePeer / AllocPeer build synthetic peers with
unique IDs and addresses derived from the Tailscale ULA range.
- tstest/largetailnet/largetailnet_test.go, BenchmarkGiantTailnet
(headless tailscaled workload, no IPN bus subscriber) and
BenchmarkGiantTailnetBusWatcher (GUI-client workload with one
Notify subscriber attached). Both are gated on
--actually-test-giant-tailnet (skipped by default), stand up an
in-process testcontrol + tsnet.Server, let Up block until the
initial N-peer netmap has been processed, then ResetTimer and run
add+remove pairs via b.Loop. Per-delta sync is via a test-only
[ipnlocal.LocalBackend.AwaitNodeKeyForTest] channel that closes
once the just-added peer key appears in the netmap (no-watcher
variant) or via bus-Notify drain (bus-watcher variant).
To support the hijack, [testcontrol.Server] grows an AltMapStream hook
and a small MapStreamWriter interface for benchmarks/stress tests that
need to drive a controlled MapResponse sequence; the normal serveMap
path is untouched when AltMapStream is nil. The streamer answers
non-streaming "lite" map polls (which controlclient issues before the
streaming long-poll to push HostInfo) with an empty MapResponse and
returns immediately, so the streaming poll that follows is the one
that gets the initial netmap.
The benchmark is intended for before/after comparisons of netmap- and
delta-handling changes targeted at large tailnets. CPU profiles on
unmodified main show the expected O(N) hotspots:
setControlClientStatusLocked / authReconfigLocked /
userspaceEngine.Reconfig / setNetMapLocked, plus JSON encoding of the
full Notify.NetMap to bus watchers (which dominates the BusWatcher
variant).
Median ms/op over 10 runs on unmodified main, by tailnet size N:
N no-watcher bus-watcher
10000 32 166
50000 222 865
100000 504 1765
250000 1551 4696
Recommended invocation:
go test ./tstest/largetailnet/ -run=^$ \
-bench='BenchmarkGiantTailnet(BusWatcher)?$' \
-benchtime=2000x -timeout=10m \
--actually-test-giant-tailnet \
--giant-tailnet-n=250000 \
-cpuprofile=/tmp/giant.cpu.pprof
Updates #12542
Change-Id: I4f5b2bb271a36ba853d5a0ffe82054ef2b15c585
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
219 lines
6.6 KiB
Go
219 lines
6.6 KiB
Go
// Copyright (c) Tailscale Inc & contributors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package largetailnet_test
|
|
|
|
import (
|
|
"context"
|
|
"flag"
|
|
"net/http/httptest"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"testing"
|
|
"time"
|
|
|
|
"tailscale.com/ipn/store/mem"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/tsnet"
|
|
"tailscale.com/tstest/integration"
|
|
"tailscale.com/tstest/integration/testcontrol"
|
|
"tailscale.com/tstest/largetailnet"
|
|
"tailscale.com/types/logger"
|
|
)
|
|
|
|
// tsnet.Server.Up handles the wait-for-ipn.Running step itself: it
// subscribes to the IPN bus with NotifyInitialState and blocks until State
// reaches ipn.Running, which by definition means a netmap has been applied.
// We don't redo that work here.
|
|
|
|
// Benchmark knobs for the BenchmarkGiantTailnet* benchmarks.
var (
	// flagActuallyTest gates the benchmarks; they b.Skip unless it is set.
	flagActuallyTest = flag.Bool("actually-test-giant-tailnet", false,
		"if set, run the BenchmarkGiantTailnet* benchmarks; otherwise they are skipped")
	// flagN is the synthetic peer count of the initial netmap.
	flagN = flag.Int("giant-tailnet-n", 250_000,
		"size of the initial netmap (peer count) for BenchmarkGiantTailnet*")
	// flagBenchVerbose enables logging of tsnet output and DERP setup
	// during the benchmark run.
	flagBenchVerbose = flag.Bool("giant-tailnet-verbose", false,
		"if set, log tsnet output and DERP setup to stderr")
)
|
|
|
|
// BenchmarkGiantTailnet measures the per-delta CPU cost of a tailnet client
|
|
// processing peer-add/peer-remove deltas in steady state, with no IPN bus
|
|
// subscribers attached. This represents the headless-tailscaled workload
|
|
// (Linux subnet routers, container sidecars, ...) where the LocalBackend
|
|
// does not pay for fanning Notify.NetMap out to GUI watchers.
|
|
//
|
|
// Use [BenchmarkGiantTailnetBusWatcher] for the GUI-client workload.
|
|
//
|
|
// The benchmark is opt-in via --actually-test-giant-tailnet.
|
|
func BenchmarkGiantTailnet(b *testing.B) {
|
|
if !*flagActuallyTest {
|
|
b.Skip("set --actually-test-giant-tailnet to run this benchmark")
|
|
}
|
|
benchGiantTailnet(b, false)
|
|
}
|
|
|
|
// BenchmarkGiantTailnetBusWatcher is like [BenchmarkGiantTailnet] but
|
|
// attaches one [local.Client.WatchIPNBus] subscriber for the duration of the
|
|
// benchmark. The Notify-fan-out cost (notably Notify.NetMap encoding to
|
|
// every watcher on every full-rebuild path) is therefore included in the
|
|
// per-delta measurement, which approximates the GUI-client workload.
|
|
//
|
|
// The benchmark is opt-in via --actually-test-giant-tailnet.
|
|
func BenchmarkGiantTailnetBusWatcher(b *testing.B) {
|
|
if !*flagActuallyTest {
|
|
b.Skip("set --actually-test-giant-tailnet to run this benchmark")
|
|
}
|
|
benchGiantTailnet(b, true)
|
|
}
|
|
|
|
// benchGiantTailnet is the shared body of the BenchmarkGiantTailnet*
|
|
// benchmarks. Setup is entirely in-process: a [testcontrol.Server] hosts
|
|
// the control plane, a [tsnet.Server] hosts the client, and a
|
|
// [largetailnet.Streamer] hijacks the map long-poll to drive an exact
|
|
// MapResponse sequence.
|
|
//
|
|
// Each loop iteration sends one [tailcfg.MapResponse] with PeersChanged
|
|
// (a fresh peer) and PeersRemoved (the previous fresh peer), then waits
|
|
// for the client to apply it. Net peer count stays at flagN throughout the
|
|
// loop.
|
|
//
|
|
// The wait mechanism differs by variant:
|
|
//
|
|
// - busWatcher=false: block on a channel returned by
|
|
// [ipnlocal.LocalBackend.AwaitNodeKeyForTest] (reached via
|
|
// [tsnet.TestHooks]). The channel is closed by LocalBackend the moment
|
|
// the just-added peer's key appears in the netmap, so the wait has zero
|
|
// polling overhead.
|
|
// - busWatcher=true: drain Notify events from the bus subscription, since
|
|
// a Notify firing is exactly the side-effect we want to amortize into
|
|
// the per-delta measurement.
|
|
//
|
|
// Recommended invocation for profiling on unmodified main:
|
|
//
|
|
// go test ./tstest/largetailnet/ -run=^$ \
|
|
// -bench='BenchmarkGiantTailnet(BusWatcher)?$' \
|
|
// -benchtime=2000x -timeout=10m \
|
|
// --actually-test-giant-tailnet \
|
|
// --giant-tailnet-n=250000 \
|
|
// -cpuprofile=/tmp/giant.cpu.pprof
|
|
func benchGiantTailnet(b *testing.B, busWatcher bool) {
|
|
logf := logger.Discard
|
|
if *flagBenchVerbose {
|
|
logf = b.Logf
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
|
b.Cleanup(cancel)
|
|
|
|
derpMap := integration.RunDERPAndSTUN(b, logf, "127.0.0.1")
|
|
|
|
streamer := largetailnet.New(*flagN, derpMap)
|
|
|
|
ctrl := &testcontrol.Server{
|
|
DERPMap: derpMap,
|
|
DNSConfig: &tailcfg.DNSConfig{},
|
|
AltMapStream: streamer.AltMapStream(),
|
|
Logf: logf,
|
|
}
|
|
ctrl.HTTPTestServer = httptest.NewUnstartedServer(ctrl)
|
|
ctrl.HTTPTestServer.Start()
|
|
b.Cleanup(ctrl.HTTPTestServer.Close)
|
|
controlURL := ctrl.HTTPTestServer.URL
|
|
b.Logf("testcontrol listening on %s", controlURL)
|
|
|
|
tmp := filepath.Join(b.TempDir(), "tsnet")
|
|
if err := os.MkdirAll(tmp, 0755); err != nil {
|
|
b.Fatal(err)
|
|
}
|
|
|
|
s := &tsnet.Server{
|
|
Dir: tmp,
|
|
ControlURL: controlURL,
|
|
Hostname: "largetailnet-bench",
|
|
Store: new(mem.Store),
|
|
Ephemeral: true,
|
|
Logf: logf,
|
|
}
|
|
b.Cleanup(func() { s.Close() })
|
|
|
|
// tsnet.Server.Up blocks until the backend reaches Running, which
|
|
// requires the initial flagN-peer MapResponse to have been processed.
|
|
upStart := time.Now()
|
|
if _, err := s.Up(ctx); err != nil {
|
|
b.Fatalf("tsnet.Server.Up: %v", err)
|
|
}
|
|
b.Logf("initial %d-peer netmap processed in %v", *flagN, time.Since(upStart))
|
|
|
|
lc, err := s.LocalClient()
|
|
if err != nil {
|
|
b.Fatalf("LocalClient: %v", err)
|
|
}
|
|
lb := tsnet.TestHooks.LocalBackend(s)
|
|
|
|
var notifyCh chan struct{}
|
|
if busWatcher {
|
|
bw, err := lc.WatchIPNBus(ctx, 0)
|
|
if err != nil {
|
|
b.Fatalf("WatchIPNBus: %v", err)
|
|
}
|
|
b.Cleanup(func() { bw.Close() })
|
|
notifyCh = make(chan struct{}, 1024)
|
|
go func() {
|
|
for {
|
|
n, err := bw.Next()
|
|
if err != nil {
|
|
return
|
|
}
|
|
if n.NetMap != nil || len(n.PeerChanges) > 0 {
|
|
select {
|
|
case notifyCh <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
var prevAdded *tailcfg.Node
|
|
runtime.GC()
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
added := streamer.AllocPeer()
|
|
mr := &tailcfg.MapResponse{
|
|
PeersChanged: []*tailcfg.Node{added},
|
|
}
|
|
if prevAdded != nil {
|
|
mr.PeersRemoved = []tailcfg.NodeID{prevAdded.ID}
|
|
}
|
|
prevAdded = added
|
|
|
|
if err := streamer.SendDelta(ctx, mr); err != nil {
|
|
b.Fatalf("SendDelta: %v", err)
|
|
}
|
|
|
|
if busWatcher {
|
|
// A Notify firing is itself part of the workload we
|
|
// want to measure on this variant.
|
|
select {
|
|
case <-notifyCh:
|
|
case <-time.After(10 * time.Second):
|
|
b.Fatal("timed out waiting for notify")
|
|
case <-ctx.Done():
|
|
b.Fatalf("ctx done waiting for notify: %v", ctx.Err())
|
|
}
|
|
} else {
|
|
// Block on the LocalBackend's test-only signal that
|
|
// the just-added peer key has landed in the netmap.
|
|
// No polling, no notify fan-out cost.
|
|
select {
|
|
case <-lb.AwaitNodeKeyForTest(added.Key):
|
|
case <-time.After(10 * time.Second):
|
|
b.Fatalf("timed out waiting for node key %v", added.Key)
|
|
case <-ctx.Done():
|
|
b.Fatalf("ctx done waiting for node key: %v", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
}
|