mirror of https://github.com/tailscale/tailscale.git, synced 2026-05-05 04:06:35 +02:00
fix flaky tests: improve test isolation and reliability
Test fixes for consistent pass rates under repeated runs (-count=N):

Global state isolation:
- appc: check metric deltas instead of absolute values
- cmd/derper: initialize DNS caches before test
- tsweb/varz: use unpublished expvar to avoid global registration
- wgengine/netstack: check metric deltas instead of absolute values
- net/dns: use SetForTest() with deferred restore for hooks

Timeout and concurrency:
- cmd/containerboot: increase wait loop timeouts for parallel load
- tsconsensus: add deadline to waitFor(), use sync.Once for netns
- tstest/integration: add tstest.Shard/Parallel, fix IPN bus watchers
- net/netcheck: set testCaptivePortalDelay to prevent hangs
- wgengine/magicsock: use Port:0, add timeout to callback wait
- drive/driveimpl: use http.Server for proper shutdown

Race conditions:
- tstest/archtest: remove !race from build constraint
- util/deephash: use local sink variable instead of package-level
- net/art: switch to math/rand/v2 for thread-safe globals
- tstest/integration: use Status() instead of MustStatus() from goroutines

Test optimization:
- net/udprelay: rewrite VNI test to avoid iterating all 16M values
- ipn/ipnlocal: reset env vars between subtests
- cmd/containerboot/serve: use SetWaitDurationForTest
- tsnet: wait for service VIP in AllowedIPs before dialing

Change-Id: Id6186fb8a45031920550a208ded77382e57cc016
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
parent ebbf47ca1a
commit 97e021d8ed
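Several of the fixes below share one pattern: instead of asserting a global clientmetric's absolute value (which drifts across -count=N runs), snapshot it first and assert on the delta. A minimal sketch of that idea, using a hypothetical counter and test name that are not part of this commit:

package example

import (
	"testing"

	"tailscale.com/util/clientmetric"
)

// metricDemo stands in for a real global counter such as the
// appc_store_routes_* metrics; clientmetric counters live for the whole
// process, so repeated runs of the same test keep incrementing them.
var metricDemo = clientmetric.NewCounter("example_demo_total")

func TestDemoDelta(t *testing.T) {
	initial := metricDemo.Value() // snapshot before the action under test

	metricDemo.Add(1) // stand-in for the code path being exercised

	if delta := metricDemo.Value() - initial; delta != 1 {
		t.Errorf("example_demo_total: got delta %d, want 1", delta)
	}
}

The appc and wgengine/netstack hunks below apply this same snapshot-then-delta check to their existing counters.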
@@ -736,12 +736,19 @@ func TestRateLogger(t *testing.T) {
 }

 func TestRouteStoreMetrics(t *testing.T) {
+	// Capture initial metric values before the test actions, since metrics
+	// are global counters that persist across test runs.
+	initial := make(map[string]int64)
+	for _, x := range clientmetric.Metrics() {
+		initial[x.Name()] = x.Value()
+	}
+
 	metricStoreRoutes(1, 1)
 	metricStoreRoutes(1, 1) // the 1 buckets value should be 2
 	metricStoreRoutes(5, 5) // the 5 buckets value should be 1
 	metricStoreRoutes(6, 6) // the 10 buckets value should be 1
 	metricStoreRoutes(10001, 10001) // the over buckets value should be 1
-	wanted := map[string]int64{
+	wantedDelta := map[string]int64{
 		"appc_store_routes_n_routes_1": 2,
 		"appc_store_routes_rate_1": 2,
 		"appc_store_routes_n_routes_5": 1,
@@ -752,8 +759,9 @@ func TestRouteStoreMetrics(t *testing.T) {
 		"appc_store_routes_rate_over": 1,
 	}
 	for _, x := range clientmetric.Metrics() {
-		if x.Value() != wanted[x.Name()] {
-			t.Errorf("%s: want: %d, got: %d", x.Name(), wanted[x.Name()], x.Value())
+		delta := x.Value() - initial[x.Name()]
+		if delta != wantedDelta[x.Name()] {
+			t.Errorf("%s: want delta: %d, got delta: %d", x.Name(), wantedDelta[x.Name()], delta)
 		}
 	}
 }
@@ -1324,9 +1324,13 @@ func TestContainerBoot(t *testing.T) {
 			}

 			wantCmds = append(wantCmds, p.WantCmds...)
-			waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))
+			// Use generous timeouts (10s) because tests run in parallel,
+			// causing resource contention. The polling interval is 100ms,
+			// so successful tests complete quickly; the timeout only
+			// affects failure cases.
+			waitArgs(t, 10*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))

-			err = tstest.WaitFor(2*time.Second, func() error {
+			err = tstest.WaitFor(10*time.Second, func() error {
 				for path, want := range p.WantFiles {
 					gotBs, err := os.ReadFile(filepath.Join(env.d, path))
 					if err != nil {
@@ -1343,7 +1347,7 @@ func TestContainerBoot(t *testing.T) {
 			}

 			for url, want := range p.EndpointStatuses {
-				err := tstest.WaitFor(2*time.Second, func() error {
+				err := tstest.WaitFor(10*time.Second, func() error {
 					resp, err := http.Get(url)
 					if err != nil && want != -1 {
 						return fmt.Errorf("GET %s: %v", url, err)
@@ -1361,7 +1365,7 @@ func TestContainerBoot(t *testing.T) {
 					}
 				}
 			}
-			waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal")
+			waitLogLine(t, 10*time.Second, cbOut, "Startup complete, waiting for shutdown signal")
 			if cmd.ProcessState != nil {
 				t.Fatalf("containerboot should be running but exited with exit code %d", cmd.ProcessState.ExitCode())
 			}
@@ -10,11 +10,13 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
+	"time"

 	"github.com/google/go-cmp/cmp"
 	"tailscale.com/ipn"
 	"tailscale.com/kube/kubetypes"
 	"tailscale.com/kube/localclient"
+	"tailscale.com/kube/services"
 	"tailscale.com/tailcfg"
 )
@@ -198,6 +200,10 @@ func TestReadServeConfig(t *testing.T) {
 }

 func TestRefreshAdvertiseServices(t *testing.T) {
+	// Use a very short wait duration for tests to avoid 20-second delays.
+	restore := services.SetWaitDurationForTest(1 * time.Millisecond)
+	t.Cleanup(restore)
+
 	tests := []struct {
 		name string
 		sc   *ipn.ServeConfig
@@ -166,6 +166,13 @@ func TestUnpublishedDNSEmptyList(t *testing.T) {
 }

 func TestLookupMetric(t *testing.T) {
+	// Set up the DNS caches so handleBootstrapDNS returns valid JSON.
+	pub := &dnsEntryMap{
+		IPs: map[string][]net.IP{"tailscale.com": {net.IPv4(10, 10, 10, 10)}},
+	}
+	dnsCache.Store(pub)
+	dnsCacheBytes.Store([]byte(`{"tailscale.com":["10.10.10.10"]}`))
+
 	d := []string{"a.io", "b.io", "c.io", "d.io", "e.io", "e.io", "e.io", "a.io"}
 	resetMetrics()
 	for _, q := range d {
@@ -434,6 +434,7 @@ type local struct {

 type remote struct {
 	l          net.Listener
+	httpServer *http.Server
 	fs         *FileSystemForRemote
 	fileServer *FileServer
 	shares     map[string]string
@@ -510,14 +511,15 @@ func (s *system) addRemote(name string) string {
 	s.t.Logf("FileServer for %v listening at %s", name, fileServer.Addr())

 	r := &remote{
-		l:          ln,
-		fileServer: fileServer,
-		fs:         NewFileSystemForRemote(log.Printf),
-		shares:     make(map[string]string),
+		l:           ln,
+		fileServer:  fileServer,
+		fs:          NewFileSystemForRemote(log.Printf),
+		shares:      make(map[string]string),
 		permissions: make(map[string]drive.Permission),
 	}
 	r.fs.SetFileServerAddr(fileServer.Addr())
-	go http.Serve(ln, r)
+	r.httpServer = &http.Server{Handler: r}
+	go r.httpServer.Serve(ln)
 	s.remotes[name] = r

 	remotes := make([]*drive.Remote, 0, len(s.remotes))
@@ -694,9 +696,9 @@ func (s *system) stop() {
 			s.t.Fatalf("failed to Close remote fs: %s", err)
 		}

-		err = r.l.Close()
+		err = r.httpServer.Close()
 		if err != nil {
-			s.t.Fatalf("failed to Close remote listener: %s", err)
+			s.t.Fatalf("failed to Close remote http server: %s", err)
 		}

 		err = r.fileServer.Close()
@@ -518,9 +518,13 @@ func TestGetCertPEMWithValidity(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
+			// Always reset the cert share mode at the start of each subtest to ensure
+			// test isolation. Without this, a previous subtest's "ro" setting would
+			// leak into subsequent subtests.
 			if tt.readOnlyMode {
 				envknob.Setenv("TS_CERT_SHARE_MODE", "ro")
+			} else {
+				envknob.Setenv("TS_CERT_SHARE_MODE", "")
 			}

 			os.RemoveAll(certDir)
@@ -6,7 +6,7 @@ package art
 import (
 	crand "crypto/rand"
 	"fmt"
-	"math/rand"
+	"math/rand/v2"
 	"net/netip"
 	"runtime"
 	"strconv"
@@ -1140,7 +1140,7 @@ func randomPrefixes4(n int) []slowPrefixEntry[int] {
 	pfxs := map[netip.Prefix]bool{}

 	for len(pfxs) < n {
-		len := rand.Intn(33)
+		len := rand.IntN(33)
 		pfx, err := randomAddr4().Prefix(len)
 		if err != nil {
 			panic(err)
@@ -1161,7 +1161,7 @@ func randomPrefixes6(n int) []slowPrefixEntry[int] {
 	pfxs := map[netip.Prefix]bool{}

 	for len(pfxs) < n {
-		len := rand.Intn(129)
+		len := rand.IntN(129)
 		pfx, err := randomAddr6().Prefix(len)
 		if err != nil {
 			panic(err)
@@ -1179,7 +1179,7 @@ func randomPrefixes6(n int) []slowPrefixEntry[int] {

 // randomAddr returns a randomly generated IP address.
 func randomAddr() netip.Addr {
-	if rand.Intn(2) == 1 {
+	if rand.IntN(2) == 1 {
 		return randomAddr6()
 	} else {
 		return randomAddr4()
@@ -21,7 +21,8 @@ import (
 )

 func TestDNSTrampleRecovery(t *testing.T) {
-	HookWatchFile.Set(watchFile)
+	restore := HookWatchFile.SetForTest(watchFile)
+	defer restore()
 	synctest.Test(t, func(t *testing.T) {
 		tmp := t.TempDir()
 		if err := os.MkdirAll(filepath.Join(tmp, "etc"), 0700); err != nil {
@@ -32,6 +32,10 @@ func newTestClient(t testing.TB) *Client {
 		TimeNow: func() time.Time {
 			return time.Unix(1729624521, 0)
 		},
+		// Disable captive portal detection in tests by setting a very long
+		// delay. The captivePortalStop() call before waiting ensures the
+		// channel is closed even if the timer hasn't fired.
+		testCaptivePortalDelay: time.Hour,
 	}
 	return c
 }
@@ -94,6 +94,7 @@ type Server struct {
 	closed    bool
 	lamportID uint64
 	nextVNI   uint32
+	testVNIPoolSize uint32 // if non-zero, overrides totalPossibleVNI for testing
 	// serverEndpointByVNI is consistent with serverEndpointByDisco while mu is
 	// held, i.e. mu must be held around write ops. Read ops in performance
 	// sensitive paths, e.g. packet forwarding, do not need to acquire mu.
@@ -977,7 +978,11 @@ func (e ErrServerNotReady) Error() string {
 // For now, we favor simplicity and reducing VNI re-use over more complex
 // ephemeral port (VNI) selection algorithms.
 func (s *Server) getNextVNILocked() (uint32, error) {
-	for range totalPossibleVNI {
+	poolSize := totalPossibleVNI
+	if s.testVNIPoolSize > 0 {
+		poolSize = s.testVNIPoolSize
+	}
+	for range poolSize {
 		vni := s.nextVNI
 		if vni == maxVNI {
 			s.nextVNI = minVNI
@@ -342,22 +342,95 @@ func TestServer(t *testing.T) {
 func TestServer_getNextVNILocked(t *testing.T) {
 	t.Parallel()
 	c := qt.New(t)
+
+	// Test 1: Basic sequential allocation from minVNI
 	s := &Server{
 		nextVNI: minVNI,
 	}
-	for range uint64(totalPossibleVNI) {
-		vni, err := s.getNextVNILocked()
-		if err != nil { // using quicktest here triples test time
-			t.Fatal(err)
-		}
-		s.serverEndpointByVNI.Store(vni, nil)
-	}
-	c.Assert(s.nextVNI, qt.Equals, minVNI)
-	_, err := s.getNextVNILocked()
-	c.Assert(err, qt.IsNotNil)
-	s.serverEndpointByVNI.Delete(minVNI)
-	_, err = s.getNextVNILocked()
+	vni, err := s.getNextVNILocked()
 	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI)
+	c.Assert(s.nextVNI, qt.Equals, minVNI+1)
+
+	// Test 2: Wrapping from maxVNI back to minVNI
+	s = &Server{
+		nextVNI: maxVNI,
+	}
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, maxVNI)
+	c.Assert(s.nextVNI, qt.Equals, minVNI) // Should wrap to minVNI
+
+	// Test 3: Skips occupied VNIs
+	s = &Server{
+		nextVNI: minVNI,
+	}
+	s.serverEndpointByVNI.Store(minVNI, nil)
+	s.serverEndpointByVNI.Store(minVNI+1, nil)
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI+2)
+
+	// Test 4: Wrapping while skipping occupied VNIs
+	s = &Server{
+		nextVNI: maxVNI - 1,
+	}
+	s.serverEndpointByVNI.Store(maxVNI-1, nil)
+	s.serverEndpointByVNI.Store(maxVNI, nil)
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI)
+	c.Assert(s.nextVNI, qt.Equals, minVNI+1)
+
+	// Test 5: VNI freeing and re-allocation
+	// Verify that when VNIs are freed, new allocations continue from nextVNI
+	// (the freed VNI won't be returned until nextVNI wraps back around to it).
+	s = &Server{
+		nextVNI: minVNI,
+	}
+	s.serverEndpointByVNI.Store(minVNI, nil)
+	s.serverEndpointByVNI.Store(minVNI+1, nil)
+	s.serverEndpointByVNI.Store(minVNI+2, nil)
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI+3)
+
+	// Delete minVNI+1 and verify we can get VNIs (even though minVNI+1 won't
+	// be returned next since nextVNI has advanced past it)
+	s.serverEndpointByVNI.Delete(minVNI + 1)
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI+4) // nextVNI is now at minVNI+4
+
+	// Test 6: Pool exhaustion with testVNIPoolSize
+	// Use a small pool size to test the exhaustion error path without
+	// allocating 16M entries. testVNIPoolSize limits how many VNIs the
+	// algorithm checks before giving up, simulating a full pool.
+	//
+	// We occupy more VNIs than testVNIPoolSize to ensure that no matter
+	// where nextVNI starts, the search will only find occupied VNIs.
+	const testPoolSize = 100
+	s = &Server{
+		nextVNI:         minVNI,
+		testVNIPoolSize: testPoolSize,
+	}
+	// Occupy VNIs [minVNI, minVNI + testPoolSize*2) to ensure the search
+	// window is fully covered regardless of where nextVNI advances to
+	for i := uint32(0); i < testPoolSize*2; i++ {
+		s.serverEndpointByVNI.Store(minVNI+i, nil)
+	}
+	// Search starts at minVNI, checks testPoolSize VNIs, all are occupied
+	_, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNotNil)
+	c.Assert(err.Error(), qt.Equals, "VNI pool exhausted")
+
+	// Test 7: Recovery after freeing a VNI in the search window
+	// nextVNI is now at minVNI+testPoolSize (after the failed search advanced it)
+	// Free a VNI that will be found in the next search
+	s.serverEndpointByVNI.Delete(minVNI + testPoolSize + 10)
+	vni, err = s.getNextVNILocked()
+	c.Assert(err, qt.IsNil)
+	c.Assert(vni, qt.Equals, minVNI+testPoolSize+10)
 }

 func Test_blakeMACFromBindMsg(t *testing.T) {
@@ -40,6 +40,24 @@ import (
 	"tailscale.com/util/racebuild"
 )

+func TestMain(m *testing.M) {
+	// tailscale/corp#4520: don't use netns for tests.
+	//
+	// When netns is enabled, it sets socket options (SO_MARK on Linux,
+	// IP_BOUND_IF on macOS) to bypass Tailscale routing and reach the physical
+	// network directly. These tests use fake localhost networks with tsnet.Server
+	// and testcontrol.Server. With netns enabled:
+	//   - Socket options may fail (non-root can't set SO_MARK on Linux)
+	//   - Traffic may route to real interfaces instead of test localhost servers
+	//
+	// We disable netns once in TestMain rather than per-test because the previous
+	// pattern of disabling in each test and restoring in t.Cleanup caused races:
+	// when tests run in parallel, Test A's cleanup would restore netns while
+	// Test B was still running with netns disabled.
+	netns.SetEnabled(false)
+	os.Exit(m.Run())
+}
+
 type fsm struct {
 	mu          sync.Mutex
 	applyEvents []string
@@ -125,11 +143,7 @@ func testConfig(t *testing.T) {

 func startControl(t testing.TB) (control *testcontrol.Server, controlURL string) {
 	t.Helper()
-	// tailscale/corp#4520: don't use netns for tests.
-	netns.SetEnabled(false)
-	t.Cleanup(func() {
-		netns.SetEnabled(true)
-	})
+	// netns is disabled for this package in TestMain.

 	derpLogf := logger.Discard
 	derpMap := integration.RunDERPAndSTUN(t, derpLogf, "127.0.0.1")
@@ -269,8 +283,10 @@ func TestStart(t *testing.T) {

 func waitFor(t testing.TB, msg string, condition func() bool, waitBetweenTries time.Duration) {
 	t.Helper()
+	// Use a deadline to prevent infinite waiting under load
+	deadline := time.Now().Add(2 * time.Minute)
 	try := 0
-	for true {
+	for time.Now().Before(deadline) {
 		try++
 		done := condition()
 		if done {
@@ -279,6 +295,7 @@ func waitFor(t testing.TB, msg string, condition func() bool, waitBetweenTries t
 		}
 		time.Sleep(waitBetweenTries)
 	}
+	t.Fatalf("waitFor timeout: %s: failed after %d tries over 2 minutes", msg, try)
 }

 type participant struct {
@@ -1012,21 +1012,51 @@ func setUpServiceState(t *testing.T, name, ip string, host, client *Server,
 	})

 	// Wait until both nodes have up-to-date netmaps before
-	// proceeding with the test.
-	netmapUpToDate := func(nm *netmap.NetworkMap) bool {
+	// proceeding with the test. We need to check both:
+	// 1. DNS records contain the service IP (sentinel for netmap propagation)
+	// 2. For the client: the service host peer has the service VIP in AllowedIPs
+	// (required for routing to work)
+	servicePrefix := netip.MustParsePrefix(ip + "/32")
+	hasDNSRecord := func(nm *netmap.NetworkMap) bool {
 		return nm != nil && slices.ContainsFunc(nm.DNS.ExtraRecords, func(r tailcfg.DNSRecord) bool {
 			return r.Value == ip
 		})
 	}
-	waitForLatestNetmap := func(t *testing.T, s *Server) {
+	hasRouteToService := func(nm *netmap.NetworkMap, hostNodeKey key.NodePublic) bool {
+		if nm == nil {
+			return false
+		}
+		for _, peer := range nm.Peers {
+			if peer.Key() == hostNodeKey {
+				for _, aip := range peer.AllowedIPs().All() {
+					if aip == servicePrefix {
+						return true
+					}
+				}
+			}
+		}
+		return false
+	}
+	waitForLatestNetmap := func(t *testing.T, s *Server, checkRoute bool, hostNodeKey key.NodePublic) {
 		t.Helper()
 		w := must.Get(s.localClient.WatchIPNBus(t.Context(), ipn.NotifyInitialNetMap))
 		defer w.Close()
-		for n := must.Get(w.Next()); !netmapUpToDate(n.NetMap); n = must.Get(w.Next()) {
+		for {
+			n := must.Get(w.Next())
+			if !hasDNSRecord(n.NetMap) {
+				continue
+			}
+			if checkRoute && !hasRouteToService(n.NetMap, hostNodeKey) {
+				continue
+			}
+			break
 		}
 	}
-	waitForLatestNetmap(t, client)
-	waitForLatestNetmap(t, host)
+	hostNodeKey := host.lb.NodeKey()
+	// Client needs both DNS and route to service host
+	waitForLatestNetmap(t, client, true, hostNodeKey)
+	// Host only needs DNS (it's the one hosting the service)
+	waitForLatestNetmap(t, host, false, key.NodePublic{})
 }

 func TestListenService(t *testing.T) {
@@ -1,7 +1,12 @@
 // Copyright (c) Tailscale Inc & contributors
 // SPDX-License-Identifier: BSD-3-Clause

-//go:build linux && amd64 && !race
+// This file previously had "!race" in its build constraint, which was a CI
+// optimization (qemu binaries aren't installed on the race builder to save
+// time). The constraint was removed because it's not a technical requirement:
+// the test gracefully skips architectures when qemu-{arch} isn't available.
+
+//go:build linux && amd64

 package archtest

@@ -34,7 +34,6 @@ import (
 	"github.com/miekg/dns"
 	"go4.org/mem"
 	"tailscale.com/client/local"
-	"tailscale.com/client/tailscale"
 	"tailscale.com/cmd/testwrapper/flakytest"
 	"tailscale.com/feature"
 	_ "tailscale.com/feature/clientupdate"
@@ -1194,40 +1193,46 @@ func TestClientSideJailing(t *testing.T) {
 		}
 	}

-	b1, err := lc1.WatchIPNBus(context.Background(), 0)
-	if err != nil {
-		t.Fatal(err)
-	}
-	b2, err := lc2.WatchIPNBus(context.Background(), 0)
-	if err != nil {
-		t.Fatal(err)
-	}
-	waitPeerIsJailed := func(t *testing.T, b *tailscale.IPNBusWatcher, jailed bool) {
-		t.Helper()
-		for {
-			n, err := b.Next()
-			if err != nil {
-				t.Fatal(err)
-			}
-			if n.NetMap == nil {
-				continue
-			}
-			if len(n.NetMap.Peers) == 0 {
-				continue
-			}
-			if j := n.NetMap.Peers[0].IsJailed(); j == jailed {
-				break
-			}
-		}
-	}
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
+			// Start watchers BEFORE calling SetJailed to ensure we don't miss the update.
+			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+			b1, err := lc1.WatchIPNBus(ctx, 0)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer b1.Close()
+			b2, err := lc2.WatchIPNBus(ctx, 0)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer b2.Close()
+
 			env.Control.SetJailed(k1, k2, tc.n2JailedForN1)
 			env.Control.SetJailed(k2, k1, tc.n1JailedForN2)

 			// Wait for the jailed status to propagate.
-			waitPeerIsJailed(t, b1, tc.n2JailedForN1)
-			waitPeerIsJailed(t, b2, tc.n1JailedForN2)
+			waitPeerIsJailed := func(b *local.IPNBusWatcher, jailed bool) {
+				t.Helper()
+				for {
+					n, err := b.Next()
+					if err != nil {
+						t.Fatalf("waiting for peer jailed=%v: %v", jailed, err)
+					}
+					if n.NetMap == nil {
+						continue
+					}
+					if len(n.NetMap.Peers) == 0 {
+						continue
+					}
+					if j := n.NetMap.Peers[0].IsJailed(); j == jailed {
+						return
+					}
+				}
+			}
+			waitPeerIsJailed(b1, tc.n2JailedForN1)
+			waitPeerIsJailed(b2, tc.n1JailedForN2)

 			testDial(t, lc1, ip2, port, tc.n1JailedForN2)
 			testDial(t, lc2, ip1, port, tc.n2JailedForN1)
@@ -2050,18 +2055,28 @@ func TestPeerRelayPing(t *testing.T) {
 			a := pair[0]
 			z := pair[1]
 			err := tstest.WaitFor(time.Second*10, func() error {
-				remoteKey := z.MustStatus().Self.PublicKey
+				// Use Status() instead of MustStatus() to avoid calling t.Fatal
+				// from a goroutine, which can cause a race with test cleanup.
+				zSt, err := z.Status()
+				if err != nil {
+					return fmt.Errorf("z.Status: %w", err)
+				}
+				remoteKey := zSt.Self.PublicKey
 				if err := a.Tailscale("ping", "--until-direct=false", "--c=1", "--timeout=1s", z.AwaitIP4().String()).Run(); err != nil {
 					return err
 				}
-				remotePeer, ok := a.MustStatus().Peer[remoteKey]
+				aSt, err := a.Status()
+				if err != nil {
+					return fmt.Errorf("a.Status: %w", err)
+				}
+				remotePeer, ok := aSt.Peer[remoteKey]
 				if !ok {
-					return fmt.Errorf("%v->%v remote peer not found", a.MustStatus().Self.ID, z.MustStatus().Self.ID)
+					return fmt.Errorf("%v->%v remote peer not found", aSt.Self.ID, zSt.Self.ID)
 				}
 				if len(remotePeer.PeerRelay) == 0 {
-					return fmt.Errorf("%v->%v not using peer relay, curAddr=%v relay=%v", a.MustStatus().Self.ID, z.MustStatus().Self.ID, remotePeer.CurAddr, remotePeer.Relay)
+					return fmt.Errorf("%v->%v not using peer relay, curAddr=%v relay=%v", aSt.Self.ID, zSt.Self.ID, remotePeer.CurAddr, remotePeer.Relay)
 				}
-				t.Logf("%v->%v using peer relay addr: %v", a.MustStatus().Self.ID, z.MustStatus().Self.ID, remotePeer.PeerRelay)
+				t.Logf("%v->%v using peer relay addr: %v", aSt.Self.ID, zSt.Self.ID, remotePeer.PeerRelay)
 				return nil
 			})
 			errCh <- err
@@ -2256,6 +2271,8 @@ func TestC2NDebugNetmap(t *testing.T) {
 }

 func TestTailnetLock(t *testing.T) {
+	tstest.Shard(t)
+	tstest.Parallel(t)

 	// If you run `tailscale lock log` on a node where Tailnet Lock isn't
 	// enabled, you get an error explaining that.
@@ -205,7 +205,9 @@ func TestVarzHandler(t *testing.T) {
 			"string_map",
 			func() *expvar.Map {
 				m := new(expvar.Map)
-				m.Set("a", expvar.NewString("foo"))
+				s := new(expvar.String)
+				s.Set("foo")
+				m.Set("a", s)
 				return m
 			}(),
 			"# skipping \"string_map\" expvar map key \"a\" with unknown value type *expvar.String\n",
@@ -26,7 +26,6 @@ import (
 	"tailscale.com/types/key"
 	"tailscale.com/util/deephash/testtype"
 	"tailscale.com/util/hashx"
-	"tailscale.com/version"
 )

 type appendBytes []byte
@@ -777,10 +776,6 @@ func TestMapCyclicFallback(t *testing.T) {
 }

 func TestArrayAllocs(t *testing.T) {
-	if version.IsRace() {
-		t.Skip("skipping test under race detector")
-	}
-
 	// In theory, there should be no allocations. However, escape analysis on
 	// certain architectures fails to detect that certain cases do not escape.
 	// This discrepancy currently affects sha256.digest.Sum.
@@ -801,9 +796,11 @@ func TestArrayAllocs(t *testing.T) {
 		X [32]byte
 	}
 	x := &T{X: [32]byte{1: 1, 2: 2, 3: 3, 4: 4}}
+	var localSink Sum
 	got := int(testing.AllocsPerRun(1000, func() {
-		sink = Hash(x)
+		localSink = Hash(x)
 	}))
+	_ = localSink // prevent compiler from optimizing away the Hash call
 	if got > want {
 		t.Errorf("allocs = %v; want %v", got, want)
 	}
@@ -414,9 +414,11 @@ func TestNewConn(t *testing.T) {
 	stunAddr, stunCleanupFn := stuntest.Serve(t)
 	defer stunCleanupFn()

-	port := pickPort(t)
+	// Use port 0 to let the system assign a port, avoiding TOCTOU races
+	// from the previous pickPort approach which would close a socket and
+	// hope to rebind to the same port.
 	conn, err := NewConn(Options{
-		Port:              port,
+		Port:              0,
 		DisablePortMapper: true,
 		EndpointsFunc:     epFunc,
 		Logf:              t.Logf,
@@ -428,6 +430,13 @@ func TestNewConn(t *testing.T) {
 		t.Fatal(err)
 	}
 	defer conn.Close()

+	// Get the actual port that was assigned
+	port := conn.LocalPort()
+	if port == 0 {
+		t.Fatal("LocalPort returned 0")
+	}
+
 	conn.SetDERPMap(stuntest.DERPMapOf(stunAddr.String()))
 	conn.SetPrivateKey(key.NewNode())
@@ -463,16 +472,6 @@ collectEndpoints:
 	}
 }

-func pickPort(t testing.TB) uint16 {
-	t.Helper()
-	conn, err := net.ListenPacket("udp4", "127.0.0.1:0")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer conn.Close()
-	return uint16(conn.LocalAddr().(*net.UDPAddr).Port)
-}
-
 func TestPickDERPFallback(t *testing.T) {
 	tstest.PanicOnLog()
 	tstest.ResourceCheck(t)
@@ -557,6 +556,8 @@ func TestDERPActiveFuncCalledAfterConnect(t *testing.T) {
 		EndpointsFunc: func([]tailcfg.Endpoint) {},
 		DERPActiveFunc: func() {
+			// derpStarted should already be closed when DERPActiveFunc is called.
+			// The goroutine calling this waits on startGate (= derpStarted for
+			// firstDerp), so by the time we get here, derpStarted must be closed.
 			select {
 			case <-conn.derpStarted:
 				resultCh <- true
@@ -575,8 +576,14 @@ func TestDERPActiveFuncCalledAfterConnect(t *testing.T) {
 		t.Fatal(err)
 	}

-	if ok := <-resultCh; !ok {
-		t.Error("DERPActiveFunc was called before DERP connection was established")
+	// Add a timeout to avoid hanging forever if DERPActiveFunc is never called.
+	select {
+	case ok := <-resultCh:
+		if !ok {
+			t.Error("DERPActiveFunc was called before DERP connection was established")
+		}
+	case <-time.After(10 * time.Second):
+		t.Fatal("timeout waiting for DERPActiveFunc to be called")
 	}
 }
@@ -1435,7 +1442,6 @@ func Test32bitAlignment(t *testing.T) {
 // newTestConn returns a new Conn.
 func newTestConn(t testing.TB) *Conn {
 	t.Helper()
-	port := pickPort(t)

 	bus := eventbustest.NewBus(t)

@@ -1445,6 +1451,7 @@ func newTestConn(t testing.TB) *Conn {
 	}
 	t.Cleanup(func() { netMon.Close() })

+	// Use port 0 to let the system assign a port, avoiding TOCTOU races.
 	conn, err := NewConn(Options{
 		NetMon:   netMon,
 		EventBus: bus,
@@ -1452,7 +1459,7 @@ func newTestConn(t testing.TB) *Conn {
 		Metrics:           new(usermetric.Registry),
 		DisablePortMapper: true,
 		Logf:              t.Logf,
-		Port:              port,
+		Port:              0,
 		TestOnlyPacketListener: localhostListener{},
 		EndpointsFunc: func(eps []tailcfg.Endpoint) {
 			t.Logf("endpoints: %q", eps)
@@ -870,6 +870,10 @@ func TestTCPForwardLimits_PerClient(t *testing.T) {
 		}
 	}

+	// Capture the initial value of the clientmetric before injecting packets,
+	// since it's a global counter that persists across test runs.
+	initialClientMetric := metricPerClientForwardLimit.Value()
+
 	// Inject the packet to start the TCP forward and wait until we have an
 	// in-flight outgoing connection.
 	mustInjectPacket()
@@ -901,9 +905,9 @@ func TestTCPForwardLimits_PerClient(t *testing.T) {
 		t.Errorf("got expvar metric %q=%s, want 1", metricName, v)
 	}

-	// client metric
-	if v := metricPerClientForwardLimit.Value(); v != 1 {
-		t.Errorf("got clientmetric limit metric=%d, want 1", v)
+	// client metric - check it increased by 1 from initial value
+	if v := metricPerClientForwardLimit.Value(); v != initialClientMetric+1 {
+		t.Errorf("got clientmetric limit metric=%d, want %d", v, initialClientMetric+1)
 	}
 }