From 97e021d8edc8cc38c9c5ace21efbd0e5e096aef7 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Mon, 13 Apr 2026 03:54:59 +0200 Subject: [PATCH] fix flaky tests: improve test isolation and reliability Test fixes for consistent pass rates under repeated runs (-count=N): Global state isolation: - appc: check metric deltas instead of absolute values - cmd/derper: initialize DNS caches before test - tsweb/varz: use unpublished expvar to avoid global registration - wgengine/netstack: check metric deltas instead of absolute values - net/dns: use SetForTest() with deferred restore for hooks Timeout and concurrency: - cmd/containerboot: increase wait loop timeouts for parallel load - tsconsensus: add deadline to waitFor(), use sync.Once for netns - tstest/integration: add tstest.Shard/Parallel, fix IPN bus watchers - net/netcheck: set testCaptivePortalDelay to prevent hangs - wgengine/magicsock: use Port:0, add timeout to callback wait - drive/driveimpl: use http.Server for proper shutdown Race conditions: - tstest/archtest: remove !race from build constraint - util/deephash: use local sink variable instead of package-level - net/art: switch to math/rand/v2 for thread-safe globals - tstest/integration: use Status() instead of MustStatus() from goroutines Test optimization: - net/udprelay: rewrite VNI test to avoid iterating all 16M values - ipn/ipnlocal: reset env vars between subtests - cmd/containerboot/serve: use SetWaitDurationForTest - tsnet: wait for service VIP in AllowedIPs before dialing Change-Id: Id6186fb8a45031920550a208ded77382e57cc016 Co-Authored-By: Claude Opus 4.6 Signed-off-by: Avery Pennarun --- appc/appconnector_test.go | 14 +++- cmd/containerboot/main_test.go | 12 ++-- cmd/containerboot/serve_test.go | 6 ++ cmd/derper/bootstrap_dns_test.go | 7 ++ drive/driveimpl/drive_test.go | 16 +++-- ipn/ipnlocal/cert_test.go | 6 +- net/art/table_test.go | 8 +-- net/dns/direct_linux_test.go | 3 +- net/netcheck/netcheck_test.go | 4 ++ net/udprelay/server.go | 7 +- net/udprelay/server_test.go | 97 ++++++++++++++++++++++---- tsconsensus/tsconsensus_test.go | 29 ++++++-- tsnet/tsnet_test.go | 42 +++++++++-- tstest/archtest/qemu_test.go | 7 +- tstest/integration/integration_test.go | 81 ++++++++++++--------- tsweb/varz/varz_test.go | 4 +- util/deephash/deephash_test.go | 9 +-- wgengine/magicsock/magicsock_test.go | 39 ++++++----- wgengine/netstack/netstack_test.go | 10 ++- 19 files changed, 297 insertions(+), 104 deletions(-) diff --git a/appc/appconnector_test.go b/appc/appconnector_test.go index d14ef68fc..928c4c9b4 100644 --- a/appc/appconnector_test.go +++ b/appc/appconnector_test.go @@ -736,12 +736,19 @@ func TestRateLogger(t *testing.T) { } func TestRouteStoreMetrics(t *testing.T) { + // Capture initial metric values before the test actions, since metrics + // are global counters that persist across test runs. 
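+	// Metrics that are not yet registered when the snapshot is taken are simply
+	// absent from the map and read as zero, so the delta computed below is
+	// still correct for them.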
+ initial := make(map[string]int64) + for _, x := range clientmetric.Metrics() { + initial[x.Name()] = x.Value() + } + metricStoreRoutes(1, 1) metricStoreRoutes(1, 1) // the 1 buckets value should be 2 metricStoreRoutes(5, 5) // the 5 buckets value should be 1 metricStoreRoutes(6, 6) // the 10 buckets value should be 1 metricStoreRoutes(10001, 10001) // the over buckets value should be 1 - wanted := map[string]int64{ + wantedDelta := map[string]int64{ "appc_store_routes_n_routes_1": 2, "appc_store_routes_rate_1": 2, "appc_store_routes_n_routes_5": 1, @@ -752,8 +759,9 @@ func TestRouteStoreMetrics(t *testing.T) { "appc_store_routes_rate_over": 1, } for _, x := range clientmetric.Metrics() { - if x.Value() != wanted[x.Name()] { - t.Errorf("%s: want: %d, got: %d", x.Name(), wanted[x.Name()], x.Value()) + delta := x.Value() - initial[x.Name()] + if delta != wantedDelta[x.Name()] { + t.Errorf("%s: want delta: %d, got delta: %d", x.Name(), wantedDelta[x.Name()], delta) } } } diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index 5ea402f66..c96bcfae5 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -1324,9 +1324,13 @@ func TestContainerBoot(t *testing.T) { } wantCmds = append(wantCmds, p.WantCmds...) - waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n")) + // Use generous timeouts (10s) because tests run in parallel, + // causing resource contention. The polling interval is 100ms, + // so successful tests complete quickly; the timeout only + // affects failure cases. + waitArgs(t, 10*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n")) - err = tstest.WaitFor(2*time.Second, func() error { + err = tstest.WaitFor(10*time.Second, func() error { for path, want := range p.WantFiles { gotBs, err := os.ReadFile(filepath.Join(env.d, path)) if err != nil { @@ -1343,7 +1347,7 @@ func TestContainerBoot(t *testing.T) { } for url, want := range p.EndpointStatuses { - err := tstest.WaitFor(2*time.Second, func() error { + err := tstest.WaitFor(10*time.Second, func() error { resp, err := http.Get(url) if err != nil && want != -1 { return fmt.Errorf("GET %s: %v", url, err) @@ -1361,7 +1365,7 @@ func TestContainerBoot(t *testing.T) { } } } - waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal") + waitLogLine(t, 10*time.Second, cbOut, "Startup complete, waiting for shutdown signal") if cmd.ProcessState != nil { t.Fatalf("containerboot should be running but exited with exit code %d", cmd.ProcessState.ExitCode()) } diff --git a/cmd/containerboot/serve_test.go b/cmd/containerboot/serve_test.go index 5da5ef5f7..6b9569b5c 100644 --- a/cmd/containerboot/serve_test.go +++ b/cmd/containerboot/serve_test.go @@ -10,11 +10,13 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/google/go-cmp/cmp" "tailscale.com/ipn" "tailscale.com/kube/kubetypes" "tailscale.com/kube/localclient" + "tailscale.com/kube/services" "tailscale.com/tailcfg" ) @@ -198,6 +200,10 @@ func TestReadServeConfig(t *testing.T) { } func TestRefreshAdvertiseServices(t *testing.T) { + // Use a very short wait duration for tests to avoid 20-second delays. 
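+	// This runs once, before the table-driven subtests below, so every subtest
+	// uses the short duration; t.Cleanup restores the original value afterwards.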
+ restore := services.SetWaitDurationForTest(1 * time.Millisecond) + t.Cleanup(restore) + tests := []struct { name string sc *ipn.ServeConfig diff --git a/cmd/derper/bootstrap_dns_test.go b/cmd/derper/bootstrap_dns_test.go index 5b765f6d3..170908b47 100644 --- a/cmd/derper/bootstrap_dns_test.go +++ b/cmd/derper/bootstrap_dns_test.go @@ -166,6 +166,13 @@ func TestUnpublishedDNSEmptyList(t *testing.T) { } func TestLookupMetric(t *testing.T) { + // Set up the DNS caches so handleBootstrapDNS returns valid JSON. + pub := &dnsEntryMap{ + IPs: map[string][]net.IP{"tailscale.com": {net.IPv4(10, 10, 10, 10)}}, + } + dnsCache.Store(pub) + dnsCacheBytes.Store([]byte(`{"tailscale.com":["10.10.10.10"]}`)) + d := []string{"a.io", "b.io", "c.io", "d.io", "e.io", "e.io", "e.io", "a.io"} resetMetrics() for _, q := range d { diff --git a/drive/driveimpl/drive_test.go b/drive/driveimpl/drive_test.go index 8f9b43d6b..2a81e5ae4 100644 --- a/drive/driveimpl/drive_test.go +++ b/drive/driveimpl/drive_test.go @@ -434,6 +434,7 @@ type local struct { type remote struct { l net.Listener + httpServer *http.Server fs *FileSystemForRemote fileServer *FileServer shares map[string]string @@ -510,14 +511,15 @@ func (s *system) addRemote(name string) string { s.t.Logf("FileServer for %v listening at %s", name, fileServer.Addr()) r := &remote{ - l: ln, - fileServer: fileServer, - fs: NewFileSystemForRemote(log.Printf), - shares: make(map[string]string), + l: ln, + fileServer: fileServer, + fs: NewFileSystemForRemote(log.Printf), + shares: make(map[string]string), permissions: make(map[string]drive.Permission), } r.fs.SetFileServerAddr(fileServer.Addr()) - go http.Serve(ln, r) + r.httpServer = &http.Server{Handler: r} + go r.httpServer.Serve(ln) s.remotes[name] = r remotes := make([]*drive.Remote, 0, len(s.remotes)) @@ -694,9 +696,9 @@ func (s *system) stop() { s.t.Fatalf("failed to Close remote fs: %s", err) } - err = r.l.Close() + err = r.httpServer.Close() if err != nil { - s.t.Fatalf("failed to Close remote listener: %s", err) + s.t.Fatalf("failed to Close remote http server: %s", err) } err = r.fileServer.Close() diff --git a/ipn/ipnlocal/cert_test.go b/ipn/ipnlocal/cert_test.go index e2d69da52..0ac41fff7 100644 --- a/ipn/ipnlocal/cert_test.go +++ b/ipn/ipnlocal/cert_test.go @@ -518,9 +518,13 @@ func TestGetCertPEMWithValidity(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - + // Always reset the cert share mode at the start of each subtest to ensure + // test isolation. Without this, a previous subtest's "ro" setting would + // leak into subsequent subtests. 
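+			// envknob.Setenv is used rather than os.Setenv because envknob caches
+			// values it has already read; Setenv updates that cache along with the
+			// process environment.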
if tt.readOnlyMode { envknob.Setenv("TS_CERT_SHARE_MODE", "ro") + } else { + envknob.Setenv("TS_CERT_SHARE_MODE", "") } os.RemoveAll(certDir) diff --git a/net/art/table_test.go b/net/art/table_test.go index 5c35ac7da..ce5394981 100644 --- a/net/art/table_test.go +++ b/net/art/table_test.go @@ -6,7 +6,7 @@ package art import ( crand "crypto/rand" "fmt" - "math/rand" + "math/rand/v2" "net/netip" "runtime" "strconv" @@ -1140,7 +1140,7 @@ func randomPrefixes4(n int) []slowPrefixEntry[int] { pfxs := map[netip.Prefix]bool{} for len(pfxs) < n { - len := rand.Intn(33) + len := rand.IntN(33) pfx, err := randomAddr4().Prefix(len) if err != nil { panic(err) @@ -1161,7 +1161,7 @@ func randomPrefixes6(n int) []slowPrefixEntry[int] { pfxs := map[netip.Prefix]bool{} for len(pfxs) < n { - len := rand.Intn(129) + len := rand.IntN(129) pfx, err := randomAddr6().Prefix(len) if err != nil { panic(err) @@ -1179,7 +1179,7 @@ func randomPrefixes6(n int) []slowPrefixEntry[int] { // randomAddr returns a randomly generated IP address. func randomAddr() netip.Addr { - if rand.Intn(2) == 1 { + if rand.IntN(2) == 1 { return randomAddr6() } else { return randomAddr4() diff --git a/net/dns/direct_linux_test.go b/net/dns/direct_linux_test.go index 8199b41f3..0066eca24 100644 --- a/net/dns/direct_linux_test.go +++ b/net/dns/direct_linux_test.go @@ -21,7 +21,8 @@ import ( ) func TestDNSTrampleRecovery(t *testing.T) { - HookWatchFile.Set(watchFile) + restore := HookWatchFile.SetForTest(watchFile) + defer restore() synctest.Test(t, func(t *testing.T) { tmp := t.TempDir() if err := os.MkdirAll(filepath.Join(tmp, "etc"), 0700); err != nil { diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 0fd3460fa..3f5ba4ebd 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -32,6 +32,10 @@ func newTestClient(t testing.TB) *Client { TimeNow: func() time.Time { return time.Unix(1729624521, 0) }, + // Disable captive portal detection in tests by setting a very long + // delay. The captivePortalStop() call before waiting ensures the + // channel is closed even if the timer hasn't fired. + testCaptivePortalDelay: time.Hour, } return c } diff --git a/net/udprelay/server.go b/net/udprelay/server.go index 3b0f72989..b109d126c 100644 --- a/net/udprelay/server.go +++ b/net/udprelay/server.go @@ -94,6 +94,7 @@ type Server struct { closed bool lamportID uint64 nextVNI uint32 + testVNIPoolSize uint32 // if non-zero, overrides totalPossibleVNI for testing // serverEndpointByVNI is consistent with serverEndpointByDisco while mu is // held, i.e. mu must be held around write ops. Read ops in performance // sensitive paths, e.g. packet forwarding, do not need to acquire mu. @@ -977,7 +978,11 @@ func (e ErrServerNotReady) Error() string { // For now, we favor simplicity and reducing VNI re-use over more complex // ephemeral port (VNI) selection algorithms. 
func (s *Server) getNextVNILocked() (uint32, error) { - for range totalPossibleVNI { + poolSize := totalPossibleVNI + if s.testVNIPoolSize > 0 { + poolSize = s.testVNIPoolSize + } + for range poolSize { vni := s.nextVNI if vni == maxVNI { s.nextVNI = minVNI diff --git a/net/udprelay/server_test.go b/net/udprelay/server_test.go index 00b9c2423..855bd707a 100644 --- a/net/udprelay/server_test.go +++ b/net/udprelay/server_test.go @@ -342,22 +342,95 @@ func TestServer(t *testing.T) { func TestServer_getNextVNILocked(t *testing.T) { t.Parallel() c := qt.New(t) + + // Test 1: Basic sequential allocation from minVNI s := &Server{ nextVNI: minVNI, } - for range uint64(totalPossibleVNI) { - vni, err := s.getNextVNILocked() - if err != nil { // using quicktest here triples test time - t.Fatal(err) - } - s.serverEndpointByVNI.Store(vni, nil) - } - c.Assert(s.nextVNI, qt.Equals, minVNI) - _, err := s.getNextVNILocked() - c.Assert(err, qt.IsNotNil) - s.serverEndpointByVNI.Delete(minVNI) - _, err = s.getNextVNILocked() + vni, err := s.getNextVNILocked() c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI) + c.Assert(s.nextVNI, qt.Equals, minVNI+1) + + // Test 2: Wrapping from maxVNI back to minVNI + s = &Server{ + nextVNI: maxVNI, + } + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, maxVNI) + c.Assert(s.nextVNI, qt.Equals, minVNI) // Should wrap to minVNI + + // Test 3: Skips occupied VNIs + s = &Server{ + nextVNI: minVNI, + } + s.serverEndpointByVNI.Store(minVNI, nil) + s.serverEndpointByVNI.Store(minVNI+1, nil) + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI+2) + + // Test 4: Wrapping while skipping occupied VNIs + s = &Server{ + nextVNI: maxVNI - 1, + } + s.serverEndpointByVNI.Store(maxVNI-1, nil) + s.serverEndpointByVNI.Store(maxVNI, nil) + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI) + c.Assert(s.nextVNI, qt.Equals, minVNI+1) + + // Test 5: VNI freeing and re-allocation + // Verify that when VNIs are freed, new allocations continue from nextVNI + // (the freed VNI won't be returned until nextVNI wraps back around to it). + s = &Server{ + nextVNI: minVNI, + } + s.serverEndpointByVNI.Store(minVNI, nil) + s.serverEndpointByVNI.Store(minVNI+1, nil) + s.serverEndpointByVNI.Store(minVNI+2, nil) + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI+3) + + // Delete minVNI+1 and verify we can get VNIs (even though minVNI+1 won't + // be returned next since nextVNI has advanced past it) + s.serverEndpointByVNI.Delete(minVNI + 1) + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI+4) // nextVNI is now at minVNI+4 + + // Test 6: Pool exhaustion with testVNIPoolSize + // Use a small pool size to test the exhaustion error path without + // allocating 16M entries. testVNIPoolSize limits how many VNIs the + // algorithm checks before giving up, simulating a full pool. + // + // We occupy more VNIs than testVNIPoolSize to ensure that no matter + // where nextVNI starts, the search will only find occupied VNIs. 
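+	// (A Geneve VNI is a 24-bit value, so the real pool holds roughly 16.7M
+	// entries; the override keeps this test from walking all of them.)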
+ const testPoolSize = 100 + s = &Server{ + nextVNI: minVNI, + testVNIPoolSize: testPoolSize, + } + // Occupy VNIs [minVNI, minVNI + testPoolSize*2) to ensure the search + // window is fully covered regardless of where nextVNI advances to + for i := uint32(0); i < testPoolSize*2; i++ { + s.serverEndpointByVNI.Store(minVNI+i, nil) + } + // Search starts at minVNI, checks testPoolSize VNIs, all are occupied + _, err = s.getNextVNILocked() + c.Assert(err, qt.IsNotNil) + c.Assert(err.Error(), qt.Equals, "VNI pool exhausted") + + // Test 7: Recovery after freeing a VNI in the search window + // nextVNI is now at minVNI+testPoolSize (after the failed search advanced it) + // Free a VNI that will be found in the next search + s.serverEndpointByVNI.Delete(minVNI + testPoolSize + 10) + vni, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) + c.Assert(vni, qt.Equals, minVNI+testPoolSize+10) } func Test_blakeMACFromBindMsg(t *testing.T) { diff --git a/tsconsensus/tsconsensus_test.go b/tsconsensus/tsconsensus_test.go index 3236ef680..e8c34b2bc 100644 --- a/tsconsensus/tsconsensus_test.go +++ b/tsconsensus/tsconsensus_test.go @@ -40,6 +40,24 @@ import ( "tailscale.com/util/racebuild" ) +func TestMain(m *testing.M) { + // tailscale/corp#4520: don't use netns for tests. + // + // When netns is enabled, it sets socket options (SO_MARK on Linux, + // IP_BOUND_IF on macOS) to bypass Tailscale routing and reach the physical + // network directly. These tests use fake localhost networks with tsnet.Server + // and testcontrol.Server. With netns enabled: + // - Socket options may fail (non-root can't set SO_MARK on Linux) + // - Traffic may route to real interfaces instead of test localhost servers + // + // We disable netns once in TestMain rather than per-test because the previous + // pattern of disabling in each test and restoring in t.Cleanup caused races: + // when tests run in parallel, Test A's cleanup would restore netns while + // Test B was still running with netns disabled. + netns.SetEnabled(false) + os.Exit(m.Run()) +} + type fsm struct { mu sync.Mutex applyEvents []string @@ -125,11 +143,7 @@ func testConfig(t *testing.T) { func startControl(t testing.TB) (control *testcontrol.Server, controlURL string) { t.Helper() - // tailscale/corp#4520: don't use netns for tests. - netns.SetEnabled(false) - t.Cleanup(func() { - netns.SetEnabled(true) - }) + // netns is disabled for this package in TestMain. derpLogf := logger.Discard derpMap := integration.RunDERPAndSTUN(t, derpLogf, "127.0.0.1") @@ -269,8 +283,10 @@ func TestStart(t *testing.T) { func waitFor(t testing.TB, msg string, condition func() bool, waitBetweenTries time.Duration) { t.Helper() + // Use a deadline to prevent infinite waiting under load + deadline := time.Now().Add(2 * time.Minute) try := 0 - for true { + for time.Now().Before(deadline) { try++ done := condition() if done { @@ -279,6 +295,7 @@ func waitFor(t testing.TB, msg string, condition func() bool, waitBetweenTries t } time.Sleep(waitBetweenTries) } + t.Fatalf("waitFor timeout: %s: failed after %d tries over 2 minutes", msg, try) } type participant struct { diff --git a/tsnet/tsnet_test.go b/tsnet/tsnet_test.go index 4ed20fb2a..e9d614080 100644 --- a/tsnet/tsnet_test.go +++ b/tsnet/tsnet_test.go @@ -1012,21 +1012,51 @@ func setUpServiceState(t *testing.T, name, ip string, host, client *Server, }) // Wait until both nodes have up-to-date netmaps before - // proceeding with the test. - netmapUpToDate := func(nm *netmap.NetworkMap) bool { + // proceeding with the test. 
We need to check both: + // 1. DNS records contain the service IP (sentinel for netmap propagation) + // 2. For the client: the service host peer has the service VIP in AllowedIPs + // (required for routing to work) + servicePrefix := netip.MustParsePrefix(ip + "/32") + hasDNSRecord := func(nm *netmap.NetworkMap) bool { return nm != nil && slices.ContainsFunc(nm.DNS.ExtraRecords, func(r tailcfg.DNSRecord) bool { return r.Value == ip }) } - waitForLatestNetmap := func(t *testing.T, s *Server) { + hasRouteToService := func(nm *netmap.NetworkMap, hostNodeKey key.NodePublic) bool { + if nm == nil { + return false + } + for _, peer := range nm.Peers { + if peer.Key() == hostNodeKey { + for _, aip := range peer.AllowedIPs().All() { + if aip == servicePrefix { + return true + } + } + } + } + return false + } + waitForLatestNetmap := func(t *testing.T, s *Server, checkRoute bool, hostNodeKey key.NodePublic) { t.Helper() w := must.Get(s.localClient.WatchIPNBus(t.Context(), ipn.NotifyInitialNetMap)) defer w.Close() - for n := must.Get(w.Next()); !netmapUpToDate(n.NetMap); n = must.Get(w.Next()) { + for { + n := must.Get(w.Next()) + if !hasDNSRecord(n.NetMap) { + continue + } + if checkRoute && !hasRouteToService(n.NetMap, hostNodeKey) { + continue + } + break } } - waitForLatestNetmap(t, client) - waitForLatestNetmap(t, host) + hostNodeKey := host.lb.NodeKey() + // Client needs both DNS and route to service host + waitForLatestNetmap(t, client, true, hostNodeKey) + // Host only needs DNS (it's the one hosting the service) + waitForLatestNetmap(t, host, false, key.NodePublic{}) } func TestListenService(t *testing.T) { diff --git a/tstest/archtest/qemu_test.go b/tstest/archtest/qemu_test.go index 400f8bc4f..a92c31c66 100644 --- a/tstest/archtest/qemu_test.go +++ b/tstest/archtest/qemu_test.go @@ -1,7 +1,12 @@ // Copyright (c) Tailscale Inc & contributors // SPDX-License-Identifier: BSD-3-Clause -//go:build linux && amd64 && !race +// This file previously had "!race" in its build constraint, which was a CI +// optimization (qemu binaries aren't installed on the race builder to save +// time). The constraint was removed because it's not a technical requirement: +// the test gracefully skips architectures when qemu-{arch} isn't available. + +//go:build linux && amd64 package archtest diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 0d0fdeeef..f3cdcd421 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -34,7 +34,6 @@ import ( "github.com/miekg/dns" "go4.org/mem" "tailscale.com/client/local" - "tailscale.com/client/tailscale" "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/feature" _ "tailscale.com/feature/clientupdate" @@ -1194,40 +1193,46 @@ func TestClientSideJailing(t *testing.T) { } } - b1, err := lc1.WatchIPNBus(context.Background(), 0) - if err != nil { - t.Fatal(err) - } - b2, err := lc2.WatchIPNBus(context.Background(), 0) - if err != nil { - t.Fatal(err) - } - waitPeerIsJailed := func(t *testing.T, b *tailscale.IPNBusWatcher, jailed bool) { - t.Helper() - for { - n, err := b.Next() + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Start watchers BEFORE calling SetJailed to ensure we don't miss the update. 
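+			// The context bounds every Next() call below, so a missing update fails
+			// the subtest after 5s instead of hanging it.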
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b1, err := lc1.WatchIPNBus(ctx, 0) if err != nil { t.Fatal(err) } - if n.NetMap == nil { - continue + defer b1.Close() + b2, err := lc2.WatchIPNBus(ctx, 0) + if err != nil { + t.Fatal(err) } - if len(n.NetMap.Peers) == 0 { - continue - } - if j := n.NetMap.Peers[0].IsJailed(); j == jailed { - break - } - } - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { + defer b2.Close() + env.Control.SetJailed(k1, k2, tc.n2JailedForN1) env.Control.SetJailed(k2, k1, tc.n1JailedForN2) // Wait for the jailed status to propagate. - waitPeerIsJailed(t, b1, tc.n2JailedForN1) - waitPeerIsJailed(t, b2, tc.n1JailedForN2) + waitPeerIsJailed := func(b *local.IPNBusWatcher, jailed bool) { + t.Helper() + for { + n, err := b.Next() + if err != nil { + t.Fatalf("waiting for peer jailed=%v: %v", jailed, err) + } + if n.NetMap == nil { + continue + } + if len(n.NetMap.Peers) == 0 { + continue + } + if j := n.NetMap.Peers[0].IsJailed(); j == jailed { + return + } + } + } + waitPeerIsJailed(b1, tc.n2JailedForN1) + waitPeerIsJailed(b2, tc.n1JailedForN2) testDial(t, lc1, ip2, port, tc.n1JailedForN2) testDial(t, lc2, ip1, port, tc.n2JailedForN1) @@ -2050,18 +2055,28 @@ func TestPeerRelayPing(t *testing.T) { a := pair[0] z := pair[1] err := tstest.WaitFor(time.Second*10, func() error { - remoteKey := z.MustStatus().Self.PublicKey + // Use Status() instead of MustStatus() to avoid calling t.Fatal + // from a goroutine, which can cause a race with test cleanup. + zSt, err := z.Status() + if err != nil { + return fmt.Errorf("z.Status: %w", err) + } + remoteKey := zSt.Self.PublicKey if err := a.Tailscale("ping", "--until-direct=false", "--c=1", "--timeout=1s", z.AwaitIP4().String()).Run(); err != nil { return err } - remotePeer, ok := a.MustStatus().Peer[remoteKey] + aSt, err := a.Status() + if err != nil { + return fmt.Errorf("a.Status: %w", err) + } + remotePeer, ok := aSt.Peer[remoteKey] if !ok { - return fmt.Errorf("%v->%v remote peer not found", a.MustStatus().Self.ID, z.MustStatus().Self.ID) + return fmt.Errorf("%v->%v remote peer not found", aSt.Self.ID, zSt.Self.ID) } if len(remotePeer.PeerRelay) == 0 { - return fmt.Errorf("%v->%v not using peer relay, curAddr=%v relay=%v", a.MustStatus().Self.ID, z.MustStatus().Self.ID, remotePeer.CurAddr, remotePeer.Relay) + return fmt.Errorf("%v->%v not using peer relay, curAddr=%v relay=%v", aSt.Self.ID, zSt.Self.ID, remotePeer.CurAddr, remotePeer.Relay) } - t.Logf("%v->%v using peer relay addr: %v", a.MustStatus().Self.ID, z.MustStatus().Self.ID, remotePeer.PeerRelay) + t.Logf("%v->%v using peer relay addr: %v", aSt.Self.ID, zSt.Self.ID, remotePeer.PeerRelay) return nil }) errCh <- err @@ -2256,6 +2271,8 @@ func TestC2NDebugNetmap(t *testing.T) { } func TestTailnetLock(t *testing.T) { + tstest.Shard(t) + tstest.Parallel(t) // If you run `tailscale lock log` on a node where Tailnet Lock isn't // enabled, you get an error explaining that. 
diff --git a/tsweb/varz/varz_test.go b/tsweb/varz/varz_test.go index d041edb4b..f15983c47 100644 --- a/tsweb/varz/varz_test.go +++ b/tsweb/varz/varz_test.go @@ -205,7 +205,9 @@ func TestVarzHandler(t *testing.T) { "string_map", func() *expvar.Map { m := new(expvar.Map) - m.Set("a", expvar.NewString("foo")) + s := new(expvar.String) + s.Set("foo") + m.Set("a", s) return m }(), "# skipping \"string_map\" expvar map key \"a\" with unknown value type *expvar.String\n", diff --git a/util/deephash/deephash_test.go b/util/deephash/deephash_test.go index 7ea83566f..c4f390b16 100644 --- a/util/deephash/deephash_test.go +++ b/util/deephash/deephash_test.go @@ -26,7 +26,6 @@ import ( "tailscale.com/types/key" "tailscale.com/util/deephash/testtype" "tailscale.com/util/hashx" - "tailscale.com/version" ) type appendBytes []byte @@ -777,10 +776,6 @@ func TestMapCyclicFallback(t *testing.T) { } func TestArrayAllocs(t *testing.T) { - if version.IsRace() { - t.Skip("skipping test under race detector") - } - // In theory, there should be no allocations. However, escape analysis on // certain architectures fails to detect that certain cases do not escape. // This discrepancy currently affects sha256.digest.Sum. @@ -801,9 +796,11 @@ func TestArrayAllocs(t *testing.T) { X [32]byte } x := &T{X: [32]byte{1: 1, 2: 2, 3: 3, 4: 4}} + var localSink Sum got := int(testing.AllocsPerRun(1000, func() { - sink = Hash(x) + localSink = Hash(x) })) + _ = localSink // prevent compiler from optimizing away the Hash call if got > want { t.Errorf("allocs = %v; want %v", got, want) } diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index b7492b867..f3a154245 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -414,9 +414,11 @@ func TestNewConn(t *testing.T) { stunAddr, stunCleanupFn := stuntest.Serve(t) defer stunCleanupFn() - port := pickPort(t) + // Use port 0 to let the system assign a port, avoiding TOCTOU races + // from the previous pickPort approach which would close a socket and + // hope to rebind to the same port. conn, err := NewConn(Options{ - Port: port, + Port: 0, DisablePortMapper: true, EndpointsFunc: epFunc, Logf: t.Logf, @@ -428,6 +430,13 @@ func TestNewConn(t *testing.T) { t.Fatal(err) } defer conn.Close() + + // Get the actual port that was assigned + port := conn.LocalPort() + if port == 0 { + t.Fatal("LocalPort returned 0") + } + conn.SetDERPMap(stuntest.DERPMapOf(stunAddr.String())) conn.SetPrivateKey(key.NewNode()) @@ -463,16 +472,6 @@ collectEndpoints: } } -func pickPort(t testing.TB) uint16 { - t.Helper() - conn, err := net.ListenPacket("udp4", "127.0.0.1:0") - if err != nil { - t.Fatal(err) - } - defer conn.Close() - return uint16(conn.LocalAddr().(*net.UDPAddr).Port) -} - func TestPickDERPFallback(t *testing.T) { tstest.PanicOnLog() tstest.ResourceCheck(t) @@ -557,6 +556,8 @@ func TestDERPActiveFuncCalledAfterConnect(t *testing.T) { EndpointsFunc: func([]tailcfg.Endpoint) {}, DERPActiveFunc: func() { // derpStarted should already be closed when DERPActiveFunc is called. + // The goroutine calling this waits on startGate (= derpStarted for + // firstDerp), so by the time we get here, derpStarted must be closed. 
select { case <-conn.derpStarted: resultCh <- true @@ -575,8 +576,14 @@ func TestDERPActiveFuncCalledAfterConnect(t *testing.T) { t.Fatal(err) } - if ok := <-resultCh; !ok { - t.Error("DERPActiveFunc was called before DERP connection was established") + // Add a timeout to avoid hanging forever if DERPActiveFunc is never called. + select { + case ok := <-resultCh: + if !ok { + t.Error("DERPActiveFunc was called before DERP connection was established") + } + case <-time.After(10 * time.Second): + t.Fatal("timeout waiting for DERPActiveFunc to be called") } } @@ -1435,7 +1442,6 @@ func Test32bitAlignment(t *testing.T) { // newTestConn returns a new Conn. func newTestConn(t testing.TB) *Conn { t.Helper() - port := pickPort(t) bus := eventbustest.NewBus(t) @@ -1445,6 +1451,7 @@ func newTestConn(t testing.TB) *Conn { } t.Cleanup(func() { netMon.Close() }) + // Use port 0 to let the system assign a port, avoiding TOCTOU races. conn, err := NewConn(Options{ NetMon: netMon, EventBus: bus, @@ -1452,7 +1459,7 @@ func newTestConn(t testing.TB) *Conn { Metrics: new(usermetric.Registry), DisablePortMapper: true, Logf: t.Logf, - Port: port, + Port: 0, TestOnlyPacketListener: localhostListener{}, EndpointsFunc: func(eps []tailcfg.Endpoint) { t.Logf("endpoints: %q", eps) diff --git a/wgengine/netstack/netstack_test.go b/wgengine/netstack/netstack_test.go index e588fa47c..074419ac5 100644 --- a/wgengine/netstack/netstack_test.go +++ b/wgengine/netstack/netstack_test.go @@ -870,6 +870,10 @@ func TestTCPForwardLimits_PerClient(t *testing.T) { } } + // Capture the initial value of the clientmetric before injecting packets, + // since it's a global counter that persists across test runs. + initialClientMetric := metricPerClientForwardLimit.Value() + // Inject the packet to start the TCP forward and wait until we have an // in-flight outgoing connection. mustInjectPacket() @@ -901,9 +905,9 @@ func TestTCPForwardLimits_PerClient(t *testing.T) { t.Errorf("got expvar metric %q=%s, want 1", metricName, v) } - // client metric - if v := metricPerClientForwardLimit.Value(); v != 1 { - t.Errorf("got clientmetric limit metric=%d, want 1", v) + // client metric - check it increased by 1 from initial value + if v := metricPerClientForwardLimit.Value(); v != initialClientMetric+1 { + t.Errorf("got clientmetric limit metric=%d, want %d", v, initialClientMetric+1) } }