fix flaky tests: make code more testable and fix race conditions

Changes to support flaky test fixes:

- kube/services: add SetWaitDurationForTest() to avoid hardcoded 20s wait
- net/netcheck: check context before captive portal detection
- net/captivedetection: stop timer before waiting on channel
- util/eventbus/eventbustest: use sync.Once to prevent double-close
- cmd/tailscale/cli: fix noDupFlagify to handle repeated test runs

Change-Id: I9c4af6996c7ff809df5fa8211c4de39a0a9183c9
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
This commit is contained in:
Avery Pennarun 2026-04-13 03:53:51 +02:00
parent b9f468240f
commit ebbf47ca1a
5 changed files with 38 additions and 8 deletions

View File

@ -196,11 +196,17 @@ func (v *onceFlagValue) IsBoolFlag() bool {
// noDupFlagify modifies c recursively to make all the
// flag values be wrappers that permit setting the value
// at most once.
// at most once. If a flag is already wrapped, it resets
// the wrapper's state instead of double-wrapping.
func noDupFlagify(c *ffcli.Command) {
if c.FlagSet != nil {
c.FlagSet.VisitAll(func(f *flag.Flag) {
f.Value = &onceFlagValue{Value: f.Value}
if ofv, ok := f.Value.(*onceFlagValue); ok {
// Already wrapped; reset the flag state
ofv.set = false
} else {
f.Value = &onceFlagValue{Value: f.Value}
}
})
}
for _, sub := range c.Subcommands {

View File

@ -16,6 +16,19 @@ import (
"tailscale.com/types/logger"
)
// serviceWaitDuration is the time to wait for services to propagate after
// advertising/unadvertising. This can be overridden in tests via
// SetWaitDurationForTest.
var serviceWaitDuration = 20 * time.Second
// SetWaitDurationForTest sets the service wait duration and returns a function
// to restore the original value. This should only be used in tests.
func SetWaitDurationForTest(d time.Duration) func() {
old := serviceWaitDuration
serviceWaitDuration = d
return func() { serviceWaitDuration = old }
}
// EnsureServicesAdvertised is a function that gets called on containerboot
// startup and ensures that Services get advertised if they exist.
func EnsureServicesAdvertised(ctx context.Context, services []string, lc localclient.LocalClient, logf logger.Logf) error {
@ -47,7 +60,7 @@ func EnsureServicesAdvertised(ctx context.Context, services []string, lc localcl
select {
case <-ctx.Done():
return nil
case <-time.After(20 * time.Second):
case <-time.After(serviceWaitDuration):
return nil
}
}
@ -94,7 +107,7 @@ func EnsureServicesNotAdvertised(ctx context.Context, lc *local.Client, logf log
select {
case <-ctx.Done():
return nil
case <-time.After(20 * time.Second):
case <-time.After(serviceWaitDuration):
return nil
}
}

View File

@ -93,6 +93,9 @@ func (d *Detector) detectCaptivePortalWithGOOS(ctx context.Context, netMon *netm
// the captive portal alert thrown by the system. If no default route interface is known,
// we need to try with anything that might remotely resemble a Wi-Fi interface.
for ifName, i := range ifState.Interface {
if ctx.Err() != nil {
return false
}
if !i.IsUp() || i.IsLoopback() || interfaceNameDoesNotNeedCaptiveDetection(ifName, goos) {
continue
}

View File

@ -1015,6 +1015,10 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
}
// Wait for captive portal check before finishing the report.
// Try to stop the captive portal check timer in case it hasn't fired yet.
// This is safe to call multiple times - if the timer already fired, the
// goroutine will close the channel when it completes.
captivePortalStop()
<-captivePortalDone
return c.finishAndStoreReport(rs, dm), nil

View File

@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"reflect"
"sync"
"testing"
"time"
@ -43,9 +44,10 @@ func NewWatcher(t *testing.T, bus *eventbus.Bus) *Watcher {
//
// For usage examples, see the documentation in the top of the package.
type Watcher struct {
mon *eventbus.Subscriber[eventbus.RoutedEvent]
events chan any
chDone chan bool
mon *eventbus.Subscriber[eventbus.RoutedEvent]
events chan any
chDone chan bool
doneOnce sync.Once
}
// Type is a helper representing the expectation to see an event of type T, without
@ -174,7 +176,9 @@ func (tw *Watcher) watch() {
// done tells the watcher to stop monitoring for new events.
func (tw *Watcher) done() {
close(tw.chDone)
tw.doneOnce.Do(func() {
close(tw.chDone)
})
}
type filter = func(any) (bool, error)