tstest/natlab/vmtest: serialize per-platform setup with sync.Once

Two cloud-platform nodes (e.g. sr-a and sr-b in TestSiteToSite) boot in
parallel via errgroup and both call ensureCompiled and the inline image
preparation block, racing to Begin() the same shared *Step (which is
deduped by name in Env.Step). The second goroutine panics:

    panic: Step "Compile linux_amd64 binaries": Begin called in state running
    panic: Step "Prepare ubuntu-24.04 image": Begin called in state done

ensureCompiled had a TOCTOU dedup attempt (released compileMu before
doing the work, only added to the compiled set at the end), and image
preparation had no dedup at all.

Replace the compiled set with a per-key map[string]*sync.Once for each
of compile and image preparation, so concurrent callers serialize on
the Once and only the first executes Begin/work/End.

Fixes commit 02ffe5baa8ccb2b81c4cfba3b59653e2cff10e01.

Updates #13038

Change-Id: If710bcc9e0aafebf0ad5b61553bae11458d976d7
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2026-04-29 16:29:20 +00:00 committed by Brad Fitzpatrick
parent 02ffe5baa8
commit fd6ae2fad4
2 changed files with 36 additions and 20 deletions

View File

@ -65,13 +65,9 @@ func (qemuCloudPlatform) boot(ctx context.Context, e *Env, n *Node) error {
e.ensureCompiled(ctx, goos, goarch)
imgStep := e.Step(fmt.Sprintf("Prepare %s image", n.os.Name))
imgStep.Begin()
if err := ensureImage(ctx, n.os); err != nil {
imgStep.End(err)
if err := e.ensureImage(ctx, n.os); err != nil {
return err
}
imgStep.End(nil)
e.ensureQEMUSocket()

View File

@ -46,7 +46,7 @@ import (
"tailscale.com/tstest/integration/testcontrol"
"tailscale.com/tstest/natlab/vnet"
"tailscale.com/types/key"
"tailscale.com/util/set"
"tailscale.com/util/mak"
)
var (
@ -92,7 +92,8 @@ type Env struct {
qemuSockOnce sync.Once
dgramSockOnce sync.Once
compileMu sync.Mutex
compiled set.Set[string]
compileOnce map[string]*sync.Once // keyed by goos_goarch
imageOnce map[string]*sync.Once // keyed by OSImage.Name
// Web UI support.
ctx context.Context // cancelled when test ends
@ -1143,25 +1144,44 @@ func (e *Env) ensureCompiled(ctx context.Context, goos, goarch string) {
key := goos + "_" + goarch
e.compileMu.Lock()
if e.compiled.Contains(key) {
e.compileMu.Unlock()
return
once, ok := e.compileOnce[key]
if !ok {
once = new(sync.Once)
mak.Set(&e.compileOnce, key, once)
}
e.compileMu.Unlock()
step := e.Step(fmt.Sprintf("Compile %s_%s binaries", goos, goarch))
step.Begin()
if err := e.compileBinariesForOS(ctx, goos, goarch); err != nil {
step.End(err)
e.t.Fatalf("compileBinariesForOS(%s, %s): %v", goos, goarch, err)
}
step.End(nil)
e.registerBinaries(goos, goarch)
once.Do(func() {
step := e.Step(fmt.Sprintf("Compile %s_%s binaries", goos, goarch))
step.Begin()
if err := e.compileBinariesForOS(ctx, goos, goarch); err != nil {
step.End(err)
e.t.Fatalf("compileBinariesForOS(%s, %s): %v", goos, goarch, err)
}
step.End(nil)
e.registerBinaries(goos, goarch)
})
}
// ensureImage prepares the cloud image for os and returns any error from the
// preparation. Safe for concurrent use; only prepares once per OS name.
func (e *Env) ensureImage(ctx context.Context, os OSImage) error {
e.compileMu.Lock()
e.compiled.Make()
e.compiled.Add(key)
once, ok := e.imageOnce[os.Name]
if !ok {
once = new(sync.Once)
mak.Set(&e.imageOnce, os.Name, once)
}
e.compileMu.Unlock()
var err error
once.Do(func() {
step := e.Step(fmt.Sprintf("Prepare %s image", os.Name))
step.Begin()
err = ensureImage(ctx, os)
step.End(err)
})
return err
}
// registerBinaries registers compiled binaries with the vnet file server.