diff --git a/cmd/tta/ipassign_darwin.go b/cmd/tta/ipassign_darwin.go new file mode 100644 index 000000000..69a178956 --- /dev/null +++ b/cmd/tta/ipassign_darwin.go @@ -0,0 +1,135 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build darwin + +package main + +import ( + "encoding/json" + "fmt" + "log" + "net" + "os/exec" + "strconv" + "time" + "unsafe" + + "golang.org/x/sys/unix" + "tailscale.com/tstest/natlab/vnet" +) + +const ( + afVSOCK = 40 // AF_VSOCK on macOS + vmaddrCIDHost = 2 // VMADDR_CID_HOST + vsockPort = 51011 // port for IP assignment protocol +) + +// sockaddrVM is the Go equivalent of struct sockaddr_vm from . +type sockaddrVM struct { + Len uint8 + Family uint8 + Reserved1 uint16 + Port uint32 + CID uint32 +} + +type netConfig struct { + IP string `json:"ip"` + Mask string `json:"mask"` + GW string `json:"gw"` +} + +// startIPAssignLoop starts a background goroutine that polls the host +// via the virtio socket for an IP assignment. When the host responds +// with a JSON config (rather than "wait"), TTA sets the IP statically +// using ifconfig and stops polling. +func startIPAssignLoop() { + go ipAssignLoop() +} + +func ipAssignLoop() { + log.Printf("ipassign: starting vsock poll loop") + var lastErr string + for attempt := 0; ; attempt++ { + resp, err := askHostForIP() + if err != nil { + if e := err.Error(); e != lastErr { + log.Printf("ipassign: attempt %d: %v", attempt, err) + lastErr = e + } + time.Sleep(500 * time.Millisecond) + continue + } + if resp == "wait" { + time.Sleep(500 * time.Millisecond) + continue + } + var nc netConfig + if err := json.Unmarshal([]byte(resp), &nc); err != nil { + log.Printf("ipassign: bad config: %v", err) + time.Sleep(500 * time.Millisecond) + continue + } + if err := setStaticIP(nc); err != nil { + log.Printf("ipassign: %v", err) + time.Sleep(500 * time.Millisecond) + continue + } + log.Printf("ipassign: configured en0 with %s/%s gw %s", nc.IP, nc.Mask, nc.GW) + + // Switch the driver address from the DNS name to the IP directly + // (avoids DNS resolution delay) and kick the dial-out loop so it + // retries immediately with the new address. + ipAddr := net.JoinHostPort(vnet.TestDriverIPv4().String(), strconv.Itoa(vnet.TestDriverPort)) + *driverAddr = ipAddr + log.Printf("ipassign: switched driver addr to %s", ipAddr) + resetDialCancels() + return + } +} + +// askHostForIP connects to the host via AF_VSOCK and reads the response. +func askHostForIP() (string, error) { + fd, err := unix.Socket(afVSOCK, unix.SOCK_STREAM, 0) + if err != nil { + return "", fmt.Errorf("socket: %w", err) + } + defer unix.Close(fd) + + // Set a short connect+read timeout via SO_RCVTIMEO. + tv := unix.Timeval{Sec: 1} + unix.SetsockoptTimeval(fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv) + + addr := sockaddrVM{ + Len: uint8(unsafe.Sizeof(sockaddrVM{})), + Family: afVSOCK, + Port: vsockPort, + CID: vmaddrCIDHost, + } + _, _, errno := unix.RawSyscall(unix.SYS_CONNECT, uintptr(fd), + uintptr(unsafe.Pointer(&addr)), unsafe.Sizeof(addr)) + if errno != 0 { + return "", fmt.Errorf("connect: %w", errno) + } + + var buf [1024]byte + n, err := unix.Read(fd, buf[:]) + if err != nil { + return "", fmt.Errorf("read: %w", err) + } + return string(buf[:n]), nil +} + +// setStaticIP configures en0 with a static IP address and default route. +func setStaticIP(nc netConfig) error { + out, err := exec.Command("ifconfig", "en0", nc.IP, "netmask", nc.Mask, "up").CombinedOutput() + if err != nil { + return fmt.Errorf("ifconfig: %v: %s", err, out) + } + out, err = exec.Command("route", "add", "default", nc.GW).CombinedOutput() + if err != nil { + return fmt.Errorf("route add: %v: %s", err, out) + } + return nil +} diff --git a/cmd/tta/ipassign_other.go b/cmd/tta/ipassign_other.go new file mode 100644 index 000000000..dc331b5e0 --- /dev/null +++ b/cmd/tta/ipassign_other.go @@ -0,0 +1,14 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !darwin + +package main + +// startIPAssignLoop is a no-op on non-macOS platforms. +// macOS VMs use vsock-based IP assignment to bypass slow DHCP. +func startIPAssignLoop() {} + +// Reference resetDialCancels to prevent unused-function lint errors. +// It's called from ipassign_darwin.go on macOS builds. +var _ = resetDialCancels diff --git a/cmd/tta/tta.go b/cmd/tta/tta.go index 56cf894cb..4d59b827b 100644 --- a/cmd/tta/tta.go +++ b/cmd/tta/tta.go @@ -105,6 +105,10 @@ func main() { } flag.Parse() + // On macOS VMs, start polling the host via vsock for an IP assignment. + // This bypasses DHCP for near-instant network configuration. + startIPAssignLoop() + debug := false if distro.Get() == distro.Gokrazy { cmdLine, _ := os.ReadFile("/proc/cmdline") @@ -408,12 +412,48 @@ func main() { revSt.runDialOutLoop(conns) } +// dialCancels tracks cancel funcs for in-flight connect() and sleep contexts. +// resetDialCancels cancels them all so the dial loop retries immediately. +var ( + dialCancelMu sync.Mutex + dialCancels set.HandleSet[context.CancelFunc] +) + +// registerDialCancel adds a cancel func and returns a handle for removal. +func registerDialCancel(cancel context.CancelFunc) set.Handle { + dialCancelMu.Lock() + defer dialCancelMu.Unlock() + return dialCancels.Add(cancel) +} + +// unregisterDialCancel removes a previously registered cancel func. +func unregisterDialCancel(h set.Handle) { + dialCancelMu.Lock() + defer dialCancelMu.Unlock() + delete(dialCancels, h) +} + +// resetDialCancels cancels all in-flight connect and sleep contexts, +// causing the dial loop to retry immediately with the updated driver address. +func resetDialCancels() { + dialCancelMu.Lock() + defer dialCancelMu.Unlock() + for h, cancel := range dialCancels { + cancel() + delete(dialCancels, h) + } +} + func connect() (net.Conn, error) { d := net.Dialer{ Control: bypassControlFunc, } ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() + h := registerDialCancel(cancel) + defer func() { + cancel() + unregisterDialCancel(h) + }() c, err := d.DialContext(ctx, "tcp", *driverAddr) if err != nil { return nil, err @@ -510,7 +550,11 @@ func (s *revDialState) runDialOutLoop(conns chan<- net.Conn) { log.Printf("[dial-driver] connect failure: %v", s) } lastErr = s - time.Sleep(time.Second) + sleepCtx, sleepCancel := context.WithTimeout(context.Background(), time.Second) + h := registerDialCancel(sleepCancel) + <-sleepCtx.Done() + sleepCancel() + unregisterDialCancel(h) continue } if !connected { diff --git a/tstest/natlab/vmtest/qemu.go b/tstest/natlab/vmtest/qemu.go index a2ccd780c..3726c17fe 100644 --- a/tstest/natlab/vmtest/qemu.go +++ b/tstest/natlab/vmtest/qemu.go @@ -20,12 +20,69 @@ import ( "tailscale.com/tstest/natlab/vnet" ) -// startQEMU launches a QEMU process for the given node. -func (e *Env) startQEMU(n *Node) error { - if n.os.IsGokrazy { - return e.startGokrazyQEMU(n) +// gokrazyPlatform boots gokrazy (Linux) VMs via QEMU. +type gokrazyPlatform struct{} + +func (gokrazyPlatform) planSteps(e *Env, n *Node) { + e.Step("Build gokrazy image") + e.Step("Launch QEMU: " + n.name) +} + +func (gokrazyPlatform) boot(ctx context.Context, e *Env, n *Node) error { + e.gokrazyOnce.Do(func() { + step := e.Step("Build gokrazy image") + step.Begin() + if err := e.ensureGokrazy(ctx); err != nil { + step.End(err) + e.t.Fatalf("ensureGokrazy: %v", err) + } + step.End(nil) + }) + + e.ensureQEMUSocket() + + vmStep := e.Step("Launch QEMU: " + n.name) + vmStep.Begin() + if err := e.startGokrazyQEMU(n); err != nil { + vmStep.End(err) + return err } - return e.startCloudQEMU(n) + vmStep.End(nil) + return nil +} + +// qemuCloudPlatform boots cloud images (Ubuntu, Debian, FreeBSD) via QEMU. +type qemuCloudPlatform struct{} + +func (qemuCloudPlatform) planSteps(e *Env, n *Node) { + e.Step(fmt.Sprintf("Compile %s_%s binaries", n.os.GOOS(), n.os.GOARCH())) + e.Step(fmt.Sprintf("Prepare %s image", n.os.Name)) + e.Step("Launch QEMU: " + n.name) +} + +func (qemuCloudPlatform) boot(ctx context.Context, e *Env, n *Node) error { + goos, goarch := n.os.GOOS(), n.os.GOARCH() + + e.ensureCompiled(ctx, goos, goarch) + + imgStep := e.Step(fmt.Sprintf("Prepare %s image", n.os.Name)) + imgStep.Begin() + if err := ensureImage(ctx, n.os); err != nil { + imgStep.End(err) + return err + } + imgStep.End(nil) + + e.ensureQEMUSocket() + + vmStep := e.Step("Launch QEMU: " + n.name) + vmStep.Begin() + if err := e.startCloudQEMU(n); err != nil { + vmStep.End(err) + return err + } + vmStep.End(nil) + return nil } // startGokrazyQEMU launches a QEMU process for a gokrazy node. diff --git a/tstest/natlab/vmtest/tailmac.go b/tstest/natlab/vmtest/tailmac.go index 0ad47a9af..167feeb04 100644 --- a/tstest/natlab/vmtest/tailmac.go +++ b/tstest/natlab/vmtest/tailmac.go @@ -5,31 +5,76 @@ package vmtest import ( "bufio" + "bytes" "context" "encoding/base64" "encoding/json" "fmt" "io" + "net" "net/http" + "net/netip" "os" "os/exec" "path/filepath" "strings" "testing" "time" + + "golang.org/x/crypto/ssh" ) +// macPlatform boots macOS VMs via Tart base images and tailmac Host.app. +type macPlatform struct{} + +func (macPlatform) planSteps(e *Env, n *Node) { + e.Step("Prepare macOS Tart image") + e.Step("Launch macOS VM: " + n.name) +} + +func (macPlatform) boot(ctx context.Context, e *Env, n *Node) error { + imgStep := e.Step("Prepare macOS Tart image") + e.macosSnapshotOnce.Do(func() { + imgStep.Begin() + e.macosSnapshot = ensureSnapshot(e.t) + imgStep.End(nil) + }) + + e.ensureDgramSocket() + + vmStep := e.Step("Launch macOS VM: " + n.name) + vmStep.Begin() + if err := e.startTailMacVM(n); err != nil { + vmStep.End(err) + return err + } + vmStep.End(nil) + return nil +} + const tartImage = "ghcr.io/cirruslabs/macos-tahoe-base:latest" +// macOSSnapshotCodeVersion is bumped when the snapshot preparation logic +// changes in a way that invalidates old snapshots. Old snapshots with a +// different version are cleaned up automatically. +const macOSSnapshotCodeVersion = 5 + // tartConfig is the subset of Tart's config.json we need. type tartConfig struct { HardwareModel string `json:"hardwareModel"` // base64 ECID string `json:"ecid"` // base64 } +// tartManifest is the subset of Tart's OCI manifest.json we need. +type tartManifest struct { + Config struct { + Digest string `json:"digest"` // e.g. "sha256:3a6cb4eb6201..." + } `json:"config"` +} + // ensureTartImage checks that the Tart base image is available, pulling it -// if necessary. Returns the path to a directory containing disk.img, -// nvram.bin, and config.json. +// if necessary. Returns the path to the OCI cache directory containing +// disk.img, nvram.bin, config.json, and manifest.json. func ensureTartImage(t testing.TB) string { if _, err := exec.LookPath("tart"); err != nil { t.Skip("tart not installed; skipping macOS VM test") @@ -40,7 +85,6 @@ func ensureTartImage(t testing.TB) string { t.Fatalf("UserHomeDir: %v", err) } - // Check OCI cache first (from a previous "tart pull"). ociDir := filepath.Join(home, ".tart", "cache", "OCIs", "ghcr.io", "cirruslabs", "macos-tahoe-base", "latest") if _, err := os.Stat(filepath.Join(ociDir, "disk.img")); err == nil { @@ -55,7 +99,6 @@ func ensureTartImage(t testing.TB) string { t.Fatalf("tart pull: %v", err) } - // After pull, the OCI cache should have it. if _, err := os.Stat(filepath.Join(ociDir, "disk.img")); err == nil { return ociDir } @@ -63,6 +106,368 @@ func ensureTartImage(t testing.TB) string { return "" } +// snapshotCacheKey computes a cache key for the macOS VM snapshot. +// The key combines the image name, the first 12 hex chars of the Tart +// config digest (changes when the upstream image is updated), and the +// snapshot code version (changes when our prep logic changes). +func snapshotCacheKey(tartDir string) (string, error) { + manifestPath := filepath.Join(tartDir, "manifest.json") + data, err := os.ReadFile(manifestPath) + if err != nil { + return "", fmt.Errorf("reading manifest: %w", err) + } + var m tartManifest + if err := json.Unmarshal(data, &m); err != nil { + return "", fmt.Errorf("parsing manifest: %w", err) + } + digest := m.Config.Digest + // Strip "sha256:" prefix and take first 12 hex chars. + digest = strings.TrimPrefix(digest, "sha256:") + if len(digest) > 12 { + digest = digest[:12] + } + return fmt.Sprintf("snap-tahoe-%s-v%d", digest, macOSSnapshotCodeVersion), nil +} + +// macosVMBaseDir returns ~/.cache/tailscale/vmtest/macos/, the directory +// where Host.app expects to find VM directories by ID. +func macosVMBaseDir() (string, error) { + home, err := os.UserHomeDir() + if err != nil { + return "", err + } + return filepath.Join(home, ".cache", "tailscale", "vmtest", "macos"), nil +} + +// cleanOldSnapshots removes any snapshot directories for the given image +// prefix (e.g. "snap-tahoe") that don't match the current cache key. +func cleanOldSnapshots(t testing.TB, imagePrefix, currentKey string) { + base, err := macosVMBaseDir() + if err != nil { + return + } + matches, _ := filepath.Glob(filepath.Join(base, imagePrefix+"-*")) + currentPath := filepath.Join(base, currentKey) + for _, m := range matches { + if m != currentPath { + t.Logf("removing stale snapshot: %s", filepath.Base(m)) + os.RemoveAll(m) + } + } +} + +// ensureSnapshot returns the path to a cached macOS VM snapshot, creating +// one if necessary. The snapshot contains a fully booted VM with +// SaveFile.vzvmsave ready for fast restore. +func ensureSnapshot(t testing.TB) string { + tartDir := ensureTartImage(t) + + key, err := snapshotCacheKey(tartDir) + if err != nil { + t.Fatalf("snapshot cache key: %v", err) + } + + base, err := macosVMBaseDir() + if err != nil { + t.Fatalf("macOS VM base dir: %v", err) + } + os.MkdirAll(base, 0755) + + snapDir := filepath.Join(base, key) + saveFile := filepath.Join(snapDir, "SaveFile.vzvmsave") + if _, err := os.Stat(saveFile); err == nil { + t.Logf("using cached macOS snapshot: %s", key) + return snapDir + } + + // Clean up old snapshots for this image. + cleanOldSnapshots(t, "snap-tahoe", key) + + t.Logf("preparing macOS snapshot: %s (this takes ~30s on first run)", key) + if err := prepareSnapshot(t, tartDir, snapDir); err != nil { + os.RemoveAll(snapDir) + t.Fatalf("preparing snapshot: %v", err) + } + return snapDir +} + +// prepareSnapshot creates a new macOS VM snapshot by booting the Tart base +// image with a NAT NIC, waiting for SSH, and saving VM state. +func prepareSnapshot(t testing.TB, tartDir, snapDir string) error { + // The vmID must match the directory name under macosVMBaseDir + // because Host.app looks up VM files at //. + snapID := filepath.Base(snapDir) + + if err := cloneTartToTailmac(tartDir, snapDir, snapID, "52:cc:cc:cc:ce:01", "/dev/null"); err != nil { + return fmt.Errorf("cloning tart: %w", err) + } + + modRoot, err := findModRoot() + if err != nil { + return err + } + tailmacDir := filepath.Join(modRoot, "tstest", "tailmac", "bin") + hostBin := filepath.Join(tailmacDir, "Host.app", "Contents", "MacOS", "Host") + if _, err := os.Stat(hostBin); err != nil { + return fmt.Errorf("Host.app not found at %s; run 'make all' in tstest/tailmac/", hostBin) + } + + // Host.app reads VM files from ~/.cache/tailscale/vmtest/macos//. + // Our snapDir is already under that tree, and the config.json vmID matches. + cmd := exec.Command(hostBin, "run", "--id", snapID, "--headless", "--nat-nic") + cmd.Env = append(os.Environ(), "NSUnbufferedIO=YES") + + logPath := snapDir + ".prep.log" + logFile, err := os.Create(logPath) + if err != nil { + return err + } + defer logFile.Close() + cmd.Stdout = logFile + cmd.Stderr = logFile + devNull, _ := os.Open(os.DevNull) + cmd.Stdin = devNull + defer devNull.Close() + + if err := cmd.Start(); err != nil { + return fmt.Errorf("starting Host.app: %w", err) + } + t.Logf("snapshot prep: launched Host.app (pid %d)", cmd.Process.Pid) + + // Wait for SSH to become available via the NAT NIC. + // The VM gets an IP from macOS's vmnet DHCP (typically 192.168.64.x). + ip, err := waitForVMIP(t, "52:cc:cc:cc:ce:01", 60*time.Second) + if err != nil { + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("waiting for VM IP: %w", err) + } + t.Logf("snapshot prep: VM IP is %s, waiting for SSH...", ip) + + sc, err := waitForSSH(ip, 60*time.Second) + if err != nil { + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("waiting for SSH: %w", err) + } + t.Logf("snapshot prep: SSH connected") + + // Compile and install TTA in the macOS VM. + t.Logf("snapshot prep: installing TTA...") + if err := installTTA(t, sc); err != nil { + sc.Close() + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("installing TTA: %w", err) + } + sc.Close() + + // Save VM state by sending SIGINT. + t.Logf("snapshot prep: saving VM state...") + cmd.Process.Signal(os.Interrupt) + done := make(chan error, 1) + go func() { done <- cmd.Wait() }() + select { + case err := <-done: + if err != nil { + // Host.app exits 0 after saving state, non-zero is unexpected. + t.Logf("snapshot prep: Host.app exited with: %v", err) + } + case <-time.After(60 * time.Second): + cmd.Process.Kill() + <-done + return fmt.Errorf("Host.app did not exit after SIGINT") + } + + // Verify the save file was created. + saveFile := filepath.Join(snapDir, "SaveFile.vzvmsave") + if _, err := os.Stat(saveFile); err != nil { + return fmt.Errorf("SaveFile.vzvmsave not found after prep") + } + t.Logf("snapshot prep: done, saved to %s", filepath.Base(snapDir)) + os.Remove(logPath) + return nil +} + +// installTTA compiles TTA for darwin/arm64 and installs it in the macOS VM +// as a LaunchDaemon via SSH/SCP. +func installTTA(t testing.TB, sc *ssh.Client) error { + modRoot, err := findModRoot() + if err != nil { + return err + } + + // Compile TTA for the macOS VM. + tmpDir := t.TempDir() + ttaBin := filepath.Join(tmpDir, "tta") + t.Logf("snapshot prep: compiling TTA for darwin/arm64...") + buildCmd := exec.Command("go", "build", "-o", ttaBin, "./cmd/tta") + buildCmd.Dir = modRoot + buildCmd.Env = append(os.Environ(), "GOOS=darwin", "GOARCH=arm64", "CGO_ENABLED=0") + if out, err := buildCmd.CombinedOutput(); err != nil { + return fmt.Errorf("compiling TTA: %v\n%s", err, out) + } + + // Read the binary. + ttaData, err := os.ReadFile(ttaBin) + if err != nil { + return fmt.Errorf("reading TTA binary: %w", err) + } + t.Logf("snapshot prep: TTA binary is %d bytes", len(ttaData)) + + // SCP the TTA binary to the VM via a temp file (admin user can't write /usr/local/bin directly). + if err := scpFile(sc, ttaData, "/tmp/tta", 0755); err != nil { + return fmt.Errorf("uploading TTA: %w", err) + } + if err := runSSHCmd(sc, "echo admin | sudo -S mv /tmp/tta /usr/local/bin/tta"); err != nil { + return fmt.Errorf("moving TTA to /usr/local/bin: %w", err) + } + + // Install the LaunchDaemon plist. + plist := ` + + + + Label + com.tailscale.tta + ProgramArguments + + /usr/local/bin/tta + + RunAtLoad + + KeepAlive + + StandardOutPath + /tmp/tta.log + StandardErrorPath + /tmp/tta.log + + +` + if err := scpFile(sc, []byte(plist), "/tmp/com.tailscale.tta.plist", 0644); err != nil { + return fmt.Errorf("uploading plist: %w", err) + } + if err := runSSHCmd(sc, "echo admin | sudo -S mv /tmp/com.tailscale.tta.plist /Library/LaunchDaemons/ && echo admin | sudo -S chown root:wheel /Library/LaunchDaemons/com.tailscale.tta.plist"); err != nil { + return fmt.Errorf("installing plist: %w", err) + } + + // Load the LaunchDaemon. + if err := runSSHCmd(sc, "echo admin | sudo -S launchctl load /Library/LaunchDaemons/com.tailscale.tta.plist"); err != nil { + return fmt.Errorf("loading LaunchDaemon: %w", err) + } + + // Wait for TTA to start. + for range 20 { + if err := runSSHCmd(sc, "pgrep -x tta"); err == nil { + break + } + time.Sleep(250 * time.Millisecond) + } + if err := runSSHCmd(sc, "pgrep -x tta"); err != nil { + return fmt.Errorf("TTA not running after install: %w", err) + } + t.Logf("snapshot prep: TTA installed and running") + return nil +} + +// scpFile uploads data to a remote path via SSH/SCP. +func scpFile(sc *ssh.Client, data []byte, remotePath string, mode os.FileMode) error { + sess, err := sc.NewSession() + if err != nil { + return err + } + defer sess.Close() + + // Use a simple shell command to write the file. + cmd := fmt.Sprintf("cat > %s && chmod %o %s", remotePath, mode, remotePath) + sess.Stdin = bytes.NewReader(data) + out, err := sess.CombinedOutput(cmd) + if err != nil { + return fmt.Errorf("%s: %v: %s", cmd, err, out) + } + return nil +} + +// runSSHCmd runs a command on the SSH client and returns an error if it fails. +func runSSHCmd(sc *ssh.Client, cmd string) error { + sess, err := sc.NewSession() + if err != nil { + return err + } + defer sess.Close() + out, err := sess.CombinedOutput(cmd) + if err != nil { + return fmt.Errorf("%s: %v: %s", cmd, err, out) + } + return nil +} + +// waitForVMIP polls /var/db/dhcpd_leases for a DHCP lease matching the +// given MAC address (from macOS's vmnet NAT). Returns the IP. +func waitForVMIP(t testing.TB, mac string, timeout time.Duration) (string, error) { + // Normalize MAC format: vmnet leases use "1,xx:xx:xx:xx:xx:xx" format + // with leading zeros stripped from each octet (e.g. "1,52:cc:cc:cc:ce:1" + // instead of "1,52:cc:cc:cc:ce:01"). + mac = strings.ToLower(mac) + parts := strings.Split(mac, ":") + for i, p := range parts { + parts[i] = strings.TrimLeft(p, "0") + if parts[i] == "" { + parts[i] = "0" + } + } + leaseMAC := "1," + strings.Join(parts, ":") + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + data, err := os.ReadFile("/var/db/dhcpd_leases") + if err == nil { + // Parse the plist-like lease file. + lines := strings.Split(string(data), "\n") + var currentIP string + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "ip_address=") { + currentIP = strings.TrimPrefix(line, "ip_address=") + } + if strings.HasPrefix(line, "hw_address=") { + hw := strings.TrimPrefix(line, "hw_address=") + if strings.ToLower(hw) == leaseMAC && currentIP != "" { + return currentIP, nil + } + } + if line == "}" { + currentIP = "" + } + } + } + time.Sleep(time.Second) + } + return "", fmt.Errorf("no DHCP lease for MAC %s after %v", mac, timeout) +} + +// waitForSSH retries SSH connection to the given IP until it succeeds or +// the timeout expires. +func waitForSSH(ip string, timeout time.Duration) (*ssh.Client, error) { + deadline := time.Now().Add(timeout) + addr := net.JoinHostPort(ip, "22") + cfg := &ssh.ClientConfig{ + User: "admin", + Auth: []ssh.AuthMethod{ssh.Password("admin")}, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), + Timeout: 2 * time.Second, + } + for time.Now().Before(deadline) { + sc, err := ssh.Dial("tcp", addr, cfg) + if err == nil { + return sc, nil + } + time.Sleep(time.Second) + } + return nil, fmt.Errorf("SSH to %s timed out after %v", addr, timeout) +} + // ensureTailMac locates the pre-built tailmac Host.app binary. func (e *Env) ensureTailMac() error { modRoot, err := findModRoot() @@ -85,7 +490,6 @@ func cloneTartToTailmac(tartDir, cloneDir, testID, mac, dgramSock string) error return err } - // Read Tart's config.json for hardware identity. cfgData, err := os.ReadFile(filepath.Join(tartDir, "config.json")) if err != nil { return fmt.Errorf("reading tart config: %w", err) @@ -95,7 +499,6 @@ func cloneTartToTailmac(tartDir, cloneDir, testID, mac, dgramSock string) error return fmt.Errorf("parsing tart config: %w", err) } - // Decode and write HardwareModel. hwModel, err := base64.StdEncoding.DecodeString(tc.HardwareModel) if err != nil { return fmt.Errorf("decoding hardwareModel: %w", err) @@ -104,7 +507,6 @@ func cloneTartToTailmac(tartDir, cloneDir, testID, mac, dgramSock string) error return err } - // Decode and write MachineIdentifier (ECID). ecid, err := base64.StdEncoding.DecodeString(tc.ECID) if err != nil { return fmt.Errorf("decoding ecid: %w", err) @@ -113,22 +515,18 @@ func cloneTartToTailmac(tartDir, cloneDir, testID, mac, dgramSock string) error return err } - // APFS clone the disk image (nearly instant, copy-on-write). if out, err := exec.Command("cp", "-c", filepath.Join(tartDir, "disk.img"), filepath.Join(cloneDir, "Disk.img")).CombinedOutput(); err != nil { - // Fallback to regular copy. if out2, err2 := exec.Command("cp", filepath.Join(tartDir, "disk.img"), filepath.Join(cloneDir, "Disk.img")).CombinedOutput(); err2 != nil { return fmt.Errorf("copying disk: %v: %s (APFS clone: %v: %s)", err2, out2, err, out) } } - // APFS clone the NVRAM. if out, err := exec.Command("cp", "-c", filepath.Join(tartDir, "nvram.bin"), filepath.Join(cloneDir, "AuxiliaryStorage")).CombinedOutput(); err != nil { if out2, err2 := exec.Command("cp", filepath.Join(tartDir, "nvram.bin"), filepath.Join(cloneDir, "AuxiliaryStorage")).CombinedOutput(); err2 != nil { return fmt.Errorf("copying nvram: %v: %s (APFS clone: %v: %s)", err2, out2, err, out) } } - // Write tailmac config.json. tmCfg := struct { VMid string `json:"vmID"` ServerSocket string `json:"serverSocket"` @@ -137,17 +535,17 @@ func cloneTartToTailmac(tartDir, cloneDir, testID, mac, dgramSock string) error }{ VMid: testID, ServerSocket: dgramSock, - MemorySize: 8 * 1024 * 1024 * 1024, + MemorySize: 4 * 1024 * 1024 * 1024, Mac: mac, } tmData, _ := json.MarshalIndent(tmCfg, "", " ") return os.WriteFile(filepath.Join(cloneDir, "config.json"), tmData, 0644) } -// startTailMacVM clones a Tart base image and launches it via tailmac -// Host.app in headless mode, connected to vnet's dgram socket. +// startTailMacVM restores a macOS VM from a cached snapshot and launches it +// via tailmac Host.app in headless mode, connected to vnet's dgram socket. func (e *Env) startTailMacVM(n *Node) error { - tartDir := ensureTartImage(e.t) + snapDir := e.macosSnapshot if err := e.ensureTailMac(); err != nil { return err @@ -156,7 +554,6 @@ func (e *Env) startTailMacVM(n *Node) error { testID := fmt.Sprintf("vmtest-%s-%d", n.name, os.Getpid()) // Host.app expects VM files under ~/.cache/tailscale/vmtest/macos// - // (hardcoded in Config.swift's vmBundleURL). home, err := os.UserHomeDir() if err != nil { return fmt.Errorf("UserHomeDir: %w", err) @@ -165,16 +562,51 @@ func (e *Env) startTailMacVM(n *Node) error { os.MkdirAll(vmBase, 0755) cloneDir := filepath.Join(vmBase, testID) - mac := n.vnetNode.NICMac(0) - e.t.Logf("[%s] cloning Tart image -> %s (mac=%s)", n.name, testID, mac) - if err := cloneTartToTailmac(tartDir, cloneDir, testID, mac.String(), e.dgramSockAddr); err != nil { - return fmt.Errorf("cloning tart VM: %w", err) + // APFS clone the entire snapshot directory (includes SaveFile.vzvmsave). + e.t.Logf("[%s] cloning snapshot -> %s", n.name, testID) + if out, err := exec.Command("cp", "-c", "-r", snapDir, cloneDir).CombinedOutput(); err != nil { + if out2, err2 := exec.Command("cp", "-r", snapDir, cloneDir).CombinedOutput(); err2 != nil { + return fmt.Errorf("cloning snapshot: %v: %s (APFS clone: %v: %s)", err2, out2, err, out) + } } e.t.Cleanup(func() { os.RemoveAll(cloneDir) }) + // Write test-specific config.json with the vnet MAC and dgram socket. + mac := n.vnetNode.NICMac(0) + cfg := struct { + VMid string `json:"vmID"` + ServerSocket string `json:"serverSocket"` + MemorySize uint64 `json:"memorySize"` + Mac string `json:"mac"` + }{ + VMid: testID, + ServerSocket: e.dgramSockAddr, + MemorySize: 8 * 1024 * 1024 * 1024, + Mac: mac.String(), + } + cfgData, _ := json.MarshalIndent(cfg, "", " ") + if err := os.WriteFile(filepath.Join(cloneDir, "config.json"), cfgData, 0644); err != nil { + return fmt.Errorf("writing config.json: %w", err) + } + + // Launch Host.app with disconnected NIC + hot-swap to vnet. + // Host.app will restore from SaveFile.vzvmsave (fast), then + // hot-swap the NIC to the vnet dgram socket. hostBin := filepath.Join(e.tailmacDir, "Host.app", "Contents", "MacOS", "Host") + + // Compute the node's IP and gateway for static assignment via vsock. + nodeIP := n.vnetNode.LanIP(n.nets[0]) + // The gateway is the network's base address (e.g. 192.168.1.1 for /24). + // We derive it from the node IP: same /24 prefix, host part = 1. + gwIP := nodeIP.As4() + gwIP[3] = 1 + gateway := netip.AddrFrom4(gwIP) + args := []string{ "run", "--id", testID, "--headless", + "--disconnected-nic", + "--attach-network", e.dgramSockAddr, + "--assign-ip", fmt.Sprintf("%s/255.255.255.0/%s", nodeIP, gateway), } wantScreenshots := *vmtestWeb != "" @@ -191,8 +623,6 @@ func (e *Env) startTailMacVM(n *Node) error { cmd := exec.Command(hostBin, args...) cmd.Env = append(os.Environ(), "NSUnbufferedIO=YES") - // If screenshots are enabled, we need to parse stdout for the - // SCREENSHOT_PORT= line, while also logging everything to file. var stdoutPipe io.ReadCloser if wantScreenshots { stdoutPipe, err = cmd.StdoutPipe() @@ -219,14 +649,13 @@ func (e *Env) startTailMacVM(n *Node) error { } e.t.Logf("[%s] launched tailmac (pid %d), log: %s", n.name, cmd.Process.Pid, logPath) - // Parse screenshot port from stdout and start polling goroutine. if wantScreenshots { screenshotPortCh := make(chan int, 1) go func() { scanner := bufio.NewScanner(stdoutPipe) for scanner.Scan() { line := scanner.Text() - fmt.Fprintln(logFile, line) // tee to log file + fmt.Fprintln(logFile, line) if port := 0; strings.HasPrefix(line, "SCREENSHOT_PORT=") { fmt.Sscanf(line, "SCREENSHOT_PORT=%d", &port) if port > 0 { @@ -252,15 +681,9 @@ func (e *Env) startTailMacVM(n *Node) error { clientSock := fmt.Sprintf("/tmp/qemu-dgram-%s.sock", testID) e.t.Cleanup(func() { - cmd.Process.Signal(os.Interrupt) - done := make(chan error, 1) - go func() { done <- cmd.Wait() }() - select { - case <-done: - case <-time.After(15 * time.Second): - cmd.Process.Kill() - <-done - } + // Kill immediately — no need to save state for ephemeral test clones. + cmd.Process.Kill() + cmd.Wait() devNull.Close() logFile.Close() os.Remove(clientSock) diff --git a/tstest/natlab/vmtest/vmtest.go b/tstest/natlab/vmtest/vmtest.go index 3eb42c2de..20f20cedf 100644 --- a/tstest/natlab/vmtest/vmtest.go +++ b/tstest/natlab/vmtest/vmtest.go @@ -78,16 +78,28 @@ type Env struct { gokrazyKernel string // path to gokrazy kernel // tailmac-specific paths (macOS VMs) - tailmacDir string // path to tailmac bin/ directory containing Host.app + tailmacDir string // path to tailmac bin/ directory containing Host.app + macosSnapshot string // path to cached macOS VM snapshot directory + macosSnapshotOnce sync.Once qemuProcs []*exec.Cmd // launched QEMU processes sameTailnetUser bool // all nodes register as the same Tailnet user + // Shared resource initialization (sync.Once for things multiple nodes share). + vnetOnce sync.Once + gokrazyOnce sync.Once + qemuSockOnce sync.Once + dgramSockOnce sync.Once + compileMu sync.Mutex + compiled set.Set[string] + // Web UI support. ctx context.Context // cancelled when test ends eventBus *EventBus testStatus *TestStatus + stepsMu sync.Mutex + stepsByKey map[string]*Step steps []*Step nodeStatusMu sync.Mutex @@ -102,6 +114,28 @@ func (e *Env) logVerbosef(format string, args ...any) { } } +// vmPlatform defines how a VM type boots. Each OS image type (gokrazy, +// cloud, macOS) implements this interface. +type vmPlatform interface { + // planSteps registers steps with the web UI in a dry-run pass. + planSteps(e *Env, n *Node) + + // boot does everything needed to get this node running: ensure images, + // compile binaries, set up sockets, launch VM. Called concurrently. + boot(ctx context.Context, e *Env, n *Node) error +} + +// platform returns the vmPlatform for this node's OS type. +func (n *Node) platform() vmPlatform { + if n.os.IsMacOS { + return macPlatform{} + } + if n.os.IsGokrazy { + return gokrazyPlatform{} + } + return qemuCloudPlatform{} +} + // AddStep declares an expected stage of the test. The web UI shows all steps // from the start, tracking their progress. Call before or during the test. // Returns a *Step whose Begin/End methods drive the progress display. @@ -115,6 +149,28 @@ func (e *Env) AddStep(name string) *Step { return s } +// Step returns a step by key, creating it if it doesn't exist. +// Safe for concurrent use. Both planSteps (dry-run) and boot (real-run) +// call this to get the same Step object. +func (e *Env) Step(key string) *Step { + e.stepsMu.Lock() + defer e.stepsMu.Unlock() + if s, ok := e.stepsByKey[key]; ok { + return s + } + s := &Step{ + name: key, + index: len(e.steps), + env: e, + } + e.steps = append(e.steps, s) + if e.stepsByKey == nil { + e.stepsByKey = make(map[string]*Step) + } + e.stepsByKey[key] = s + return s +} + // Steps returns all declared steps in order. func (e *Env) Steps() []*Step { return e.steps @@ -397,16 +453,14 @@ func SNATSubnetRoutes(v bool) nodeOptSNATSubnetRoutes { return nodeOptSNATSubnet // The webserver responds with "Hello world I am from " on all requests. func WebServer(port int) nodeOptWebServer { return nodeOptWebServer(port) } -// Start initializes the virtual network, builds/downloads images, compiles -// binaries, launches QEMU processes, and waits for all TTA agents to connect. -// It should be called after all AddNetwork/AddNode calls. +// Start initializes the virtual network, boots all VMs in parallel, and waits +// for all TTA agents to connect. It should be called after all AddNetwork/AddNode calls. func (e *Env) Start() { t := e.t ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) t.Cleanup(cancel) e.ctx = ctx - // Initialize node status and start web UI as early as possible. e.initNodeStatus() e.maybeStartWebServer() @@ -414,8 +468,6 @@ func (e *Env) Start() { t.Fatal(err) } - // Resolve --test-version up front (e.g. "unstable" -> "1.97.255") so all - // platforms see the same concrete version. if *testVersion != "" { v, err := resolveTestVersion(ctx, *testVersion) if err != nil { @@ -425,267 +477,42 @@ func (e *Env) Start() { t.Logf("using Tailscale release version %s (from --test-version=%q)", v, *testVersion) } - // Check if any macOS nodes are present; if so, verify prerequisites. - hasMacOS := false for _, n := range e.nodes { - if n.os.IsMacOS { - hasMacOS = true - break - } - } - if hasMacOS { - if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { + if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") { t.Skip("macOS VM tests require macOS arm64 host") } } - // Determine which GOOS/GOARCH pairs need compiled binaries (non-gokrazy, - // non-macOS images). Gokrazy has binaries built-in. macOS VMs don't use - // compiled binaries (no TTA agent). - type platform struct{ goos, goarch string } - needPlatform := set.Set[platform]{} - for _, n := range e.nodes { - if !n.os.IsGokrazy && !n.os.IsMacOS { - needPlatform.Add(platform{n.os.GOOS(), n.os.GOARCH()}) - } - } - - // Declare framework steps for the web UI. - // User-declared steps (from AddStep before Start) get moved to the end - // so framework steps (compile, image, QEMU, etc.) come first. + // Dry-run: let each platform register its steps with the web UI. userSteps := e.steps e.steps = nil - - compileSteps := map[platform]*Step{} - for _, p := range needPlatform.Slice() { - compileSteps[p] = e.AddStep(fmt.Sprintf("Compile %s_%s binaries", p.goos, p.goarch)) - } - imageSteps := map[string]*Step{} // keyed by OS name - didOS := set.Set[string]{} // dedup by image name for _, n := range e.nodes { - if didOS.Contains(n.os.Name) { - continue - } - didOS.Add(n.os.Name) - if n.os.IsMacOS { - imageSteps[n.os.Name] = e.AddStep("Prepare macOS Tart image") - } else if n.os.IsGokrazy { - imageSteps["gokrazy"] = e.AddStep("Build gokrazy image") - } else { - imageSteps[n.os.Name] = e.AddStep(fmt.Sprintf("Prepare %s image", n.os.Name)) - } + n.platform().planSteps(e, n) } - vnetStep := e.AddStep("Create virtual network") - - vmSteps := map[string]*Step{} - agentSteps := map[string]*Step{} - tsUpSteps := map[string]*Step{} for _, n := range e.nodes { - if n.os.IsMacOS { - vmSteps[n.name] = e.AddStep(fmt.Sprintf("Launch macOS VM: %s", n.name)) - } else { - vmSteps[n.name] = e.AddStep(fmt.Sprintf("Launch QEMU: %s", n.name)) - } if !n.noAgent { - agentSteps[n.name] = e.AddStep(fmt.Sprintf("Wait for agent: %s", n.name)) + e.Step("Wait for agent: " + n.name) } if n.joinTailnet { - tsUpSteps[n.name] = e.AddStep(fmt.Sprintf("Tailscale up: %s", n.name)) + e.Step("Tailscale up: " + n.name) } } - - // Re-append user-declared steps after all framework steps. for _, s := range userSteps { s.index = len(e.steps) e.steps = append(e.steps, s) } - // Compile binaries and download/build images in parallel. - // Any failure cancels the others via the errgroup context. - eg, egCtx := errgroup.WithContext(ctx) - for _, p := range needPlatform.Slice() { - step := compileSteps[p] - eg.Go(func() error { - step.Begin() - err := e.compileBinariesForOS(egCtx, p.goos, p.goarch) - if err != nil { - step.End(err) - return err - } - step.End(nil) - return nil + // Boot all nodes in parallel. Each platform handles its own + // dependencies (image prep, binary compilation, socket setup) + // via sync.Once, so independent work overlaps naturally. + var bootEg errgroup.Group + for _, n := range e.nodes { + bootEg.Go(func() error { + return n.platform().boot(ctx, e, n) }) } - didOS = set.Set[string]{} // reset for second pass - for _, n := range e.nodes { - if didOS.Contains(n.os.Name) { - continue - } - didOS.Add(n.os.Name) - if n.os.IsMacOS { - step := imageSteps[n.os.Name] - eg.Go(func() error { - step.Begin() - ensureTartImage(t) - step.End(nil) - return nil - }) - } else if n.os.IsGokrazy { - step := imageSteps["gokrazy"] - eg.Go(func() error { - step.Begin() - err := e.ensureGokrazy(egCtx) - if err != nil { - step.End(err) - return err - } - step.End(nil) - return nil - }) - } else { - step := imageSteps[n.os.Name] - osImg := n.os - eg.Go(func() error { - step.Begin() - err := ensureImage(egCtx, osImg) - if err != nil { - step.End(err) - return err - } - step.End(nil) - return nil - }) - } - } - if err := eg.Wait(); err != nil { - t.Fatalf("setup: %v", err) - } - - // Create the vnet server. - vnetStep.Begin() - var err error - e.server, err = vnet.New(&e.cfg) - if err != nil { - t.Fatalf("vnet.New: %v", err) - } - t.Cleanup(func() { e.server.Close() }) - - // Register DHCP event callback for the web UI. - e.server.SetDHCPCallback(func(mac vnet.MAC, nodeNum int, msgType layers.DHCPMsgType, ip netip.Addr) { - name := e.nodeNameByNum(nodeNum) - nicIdx := e.nicIndexForMAC(name, mac) - ipStr := ip.String() - switch msgType { - case layers.DHCPMsgTypeDiscover: - e.setNodeDHCP(name, nicIdx, "Discover sent") - e.eventBus.Publish(VMEvent{ - NodeName: name, - Type: EventDHCPDiscover, - Message: "DHCP Discover sent", - NIC: nicIdx, - }) - case layers.DHCPMsgTypeOffer: - e.setNodeDHCP(name, nicIdx, "Offered "+ipStr) - e.eventBus.Publish(VMEvent{ - NodeName: name, - Type: EventDHCPOffer, - Message: "DHCP Offer received", - Detail: ipStr, - NIC: nicIdx, - }) - case layers.DHCPMsgTypeRequest: - e.setNodeDHCP(name, nicIdx, "Requesting "+ipStr) - e.eventBus.Publish(VMEvent{ - NodeName: name, - Type: EventDHCPRequest, - Message: "DHCP Request sent", - Detail: ipStr, - NIC: nicIdx, - }) - case layers.DHCPMsgTypeAck: - e.setNodeDHCP(name, nicIdx, "Got "+ipStr) - e.eventBus.Publish(VMEvent{ - NodeName: name, - Type: EventDHCPAck, - Message: "DHCP Ack: got " + ipStr, - Detail: ipStr, - NIC: nicIdx, - }) - } - }) - - if e.sameTailnetUser { - e.server.ControlServer().AllNodesSameUser = true - } - - // Register compiled binaries with the file server VIP. - // Binaries are registered at _/ (e.g. "linux_amd64/tta"). - for _, p := range needPlatform.Slice() { - dir := p.goos + "_" + p.goarch - for _, name := range []string{"tta", "tailscale", "tailscaled"} { - data, err := os.ReadFile(filepath.Join(e.binDir, dir, name)) - if err != nil { - t.Fatalf("reading compiled %s/%s: %v", dir, name, err) - } - e.server.RegisterFile(dir+"/"+name, data) - } - } - vnetStep.End(nil) - - // Cloud-init config is delivered via local seed ISOs (created in startCloudQEMU), - // not via the cloud-init HTTP VIP, because network-config must be available - // during init-local before systemd-networkd-wait-online blocks. - - // Start Unix stream socket listener (for QEMU VMs). - e.sockAddr = filepath.Join(e.tempDir, "vnet.sock") - srv, err := net.Listen("unix", e.sockAddr) - if err != nil { - t.Fatalf("listen unix: %v", err) - } - t.Cleanup(func() { srv.Close() }) - - go func() { - for { - c, err := srv.Accept() - if err != nil { - return - } - go e.server.ServeUnixConn(c.(*net.UnixConn), vnet.ProtocolQEMU) - } - }() - - // Start Unix dgram socket listener (for macOS VMs via tailmac). - // Use /tmp/ instead of the test temp dir because Unix socket paths - // are limited to 104 bytes on macOS, and test temp dir paths are long. - if hasMacOS { - e.dgramSockAddr = fmt.Sprintf("/tmp/vmtest-dgram-%d.sock", os.Getpid()) - t.Cleanup(func() { os.Remove(e.dgramSockAddr) }) - dgramAddr, err := net.ResolveUnixAddr("unixgram", e.dgramSockAddr) - if err != nil { - t.Fatalf("resolve dgram addr: %v", err) - } - uc, err := net.ListenUnixgram("unixgram", dgramAddr) - if err != nil { - t.Fatalf("listen unixgram: %v", err) - } - t.Cleanup(func() { uc.Close() }) - go e.server.ServeUnixConn(uc, vnet.ProtocolUnixDGRAM) - } - - // Launch VM processes. - for _, n := range e.nodes { - step := vmSteps[n.name] - step.Begin() - if n.os.IsMacOS { - if err := e.startTailMacVM(n); err != nil { - t.Fatalf("startTailMacVM(%s): %v", n.name, err) - } - } else { - if err := e.startQEMU(n); err != nil { - t.Fatalf("startQEMU(%s): %v", n.name, err) - } - } - step.End(nil) + if err := bootEg.Wait(); err != nil { + t.Fatalf("boot: %v", err) } // Set up agent clients and wait for all agents to connect. @@ -693,25 +520,32 @@ func (e *Env) Start() { if n.noAgent { continue } + e.initVnet() // ensure vnet is ready for agent clients n.agent = e.server.NodeAgentClient(n.vnetNode) n.vnetNode.SetClient(n.agent) } - // Wait for agents, then bring up tailscale. var agentEg errgroup.Group for _, n := range e.nodes { if n.noAgent { continue } agentEg.Go(func() error { - aStep := agentSteps[n.name] + aStep := e.Step("Wait for agent: " + n.name) aStep.Begin() t.Logf("[%s] waiting for agent...", n.name) - st, err := n.agent.Status(ctx) - if err != nil { - return fmt.Errorf("[%s] agent status: %w", n.name, err) + if n.joinTailnet { + st, err := n.agent.Status(ctx) + if err != nil { + return fmt.Errorf("[%s] agent status: %w", n.name, err) + } + t.Logf("[%s] agent connected, backend state: %s", n.name, st.BackendState) + } else { + if err := e.waitForAgentConn(ctx, n); err != nil { + return fmt.Errorf("[%s] agent connect: %w", n.name, err) + } + t.Logf("[%s] agent connected (no tailscale)", n.name) } - t.Logf("[%s] agent connected, backend state: %s", n.name, st.BackendState) aStep.End(nil) if n.vnetNode.HostFirewall() { @@ -721,21 +555,21 @@ func (e *Env) Start() { } if n.joinTailnet { - tsStep := tsUpSteps[n.name] + tsStep := e.Step("Tailscale up: " + n.name) tsStep.Begin() if err := e.tailscaleUp(ctx, n); err != nil { return fmt.Errorf("[%s] tailscale up: %w", n.name, err) } - st, err = n.agent.Status(ctx) + st2, err := n.agent.Status(ctx) if err != nil { return fmt.Errorf("[%s] status after up: %w", n.name, err) } - if st.BackendState != "Running" { - return fmt.Errorf("[%s] state = %q, want Running", n.name, st.BackendState) + if st2.BackendState != "Running" { + return fmt.Errorf("[%s] state = %q, want Running", n.name, st2.BackendState) } - ips := fmt.Sprintf("%v", st.Self.TailscaleIPs) + ips := fmt.Sprintf("%v", st2.Self.TailscaleIPs) e.setNodeTailscale(n.name, "Running "+ips) - t.Logf("[%s] up with %v", n.name, st.Self.TailscaleIPs) + t.Logf("[%s] up with %v", n.name, st2.Self.TailscaleIPs) tsStep.End(nil) } @@ -1226,6 +1060,147 @@ func (e *Env) nodeScreenshotPort(name string) int { return 0 } +// initVnet creates the vnet server. Called once via sync.Once. +func (e *Env) initVnet() { + e.vnetOnce.Do(func() { + var err error + e.server, err = vnet.New(&e.cfg) + if err != nil { + e.t.Fatalf("vnet.New: %v", err) + } + e.t.Cleanup(func() { e.server.Close() }) + + e.server.SetDHCPCallback(func(mac vnet.MAC, nodeNum int, msgType layers.DHCPMsgType, ip netip.Addr) { + name := e.nodeNameByNum(nodeNum) + nicIdx := e.nicIndexForMAC(name, mac) + ipStr := ip.String() + switch msgType { + case layers.DHCPMsgTypeDiscover: + e.setNodeDHCP(name, nicIdx, "Discover sent") + e.eventBus.Publish(VMEvent{NodeName: name, Type: EventDHCPDiscover, Message: "DHCP Discover sent", NIC: nicIdx}) + case layers.DHCPMsgTypeOffer: + e.setNodeDHCP(name, nicIdx, "Offered "+ipStr) + e.eventBus.Publish(VMEvent{NodeName: name, Type: EventDHCPOffer, Message: "DHCP Offer received", Detail: ipStr, NIC: nicIdx}) + case layers.DHCPMsgTypeRequest: + e.setNodeDHCP(name, nicIdx, "Requesting "+ipStr) + e.eventBus.Publish(VMEvent{NodeName: name, Type: EventDHCPRequest, Message: "DHCP Request sent", Detail: ipStr, NIC: nicIdx}) + case layers.DHCPMsgTypeAck: + e.setNodeDHCP(name, nicIdx, "Got "+ipStr) + e.eventBus.Publish(VMEvent{NodeName: name, Type: EventDHCPAck, Message: "DHCP Ack: got " + ipStr, Detail: ipStr, NIC: nicIdx}) + } + }) + + if e.sameTailnetUser { + e.server.ControlServer().AllNodesSameUser = true + } + }) +} + +// ensureQEMUSocket creates the Unix stream socket for QEMU VMs. Called once. +func (e *Env) ensureQEMUSocket() { + e.qemuSockOnce.Do(func() { + e.initVnet() + e.sockAddr = filepath.Join(e.tempDir, "vnet.sock") + srv, err := net.Listen("unix", e.sockAddr) + if err != nil { + e.t.Fatalf("listen unix: %v", err) + } + e.t.Cleanup(func() { srv.Close() }) + go func() { + for { + c, err := srv.Accept() + if err != nil { + return + } + go e.server.ServeUnixConn(c.(*net.UnixConn), vnet.ProtocolQEMU) + } + }() + }) +} + +// ensureDgramSocket creates the Unix dgram socket for macOS VMs. Called once. +func (e *Env) ensureDgramSocket() { + e.dgramSockOnce.Do(func() { + e.initVnet() + e.dgramSockAddr = fmt.Sprintf("/tmp/vmtest-dgram-%d.sock", os.Getpid()) + e.t.Cleanup(func() { os.Remove(e.dgramSockAddr) }) + dgramAddr, err := net.ResolveUnixAddr("unixgram", e.dgramSockAddr) + if err != nil { + e.t.Fatalf("resolve dgram addr: %v", err) + } + uc, err := net.ListenUnixgram("unixgram", dgramAddr) + if err != nil { + e.t.Fatalf("listen unixgram: %v", err) + } + e.t.Cleanup(func() { uc.Close() }) + go e.server.ServeUnixConn(uc, vnet.ProtocolUnixDGRAM) + }) +} + +// ensureCompiled compiles binaries for the given platform and registers them +// with the vnet file server. Safe for concurrent use; only compiles once per platform. +func (e *Env) ensureCompiled(ctx context.Context, goos, goarch string) { + key := goos + "_" + goarch + + e.compileMu.Lock() + if e.compiled.Contains(key) { + e.compileMu.Unlock() + return + } + e.compileMu.Unlock() + + step := e.Step(fmt.Sprintf("Compile %s_%s binaries", goos, goarch)) + step.Begin() + if err := e.compileBinariesForOS(ctx, goos, goarch); err != nil { + step.End(err) + e.t.Fatalf("compileBinariesForOS(%s, %s): %v", goos, goarch, err) + } + step.End(nil) + e.registerBinaries(goos, goarch) + + e.compileMu.Lock() + e.compiled.Make() + e.compiled.Add(key) + e.compileMu.Unlock() +} + +// registerBinaries registers compiled binaries with the vnet file server. +// Safe for concurrent use. +func (e *Env) registerBinaries(goos, goarch string) { + e.initVnet() + dir := goos + "_" + goarch + for _, name := range []string{"tta", "tailscale", "tailscaled"} { + data, err := os.ReadFile(filepath.Join(e.binDir, dir, name)) + if err != nil { + e.t.Fatalf("reading compiled %s/%s: %v", dir, name, err) + } + e.server.RegisterFile(dir+"/"+name, data) + } +} + +// waitForAgentConn waits for a TTA agent to connect by issuing a simple +// HTTP GET to the root endpoint, without requiring tailscaled. +func (e *Env) waitForAgentConn(ctx context.Context, n *Node) error { + for { + reqCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + req, err := http.NewRequestWithContext(reqCtx, "GET", "http://unused/", nil) + if err != nil { + cancel() + return err + } + res, err := n.agent.HTTPClient.Do(req) + cancel() + if err == nil { + res.Body.Close() + return nil + } + if ctx.Err() != nil { + return ctx.Err() + } + time.Sleep(500 * time.Millisecond) + } +} + // Agent returns the node's TTA agent client, or nil if NoAgent is set. func (n *Node) Agent() *vnet.NodeAgentClient { return n.agent diff --git a/tstest/natlab/vmtest/vmtest_test.go b/tstest/natlab/vmtest/vmtest_test.go index e1c343977..260bc2fd4 100644 --- a/tstest/natlab/vmtest/vmtest_test.go +++ b/tstest/natlab/vmtest/vmtest_test.go @@ -26,16 +26,32 @@ func TestMacOSAndLinuxCanPing(t *testing.T) { vmtest.DontJoinTailnet()) macos := env.AddNode("macos", lan, vmtest.OS(vmtest.MacOS), - vmtest.DontJoinTailnet(), - vmtest.NoAgent()) + vmtest.DontJoinTailnet()) env.Start() - // Ping from Linux (which has TTA) to macOS (which just responds to ICMP). - // LANPing retries until the macOS VM has booted and acquired a DHCP lease. env.LANPing(linux, macos.LanIP(lan)) } +func TestTwoMacOSVMsCanPing(t *testing.T) { + env := vmtest.New(t) + + lan := env.AddNetwork("192.168.1.1/24") + + mac1 := env.AddNode("mac1", lan, + vmtest.OS(vmtest.MacOS), + vmtest.DontJoinTailnet()) + mac2 := env.AddNode("mac2", lan, + vmtest.OS(vmtest.MacOS), + vmtest.DontJoinTailnet()) + + env.Start() + + // Both macOS VMs have TTA. Ping from mac1 to mac2 and vice versa. + env.LANPing(mac1, mac2.LanIP(lan)) + env.LANPing(mac2, mac1.LanIP(lan)) +} + func TestSubnetRouter(t *testing.T) { testSubnetRouterForOS(t, vmtest.Ubuntu2404) } diff --git a/tstest/natlab/vnet/vip.go b/tstest/natlab/vnet/vip.go index a6973ed50..07b64f54c 100644 --- a/tstest/natlab/vnet/vip.go +++ b/tstest/natlab/vnet/vip.go @@ -33,6 +33,13 @@ func (v virtualIP) Match(a netip.Addr) bool { return v.v4 == a.Unmap() || v.v6 == a } +// TestDriverIPv4 returns the IPv4 address of the test driver VIP (52.52.0.2). +// TTA agents dial this IP on port TestDriverPort to connect to the test harness. +func TestDriverIPv4() netip.Addr { return fakeTestAgent.v4 } + +// TestDriverPort is the port the test driver listens on. +const TestDriverPort = 8008 + // FakeDNSIPv4 returns the fake DNS IPv4 address. func FakeDNSIPv4() netip.Addr { return fakeDNS.v4 } diff --git a/tstest/natlab/vnet/vnet.go b/tstest/natlab/vnet/vnet.go index 4836eea05..1c28c2c5d 100644 --- a/tstest/natlab/vnet/vnet.go +++ b/tstest/natlab/vnet/vnet.go @@ -352,7 +352,7 @@ func (n *network) acceptTCP(r *tcp.ForwarderRequest) { return } - if destPort == 8008 && fakeTestAgent.Match(destIP) { + if destPort == TestDriverPort && fakeTestAgent.Match(destIP) { node, ok := n.nodeByIP(clientRemoteIP) if !ok { n.logf("unknown client IP %v trying to connect to test driver", clientRemoteIP) @@ -2106,7 +2106,7 @@ func (s *Server) shouldInterceptTCP(pkt gopacket.Packet) bool { return true } } - if tcp.DstPort == 8008 && fakeTestAgent.Match(flow.dst) { + if tcp.DstPort == TestDriverPort && fakeTestAgent.Match(flow.dst) { // Connection from cmd/tta. return true } diff --git a/tstest/tailmac/Swift/Host/HostCli.swift b/tstest/tailmac/Swift/Host/HostCli.swift index 578722520..16711b2aa 100644 --- a/tstest/tailmac/Swift/Host/HostCli.swift +++ b/tstest/tailmac/Swift/Host/HostCli.swift @@ -22,8 +22,10 @@ extension HostCli { @Option var share: String? @Flag(help: "Run without GUI (for automated testing)") var headless: Bool = false @Flag(help: "Create NIC with no attachment (for later hot-swap)") var disconnectedNic: Bool = false + @Flag(help: "Use NAT NIC instead of socket NIC (for snapshot prep)") var natNic: Bool = false @Option(help: "Hot-swap NIC to this dgram socket path after boot/restore") var attachNetwork: String? @Option(help: "Serve screenshots on this localhost port (0 = auto)") var screenshotPort: Int? + @Option(help: "Assign IP/mask/gw to guest via vsock (e.g. 192.168.1.2/255.255.255.0/192.168.1.1)") var assignIp: String? mutating func run() { config = Config(id) @@ -32,19 +34,38 @@ extension HostCli { if headless { let attachSocket = attachNetwork - let disconnected = disconnectedNic || attachSocket != nil + let useNatNIC = natNic + let disconnected = !useNatNIC && (disconnectedNic || attachSocket != nil) let wantScreenshots = screenshotPort != nil let requestedPort = UInt16(screenshotPort ?? 0) + let ipConfig = assignIp + + // Set up SIGINT handler before entering the event loop. + // The dispatch source must be stored in a global to prevent ARC deallocation. + signal(SIGINT, SIG_IGN) + let sigintSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: .main) + retainedSigintSource = sigintSource DispatchQueue.main.async { let controller = VMController() - controller.createVirtualMachine(headless: true, disconnectedNIC: disconnected) + controller.createVirtualMachine(headless: true, disconnectedNIC: disconnected, natNIC: useNatNIC) + + // Start vsock listener for IP assignment. + // If --assign-ip is set, the listener replies with the IP config JSON. + // If not set (snapshot prep), it replies "wait" so TTA keeps polling. + if let ipCfg = ipConfig { + let parts = ipCfg.split(separator: "/") + if parts.count == 3 { + let response = "{\"ip\":\"\(parts[0])\",\"mask\":\"\(parts[1])\",\"gw\":\"\(parts[2])\"}" + controller.startIPConfigListener(response: response) + } + } else { + controller.startIPConfigListener(response: "wait") + } - // Handle SIGINT (from test cleanup) by saving VM state before exit. - let sigintSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: .main) - signal(SIGINT, SIG_IGN) // Let DispatchSource handle it sigintSource.setEventHandler { - print("SIGINT received, saving VM state...") + print("SIGINT received, disconnecting NIC and saving VM state...") + controller.disconnectNetwork() controller.pauseAndSaveVirtualMachine { print("VM state saved, exiting.") Foundation.exit(0) @@ -79,11 +100,7 @@ extension HostCli { let doAttach = { if let sock = attachSocket { - // Give macOS a moment to settle after boot/restore, - // then hot-swap the NIC attachment. - DispatchQueue.main.asyncAfter(deadline: .now() + 1.0) { - controller.attachNetwork(serverSocket: sock, clientID: config.vmID) - } + controller.attachNetwork(serverSocket: sock, clientID: config.vmID) } } @@ -107,7 +124,9 @@ extension HostCli { fflush(stdout) app.run() } else { - RunLoop.main.run() + // Use dispatchMain() instead of RunLoop.main.run() so that + // GCD dispatch sources (like the SIGINT handler) are processed. + dispatchMain() } } else { _ = NSApplicationMain(CommandLine.argc, CommandLine.unsafeArgv) @@ -119,6 +138,7 @@ extension HostCli { // startScreenshotServer starts a localhost HTTP server that serves VM display // screenshots on GET /screenshot as JPEG. The port is printed to stdout as // "SCREENSHOT_PORT=" so the Go test harness can discover it. +var retainedSigintSource: DispatchSourceSignal? // prevent ARC deallocation var screenshotServer: ScreenshotHTTPServer? // prevent GC func startScreenshotServer(view: NSView, port: UInt16) { diff --git a/tstest/tailmac/Swift/Host/VMController.swift b/tstest/tailmac/Swift/Host/VMController.swift index 5dcd04411..c2014009a 100644 --- a/tstest/tailmac/Swift/Host/VMController.swift +++ b/tstest/tailmac/Swift/Host/VMController.swift @@ -81,7 +81,7 @@ class VMController: NSObject, VZVirtualMachineDelegate { return macPlatform } - func createVirtualMachine(headless: Bool = false, disconnectedNIC: Bool = false) { + func createVirtualMachine(headless: Bool = false, disconnectedNIC: Bool = false, natNIC: Bool = false) { let virtualMachineConfiguration = VZVirtualMachineConfiguration() virtualMachineConfiguration.platform = createMacPlaform() @@ -91,7 +91,10 @@ class VMController: NSObject, VZVirtualMachineDelegate { virtualMachineConfiguration.graphicsDevices = [helper.createGraphicsDeviceConfiguration()] virtualMachineConfiguration.storageDevices = [helper.createBlockDeviceConfiguration()] if headless { - if disconnectedNIC { + if natNIC { + // NAT NIC for SSH access during snapshot preparation. + virtualMachineConfiguration.networkDevices = [helper.createNetworkDeviceConfiguration()] + } else if disconnectedNIC { // Create a NIC with no attachment. The NIC exists in the hardware // config (so saved state is compatible) but appears disconnected. // Call attachNetwork() after restore to hot-swap the attachment. @@ -120,6 +123,17 @@ class VMController: NSObject, VZVirtualMachineDelegate { virtualMachine.delegate = self } + /// Disconnect the NIC by setting its attachment to nil. + /// Call before saving state so the snapshot has no active link. + func disconnectNetwork() { + guard let nic = virtualMachine.networkDevices.first else { + print("disconnectNetwork: no network devices") + return + } + nic.attachment = nil + print("disconnectNetwork: NIC attachment set to nil") + } + /// Hot-swap the NIC attachment on a running VM. The VM must have been /// created with disconnectedNIC=true. After calling this, the guest /// sees the link come up and does DHCP. @@ -157,6 +171,21 @@ class VMController: NSObject, VZVirtualMachineDelegate { } } + /// Start a vsock listener that tells the guest TTA agent what IP to configure. + /// If response is nil, the listener replies "wait" (snapshot prep mode). + func startIPConfigListener(response: String) { + guard let device = virtualMachine.socketDevices.first as? VZVirtioSocketDevice else { + print("startIPConfigListener: no socket device") + return + } + let listener = IPConfigListener(response: response) + retainedIPConfigListener = listener + let vsockListener = VZVirtioSocketListener() + vsockListener.delegate = listener + device.setSocketListener(vsockListener, forPort: 51011) + print("startIPConfigListener: listening on vsock port 51011") + } + func resumeVirtualMachine() { virtualMachine.resume(completionHandler: { (result) in if case let .failure(error) = result { @@ -211,3 +240,28 @@ class VMController: NSObject, VZVirtualMachineDelegate { exit(0) } } + +// Global to prevent ARC deallocation of the vsock listener. +var retainedIPConfigListener: IPConfigListener? + +/// Listens on vsock port 51011 for TTA connections and replies with +/// an IP configuration JSON string (or "wait" during snapshot prep). +class IPConfigListener: NSObject, VZVirtioSocketListenerDelegate { + let response: String + + init(response: String) { + self.response = response + } + + func listener(_ listener: VZVirtioSocketListener, + shouldAcceptNewConnection connection: VZVirtioSocketConnection, + from socketDevice: VZVirtioSocketDevice) -> Bool { + let fd = connection.fileDescriptor + let data = Array((response + "\n").utf8) + data.withUnsafeBufferPointer { buf in + _ = write(fd, buf.baseAddress!, buf.count) + } + connection.close() + return true + } +}