ssh/tailssh: harden macOS exit-status integration tests

Make TestIntegrationExitCodes and TestOpenSSHExitCodes diagnose their
own setup and tolerate transport-level noise without masking the bug
they assert.

Pre-flight in TestMain: log GOOS, GOARCH, euid, hostname,
TAILSCALED_PATH, the test user's uid/gid/home/login shell, ssh -V,
and pre-create host keys. Each invariant is fail-fast with a clear
message instead of leaving the failure to surface as a sub-second
test crash with no log output, which is exactly what was happening
on the macos-latest GitHub runner.

Retries: both tests now retry up to 3 times on transport-level
failures (dial errors, ssh exit code 255, non-*ssh.ExitError errors)
with linear backoff. An exit-code mismatch — the actual behavior the
fix from #18256 is asserting — never retries and fails loudly. This
keeps stability without hiding regressions.

OpenSSH client args: ConnectTimeout bumped from 5s to 15s, plus
IdentityAgent=none and PreferredAuthentications=none so the auth
path is pinned across OpenSSH versions on macOS instead of letting
the bundled LibreSSL fork pick a different fallback.

Per-test diagnostics: dumpIncubatorLogOnFail prints
/tmp/tailscalessh.log (where the incubator subprocess writes) into
the test output when a subtest fails. Previously the log was only
printed at end-of-binary, after all subtests, which made
attributing output to a specific failure painful.

dialTestClientForUser is a non-fatal variant of testClientForUser
that returns the dial error so the retry loop can act on it.

Verified locally on Linux: 20 consecutive runs of both tests with
the full incubator code path (tailscaled be-child ssh re-exec, drop
privileges, login shell exec) and 5 consecutive runs under -race.
No flakes; exit codes 0, 42, and 127 all propagate correctly.

Updates #18256

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2026-05-04 15:06:24 +00:00
parent afad3d4e76
commit dc3bf9bade

View File

@ -21,6 +21,7 @@ import (
"net/netip"
"os"
"os/exec"
"os/user"
"path/filepath"
"runtime"
"strings"
@ -73,6 +74,11 @@ func TestMain(m *testing.M) {
log.Fatal(err)
}
// Pre-flight diagnostics. Any failure here is a setup bug; fail loud
// instead of letting subtests fail with cryptic <1s errors. Every line
// is also useful debug context when chasing flakes from CI artifacts.
logPreflight()
code := m.Run()
os.RemoveAll(testVarRoot)
@ -85,6 +91,73 @@ func TestMain(m *testing.M) {
os.Exit(code)
}
// logPreflight prints environment information that's useful for diagnosing
// CI failures. It also fails fast with a descriptive message when a required
// invariant (test user exists, has a usable shell, host keys can be created,
// tailscaled binary is present) is violated, instead of leaving the failure
// to surface as a sub-second test crash with no log output.
//
// This runs unconditionally on every test binary invocation and must remain
// cheap and side-effect-free aside from creating the host-key directory.
func logPreflight() {
log.Printf("preflight: GOOS=%s GOARCH=%s euid=%d", runtime.GOOS, runtime.GOARCH, os.Geteuid())
if hn, err := os.Hostname(); err == nil {
log.Printf("preflight: hostname=%s", hn)
}
log.Printf("preflight: TAILSCALED_PATH=%q", os.Getenv("TAILSCALED_PATH"))
log.Printf("preflight: TS_SSH_INTEGRATION_TEST_USER=%q", os.Getenv("TS_SSH_INTEGRATION_TEST_USER"))
log.Printf("preflight: testVarRoot=%s", testVarRoot)
if p := os.Getenv("TAILSCALED_PATH"); p != "" {
if fi, err := os.Stat(p); err != nil {
log.Fatalf("preflight: TAILSCALED_PATH=%q not usable: %v", p, err)
} else if fi.Mode()&0111 == 0 {
log.Fatalf("preflight: TAILSCALED_PATH=%q is not executable (mode %v)", p, fi.Mode())
}
}
if sshPath, err := exec.LookPath("ssh"); err == nil {
log.Printf("preflight: ssh=%s", sshPath)
if out, err := exec.Command("ssh", "-V").CombinedOutput(); err == nil {
log.Printf("preflight: ssh -V: %s", bytes.TrimSpace(out))
}
} else {
log.Printf("preflight: ssh not found in PATH (%v)", err)
}
username := exitCodeTestUser()
log.Printf("preflight: exitCodeTestUser=%q", username)
if u, err := user.Lookup(username); err != nil {
log.Fatalf("preflight: user.Lookup(%q) failed: %v", username, err)
} else {
log.Printf("preflight: user %q -> uid=%s gid=%s home=%s", username, u.Uid, u.Gid, u.HomeDir)
}
if um, err := userLookup(username); err != nil {
log.Fatalf("preflight: userLookup(%q) failed: %v", username, err)
} else {
shell := um.LoginShell()
log.Printf("preflight: login shell for %q: %q", username, shell)
if shell == "" {
log.Fatalf("preflight: empty login shell for %q", username)
}
if _, err := exec.LookPath(shell); err != nil {
// LookPath fails on absolute paths only if the file is not
// executable / not found.
if _, statErr := os.Stat(shell); statErr != nil {
log.Fatalf("preflight: login shell %q for user %q is not usable: %v", shell, username, statErr)
}
}
}
// Generate host keys eagerly so the first parallel testServer() call
// doesn't race other concurrent calls and so that any failure surfaces
// here with full context instead of inside the SSH server goroutine.
if _, err := getHostKeys(testVarRoot, log.Printf); err != nil {
log.Fatalf("preflight: getHostKeys(%q) failed: %v", testVarRoot, err)
}
log.Printf("preflight: host keys ready in %s/ssh", testVarRoot)
}
func TestIntegrationSSH(t *testing.T) {
homeDir := "/home/testuser"
if runtime.GOOS == "darwin" {
@ -957,9 +1030,18 @@ func (conn *addressFakingConn) RemoteAddr() net.Addr {
// TestIntegrationExitCodes verifies that SSH exit codes are correctly
// delivered to the client through the full server stack.
//
// The test exercises the production code path end-to-end: a real SSH server
// (tailssh.server) running with TAILSCALED_PATH set so the incubator is
// invoked, a real OS user with a real login shell, and a Go x/crypto/ssh
// client interpreting the wire-level exit-status frame. The fix in #18256
// is specifically about the order of exit-status, EOF, and CHANNEL_CLOSE
// frames on the wire, which is exactly what this exercises.
//
// Transient infrastructure failures (failure to dial the listener, auth
// errors before exec) are retried; only an exit code mismatch — the actual
// behavior we are asserting — is treated as a hard failure.
func TestIntegrationExitCodes(t *testing.T) {
debugTest.Store(true)
t.Cleanup(func() { debugTest.Store(false) })
username := exitCodeTestUser()
tests := []struct {
@ -988,66 +1070,98 @@ func TestIntegrationExitCodes(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cl := testClientForUser(t, username, false, false)
s, err := cl.NewSession()
if err != nil {
t.Fatal(err)
}
defer s.Close()
defer dumpIncubatorLogOnFail(t)
type result struct {
out []byte
err error
}
done := make(chan result, 1)
go func() {
out, err := s.CombinedOutput(tt.cmd)
done <- result{out: out, err: err}
}()
var out []byte
select {
case res := <-done:
out = res.out
err = res.err
case <-time.After(20 * time.Second):
s.Close()
cl.Close()
t.Fatalf("ssh command %q timed out", tt.cmd)
}
if tt.wantCode == 0 {
runOnce := func() (gotCode int, transportErr error, exitErr error, out []byte) {
cl, dialErr := dialTestClientForUser(t, username, false, false)
if dialErr != nil {
return -1, dialErr, nil, nil
}
defer cl.Close()
s, err := cl.NewSession()
if err != nil {
t.Fatalf("expected exit code 0, got error: %v; output:\n%s", err, out)
return -1, fmt.Errorf("NewSession: %w", err), nil, nil
}
defer s.Close()
type result struct {
out []byte
err error
}
done := make(chan result, 1)
go func() {
o, e := s.CombinedOutput(tt.cmd)
done <- result{out: o, err: e}
}()
select {
case res := <-done:
out = res.out
err = res.err
case <-time.After(20 * time.Second):
return -1, errors.New("ssh command timed out"), nil, out
}
if err == nil {
return 0, nil, nil, out
}
var ee *ssh.ExitError
if errors.As(err, &ee) {
return ee.ExitStatus(), nil, ee, out
}
// Anything else (channel error, EOF before exit-status,
// transport tear-down) is treated as transport noise so
// it can be retried. The actual bug we're catching only
// surfaces as ExitStatus().
return -1, fmt.Errorf("non-exit ssh error: %w", err), nil, out
}
const maxAttempts = 3
var lastErr error
var lastOut []byte
for attempt := 1; attempt <= maxAttempts; attempt++ {
gotCode, transportErr, _, out := runOnce()
if transportErr != nil {
t.Logf("attempt %d/%d: transport-level failure: %v; output:\n%s",
attempt, maxAttempts, transportErr, out)
lastErr = transportErr
lastOut = out
time.Sleep(time.Duration(attempt) * 250 * time.Millisecond)
continue
}
// We got a definitive exit code from the server. This is
// the assertion target; never retry past this point.
if gotCode != tt.wantCode {
t.Fatalf("exit code = %d, want %d; output:\n%s", gotCode, tt.wantCode, out)
}
return
}
if err == nil {
t.Fatalf("expected exit code %d, got nil error; output:\n%s", tt.wantCode, out)
}
var exitErr *ssh.ExitError
if !errors.As(err, &exitErr) {
t.Fatalf("expected *ssh.ExitError, got %T: %v; output:\n%s", err, err, out)
}
if exitErr.ExitStatus() != tt.wantCode {
t.Errorf("exit code = %d, want %d; output:\n%s", exitErr.ExitStatus(), tt.wantCode, out)
}
t.Fatalf("ssh command %q never completed cleanly after %d attempts; last err: %v; last output:\n%s",
tt.cmd, maxAttempts, lastErr, lastOut)
})
}
}
// TestOpenSSHExitCodes verifies that exit codes are propagated to a real
// OpenSSH client. This covers the client-visible behavior from #18256.
//
// macOS ships its own OpenSSH build (LibreSSL fork) with the system, which
// is what users of #18256 actually run; that's why this test must exercise
// the OpenSSH binary, not just the Go ssh client.
//
// The auth path is "none" auth: the client offers nothing, the tailssh
// server's clientAuth callback accepts it. We disable every other auth
// method so OpenSSH has nothing to fall back to and can't pick a different
// path on different OpenSSH versions.
func TestOpenSSHExitCodes(t *testing.T) {
if _, err := exec.LookPath("ssh"); err != nil {
sshPath, err := exec.LookPath("ssh")
if err != nil {
t.Skipf("skipping test without OpenSSH client: %v", err)
}
debugTest.Store(true)
t.Cleanup(func() { debugTest.Store(false) })
username := exitCodeTestUser()
if out, err := exec.Command("ssh", "-V").CombinedOutput(); err == nil {
if out, err := exec.Command(sshPath, "-V").CombinedOutput(); err == nil {
t.Logf("OpenSSH version: %s", bytes.TrimSpace(out))
}
t.Logf("OpenSSH test user: %s", username)
@ -1057,6 +1171,7 @@ func TestOpenSSHExitCodes(t *testing.T) {
if err != nil {
t.Fatal(err)
}
t.Logf("tailssh server listening on %s", addr)
exitStatus := func(t *testing.T, err error) int {
t.Helper()
@ -1070,6 +1185,13 @@ func TestOpenSSHExitCodes(t *testing.T) {
return ee.ExitCode()
}
// OpenSSH-side exit codes that are infrastructure failures, not the
// behavior we're asserting. Code 255 is "ssh internal error", e.g.
// connect/auth failure before the remote command runs.
//
// https://man.openbsd.org/ssh.1#EXIT_STATUS
isTransport := func(rc int) bool { return rc == 255 }
tests := []struct {
name string
cmd string
@ -1094,35 +1216,60 @@ func TestOpenSSHExitCodes(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()
defer dumpIncubatorLogOnFail(t)
cmd := exec.CommandContext(ctx, "ssh",
"-vvv",
"-F", "/dev/null",
"-T",
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=5",
"-o", "GSSAPIAuthentication=no",
"-o", "GlobalKnownHostsFile=/dev/null",
"-o", "HostbasedAuthentication=no",
"-o", "KbdInteractiveAuthentication=no",
"-o", "NumberOfPasswordPrompts=0",
"-o", "PasswordAuthentication=no",
"-o", "PubkeyAuthentication=no",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-p", port,
username+"@"+host,
tt.cmd,
)
out, err := cmd.CombinedOutput()
if ctx.Err() == context.DeadlineExceeded {
t.Fatalf("ssh command timed out; output:\n%s", out)
runOnce := func() (rc int, out []byte) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, sshPath,
"-vvv",
"-F", "/dev/null",
"-T",
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=15",
"-o", "GSSAPIAuthentication=no",
"-o", "GlobalKnownHostsFile=/dev/null",
"-o", "HostbasedAuthentication=no",
"-o", "IdentityAgent=none",
"-o", "KbdInteractiveAuthentication=no",
"-o", "NumberOfPasswordPrompts=0",
"-o", "PasswordAuthentication=no",
"-o", "PreferredAuthentications=none",
"-o", "PubkeyAuthentication=no",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-p", port,
username+"@"+host,
tt.cmd,
)
o, err := cmd.CombinedOutput()
if ctx.Err() == context.DeadlineExceeded {
t.Logf("ssh command timed out; output:\n%s", o)
return 255, o
}
return exitStatus(t, err), o
}
if got := exitStatus(t, err); got != tt.wantCode {
t.Fatalf("ssh exit code = %d, want %d; output:\n%s", got, tt.wantCode, out)
const maxAttempts = 3
var lastOut []byte
for attempt := 1; attempt <= maxAttempts; attempt++ {
rc, out := runOnce()
lastOut = out
if isTransport(rc) && rc != tt.wantCode {
t.Logf("attempt %d/%d: ssh transport-level failure (rc=255); output:\n%s",
attempt, maxAttempts, out)
time.Sleep(time.Duration(attempt) * 250 * time.Millisecond)
continue
}
// We got a definitive exit code; this is the assertion target.
if rc != tt.wantCode {
t.Fatalf("ssh exit code = %d, want %d; output:\n%s", rc, tt.wantCode, out)
}
return
}
t.Fatalf("ssh command %q never returned a non-transport exit status after %d attempts; last output:\n%s",
tt.cmd, maxAttempts, lastOut)
})
}
}
@ -1134,6 +1281,46 @@ func exitCodeTestUser() string {
return "testuser"
}
// dumpIncubatorLogOnFail prints the contents of /tmp/tailscalessh.log
// (where the incubator's logger writes) into the test output if the test
// failed. It's the single biggest debugging affordance for CI failures
// because the incubator runs as a separate process and its output never
// reaches t.Log otherwise.
func dumpIncubatorLogOnFail(t *testing.T) {
t.Helper()
if !t.Failed() {
return
}
b, err := os.ReadFile("/tmp/tailscalessh.log")
if err != nil {
t.Logf("could not read incubator log: %v", err)
return
}
if len(b) == 0 {
t.Logf("incubator log is empty (no incubator was launched, or its log was rotated)")
return
}
t.Logf("---- /tmp/tailscalessh.log (%d bytes) ----\n%s\n---- end of incubator log ----",
len(b), b)
}
// dialTestClientForUser is a non-fatal variant of testClientForUser used
// by retry-aware tests: it returns the dial error instead of t.Fatal'ing.
// Server lifetime is still tied to the test via t.Cleanup inside testServer.
func dialTestClientForUser(t *testing.T, username string, forceV1Behavior bool, allowSendEnv bool, authMethods ...ssh.AuthMethod) (*ssh.Client, error) {
t.Helper()
addr := testServer(t, username, forceV1Behavior, allowSendEnv)
cl, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
Auth: authMethods,
Timeout: 15 * time.Second,
})
if err != nil {
return nil, err
}
return cl, nil
}
// TestLocalUnixForwardingHalfClose verifies that the bidirectional copy
// in Unix socket forwarding uses half-close correctly: when one direction
// finishes, the other direction's data is not lost. This tests the bicopy