From 4fd5bcc2b84f8c80e8e636abab4d13b6ef387524 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 8 Apr 2026 11:58:30 +0000 Subject: [PATCH] ssh/tailssh: fix exit-status ordering and improve session shutdown Fix a race where CloseWrite (SSH_MSG_CHANNEL_EOF) could be sent before Exit (exit-status), causing SSH clients (especially on macOS) to miss the exit status. The root cause was a select between outputDone and ctx.Done that allowed Exit to be called while stdout was still copying, or after CloseWrite had already been sent. Replace the atomic.Bool/atomic.Int32/channel synchronization with sync.WaitGroup and guarantee the wire ordering required by RFC 4254 section 6.10: exit-status -> EOF -> CHANNEL_CLOSE. The new ordering in run(): 1. cmd.Wait() returns with exit code 2. ss.Exit(exitCode) sends exit-status 3. closeAll(childPipes) signals io.Copy goroutines to finish 4. wg.Wait() waits for stdout goroutine which calls CloseWrite (EOF) 5. defer ss.Close() sends CHANNEL_CLOSE on function return Additional changes: - Send SIGHUP instead of SIGKILL on session termination - Use exit code 255 for SSH protocol/permission errors - Use exit code 127 for command not found - Use exit code 254 for recording failures - Add isNotFoundOrExecutable helper Based on tailscale/tailscale#18331. Fixes #18256 Change-Id: Ib0c984d466c1a96c8f642e94a5dfe60d33d71fd8 Signed-off-by: Kristoffer Dalby --- flake.nix | 2 +- flakehashes.json | 4 +- go.mod | 2 +- go.sum | 4 +- shell.nix | 2 +- ssh/tailssh/tailssh.go | 108 ++++++++++++++++++++--------------------- 6 files changed, 60 insertions(+), 62 deletions(-) diff --git a/flake.nix b/flake.nix index 2a5880c10..900a3b181 100644 --- a/flake.nix +++ b/flake.nix @@ -164,4 +164,4 @@ }); }; } -# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc= +# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk= diff --git a/flakehashes.json b/flakehashes.json index a90e77cf1..ddc17cb7e 100644 --- a/flakehashes.json +++ b/flakehashes.json @@ -4,7 +4,7 @@ "sri": "sha256-pCvFNTFuvhSBb5O+PPuilaowP4tXcCOP1NgYUDJTcJU=" }, "vendor": { - "goModSum": "sha256-xjPeSzdlDw247JtuZ9gI/OXh0IYvQV3qN1WNRbSlir8=", - "sri": "sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc=" + "goModSum": "sha256-gyNEKCCFTtvRVkcavNxRd6laPOJ9CvdiTQyL4bLHyco=", + "sri": "sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk=" } } diff --git a/go.mod b/go.mod index 8adb1f076..917556b9f 100644 --- a/go.mod +++ b/go.mod @@ -92,7 +92,7 @@ require ( github.com/studio-b12/gowebdav v0.9.0 github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f - github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89 + github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41 github.com/tailscale/gokrazy-kernel v0.0.0-20240728225134-3d23beabda2e github.com/tailscale/golang-x-crypto v0.0.0-20250404221719-a5573b049869 diff --git a/go.sum b/go.sum index 2caac328d..1cc113e30 100644 --- a/go.sum +++ b/go.sum @@ -1131,8 +1131,8 @@ github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d h1:JcGKBZAL7 github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d/go.mod h1:XrBNfAFN+pwoWuksbFS9Ccxnopa15zJGgXRFN90l3K4= github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f h1:PDPGJtm9PFBLNudHGwkfUGp/FWvP+kXXJ0D1pB35F40= github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f/go.mod h1:p9lPsd+cx33L3H9nNoecRRxPssFKUwwI50I3pZ0yT+8= -github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89 h1:glgVc1ZYMjwN1Q/ITWeuSQyl029uayagaR2sjsifehc= -github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw= +github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed h1:eM/Z6f15XvHPVmf7RDh0l+6HkjX8ZnO3LvofoaAMdkc= +github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw= github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55 h1:Gzfnfk2TWrk8Jj4P4c1a3CtQyMaTVCznlkLZI++hok4= github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55/go.mod h1:4k4QO+dQ3R5FofL+SanAUZe+/QfeK0+OIuwDIRu2vSg= github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41 h1:/V2rCMMWcsjYaYO2MeovLw+ClP63OtXgCF2Y1eb8+Ns= diff --git a/shell.nix b/shell.nix index 6c36f7706..c9c2f0404 100644 --- a/shell.nix +++ b/shell.nix @@ -16,4 +16,4 @@ ) { src = ./.; }).shellNix -# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc= +# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk= diff --git a/ssh/tailssh/tailssh.go b/ssh/tailssh/tailssh.go index e01f78eb3..90bc86ffc 100644 --- a/ssh/tailssh/tailssh.go +++ b/ssh/tailssh/tailssh.go @@ -878,11 +878,17 @@ func (ss *sshSession) killProcessOnContextDone() { // We don't need to Process.Wait here, sshSession.run() does // the waiting regardless of termination reason. - // TODO(maisem): should this be a SIGTERM followed by a SIGKILL? - ss.cmd.Process.Kill() + // Send SIGHUP like a real terminal disconnect would. + ss.cmd.Process.Signal(syscall.SIGHUP) }) } +// isNotFoundOrExecutable reports whether err indicates the command +// could not be found or executed. +func isNotFoundOrExecutable(err error) bool { + return errors.Is(err, exec.ErrNotFound) || errors.Is(err, os.ErrNotExist) +} + // attachSession registers ss as an active session. func (c *conn) attachSession(ss *sshSession) { c.srv.sessionWaitGroup.Add(1) @@ -968,10 +974,11 @@ func (ss *sshSession) run() { metricActiveSessions.Add(1) defer metricActiveSessions.Add(-1) defer ss.cancelCtx(errSessionDone) + defer ss.Close() if attached := ss.conn.srv.attachSessionToConnIfNotShutdown(ss); !attached { fmt.Fprintf(ss, "Tailscale SSH is shutting down\r\n") - ss.Exit(1) + ss.Exit(255) return } defer ss.conn.detachSession(ss) @@ -993,7 +1000,9 @@ func (ss *sshSession) run() { if lu.Uid != fmt.Sprint(euid) { ss.logf("can't switch to user %q from process euid %v", lu.Username, euid) fmt.Fprintf(ss, "can't switch user\r\n") - ss.Exit(1) + // Exit code 255 indicates SSH protocol/permission error, + // matching OpenSSH behavior for fatal errors. + ss.Exit(255) return } } @@ -1021,7 +1030,9 @@ func (ss *sshSession) run() { fmt.Fprintf(ss, "can't start new recording\r\n") } ss.logf("startNewRecording: %v", err) - ss.Exit(1) + // Exit code 254 for recording infrastructure failure. + // Distinct from 255 (SSH protocol error) and 1 (general failure). + ss.Exit(254) return } ss.logf("startNewRecording: ") @@ -1034,6 +1045,7 @@ func (ss *sshSession) run() { err := ss.launchProcess() if err != nil { logf("start failed: %v", err.Error()) + exitCode := 1 if errors.Is(err, context.Canceled) { cause := context.Cause(ss.ctx) if serr, ok := cause.(SSHTerminationError); ok { @@ -1041,67 +1053,58 @@ func (ss *sshSession) run() { io.WriteString(ss.Stderr(), "\r\n\r\n"+msg+"\r\n\r\n") } } + } else if isNotFoundOrExecutable(err) { + // Use exit code 127 for "command not found" per shell convention. + exitCode = 127 } - ss.Exit(1) + ss.Exit(exitCode) return } ss.exitHandled = make(chan struct{}) go ss.killProcessOnContextDone() - var processDone atomic.Bool + // Start goroutines to copy stdin/stdout/stderr. + var wg sync.WaitGroup + + wg.Add(1) go func() { + defer wg.Done() defer ss.wrStdin.Close() if _, err := io.Copy(rec.writer("i", ss.wrStdin), ss); err != nil { logf("stdin copy: %v", err) ss.cancelCtx(err) } }() - outputDone := make(chan struct{}) - var openOutputStreams atomic.Int32 - if ss.rdStderr != nil { - openOutputStreams.Store(2) - } else { - openOutputStreams.Store(1) - } + + wg.Add(1) go func() { + defer wg.Done() defer ss.rdStdout.Close() - _, err := io.Copy(rec.writer("o", ss), ss.rdStdout) - if err != nil && !errors.Is(err, io.EOF) { - isErrBecauseProcessExited := processDone.Load() && errors.Is(err, syscall.EIO) - if !isErrBecauseProcessExited { - logf("stdout copy: %v", err) - ss.cancelCtx(err) - } - } - if openOutputStreams.Add(-1) == 0 { - ss.CloseWrite() - close(outputDone) + if _, err := io.Copy(rec.writer("o", ss), ss.rdStdout); err != nil && !errors.Is(err, io.EOF) { + logf("stdout copy: %v", err) } + ss.CloseWrite() }() + // rdStderr is nil for ptys. if ss.rdStderr != nil { + wg.Add(1) go func() { + defer wg.Done() defer ss.rdStderr.Close() - _, err := io.Copy(ss.Stderr(), ss.rdStderr) - if err != nil { + if _, err := io.Copy(ss.Stderr(), ss.rdStderr); err != nil { logf("stderr copy: %v", err) } - if openOutputStreams.Add(-1) == 0 { - ss.CloseWrite() - close(outputDone) - } }() } err = ss.cmd.Wait() - processDone.Store(true) if ss.ctx.Err() != nil { // Context was canceled (e.g., recording upload failure). // Wait for killProcessOnContextDone to finish writing any // termination message before we proceed. This must happen - // before closeAll and CloseWrite so the SSH channel is - // still writable. + // before Exit so the SSH channel is still writable. <-ss.exitHandled } @@ -1110,31 +1113,26 @@ func (ss *sshSession) run() { // aforementioned goroutine. ss.exitOnce.Do(func() {}) - // Close the process-side of all pipes to signal the asynchronous - // io.Copy routines reading/writing from the pipes to terminate. - // Block for the io.Copy to finish before calling ss.Exit below. - closeAll(ss.childPipes...) - select { - case <-outputDone: - case <-ss.ctx.Done(): - <-ss.exitHandled - } - + var exitCode int if err == nil { ss.logf("Session complete") - ss.Exit(0) - return - } - if ee, ok := err.(*exec.ExitError); ok { - code := ee.ProcessState.ExitCode() - ss.logf("Wait: code=%v", code) - ss.Exit(code) - return + exitCode = 0 + } else if ee, ok := err.(*exec.ExitError); ok { + exitCode = ee.ProcessState.ExitCode() + ss.logf("Wait: code=%v", exitCode) + } else { + ss.logf("Wait: %v", err) + exitCode = 1 } - ss.logf("Wait: %v", err) - ss.Exit(1) - return + // Send exit-status before EOF/Close. Per RFC 4254 section 6.10, + // exit-status should be sent before channel close. + ss.Exit(exitCode) + + // Close process-side of pipes to signal io.Copy goroutines + // to finish, then wait for all I/O to complete. + closeAll(ss.childPipes...) + wg.Wait() } // recordSSHToLocalDisk is a deprecated dev knob to allow recording SSH sessions