ssh/tailssh: fix exit-status ordering and improve session shutdown

Fix a race where CloseWrite (SSH_MSG_CHANNEL_EOF) could be sent before
Exit (exit-status), causing SSH clients (especially on macOS) to miss
the exit status. The root cause was a select between outputDone and
ctx.Done that allowed Exit to be called while stdout was still copying,
or after CloseWrite had already been sent.

Replace the atomic.Bool/atomic.Int32/channel synchronization with
sync.WaitGroup and guarantee the wire ordering required by RFC 4254
section 6.10: exit-status -> EOF -> CHANNEL_CLOSE.

The new ordering in run():
1. cmd.Wait() returns with exit code
2. ss.Exit(exitCode) sends exit-status
3. closeAll(childPipes) signals io.Copy goroutines to finish
4. wg.Wait() waits for stdout goroutine which calls CloseWrite (EOF)
5. defer ss.Close() sends CHANNEL_CLOSE on function return

Additional changes:
- Send SIGHUP instead of SIGKILL on session termination
- Use exit code 255 for SSH protocol/permission errors
- Use exit code 127 for command not found
- Use exit code 254 for recording failures
- Add isNotFoundOrExecutable helper

Based on tailscale/tailscale#18331.

Fixes #18256

Change-Id: Ib0c984d466c1a96c8f642e94a5dfe60d33d71fd8
Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2026-04-08 11:58:30 +00:00
parent 290a6cc03c
commit 4fd5bcc2b8
6 changed files with 60 additions and 62 deletions

View File

@ -164,4 +164,4 @@
});
};
}
# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc=
# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk=

View File

@ -4,7 +4,7 @@
"sri": "sha256-pCvFNTFuvhSBb5O+PPuilaowP4tXcCOP1NgYUDJTcJU="
},
"vendor": {
"goModSum": "sha256-xjPeSzdlDw247JtuZ9gI/OXh0IYvQV3qN1WNRbSlir8=",
"sri": "sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc="
"goModSum": "sha256-gyNEKCCFTtvRVkcavNxRd6laPOJ9CvdiTQyL4bLHyco=",
"sri": "sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk="
}
}

2
go.mod
View File

@ -92,7 +92,7 @@ require (
github.com/studio-b12/gowebdav v0.9.0
github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed
github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41
github.com/tailscale/gokrazy-kernel v0.0.0-20240728225134-3d23beabda2e
github.com/tailscale/golang-x-crypto v0.0.0-20250404221719-a5573b049869

4
go.sum
View File

@ -1131,8 +1131,8 @@ github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d h1:JcGKBZAL7
github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d/go.mod h1:XrBNfAFN+pwoWuksbFS9Ccxnopa15zJGgXRFN90l3K4=
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f h1:PDPGJtm9PFBLNudHGwkfUGp/FWvP+kXXJ0D1pB35F40=
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f/go.mod h1:p9lPsd+cx33L3H9nNoecRRxPssFKUwwI50I3pZ0yT+8=
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89 h1:glgVc1ZYMjwN1Q/ITWeuSQyl029uayagaR2sjsifehc=
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw=
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed h1:eM/Z6f15XvHPVmf7RDh0l+6HkjX8ZnO3LvofoaAMdkc=
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw=
github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55 h1:Gzfnfk2TWrk8Jj4P4c1a3CtQyMaTVCznlkLZI++hok4=
github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55/go.mod h1:4k4QO+dQ3R5FofL+SanAUZe+/QfeK0+OIuwDIRu2vSg=
github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41 h1:/V2rCMMWcsjYaYO2MeovLw+ClP63OtXgCF2Y1eb8+Ns=

View File

@ -16,4 +16,4 @@
) {
src = ./.;
}).shellNix
# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc=
# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk=

View File

@ -878,11 +878,17 @@ func (ss *sshSession) killProcessOnContextDone() {
// We don't need to Process.Wait here, sshSession.run() does
// the waiting regardless of termination reason.
// TODO(maisem): should this be a SIGTERM followed by a SIGKILL?
ss.cmd.Process.Kill()
// Send SIGHUP like a real terminal disconnect would.
ss.cmd.Process.Signal(syscall.SIGHUP)
})
}
// isNotFoundOrExecutable reports whether err indicates the command
// could not be found or executed.
func isNotFoundOrExecutable(err error) bool {
return errors.Is(err, exec.ErrNotFound) || errors.Is(err, os.ErrNotExist)
}
// attachSession registers ss as an active session.
func (c *conn) attachSession(ss *sshSession) {
c.srv.sessionWaitGroup.Add(1)
@ -968,10 +974,11 @@ func (ss *sshSession) run() {
metricActiveSessions.Add(1)
defer metricActiveSessions.Add(-1)
defer ss.cancelCtx(errSessionDone)
defer ss.Close()
if attached := ss.conn.srv.attachSessionToConnIfNotShutdown(ss); !attached {
fmt.Fprintf(ss, "Tailscale SSH is shutting down\r\n")
ss.Exit(1)
ss.Exit(255)
return
}
defer ss.conn.detachSession(ss)
@ -993,7 +1000,9 @@ func (ss *sshSession) run() {
if lu.Uid != fmt.Sprint(euid) {
ss.logf("can't switch to user %q from process euid %v", lu.Username, euid)
fmt.Fprintf(ss, "can't switch user\r\n")
ss.Exit(1)
// Exit code 255 indicates SSH protocol/permission error,
// matching OpenSSH behavior for fatal errors.
ss.Exit(255)
return
}
}
@ -1021,7 +1030,9 @@ func (ss *sshSession) run() {
fmt.Fprintf(ss, "can't start new recording\r\n")
}
ss.logf("startNewRecording: %v", err)
ss.Exit(1)
// Exit code 254 for recording infrastructure failure.
// Distinct from 255 (SSH protocol error) and 1 (general failure).
ss.Exit(254)
return
}
ss.logf("startNewRecording: <nil>")
@ -1034,6 +1045,7 @@ func (ss *sshSession) run() {
err := ss.launchProcess()
if err != nil {
logf("start failed: %v", err.Error())
exitCode := 1
if errors.Is(err, context.Canceled) {
cause := context.Cause(ss.ctx)
if serr, ok := cause.(SSHTerminationError); ok {
@ -1041,67 +1053,58 @@ func (ss *sshSession) run() {
io.WriteString(ss.Stderr(), "\r\n\r\n"+msg+"\r\n\r\n")
}
}
} else if isNotFoundOrExecutable(err) {
// Use exit code 127 for "command not found" per shell convention.
exitCode = 127
}
ss.Exit(1)
ss.Exit(exitCode)
return
}
ss.exitHandled = make(chan struct{})
go ss.killProcessOnContextDone()
var processDone atomic.Bool
// Start goroutines to copy stdin/stdout/stderr.
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
defer ss.wrStdin.Close()
if _, err := io.Copy(rec.writer("i", ss.wrStdin), ss); err != nil {
logf("stdin copy: %v", err)
ss.cancelCtx(err)
}
}()
outputDone := make(chan struct{})
var openOutputStreams atomic.Int32
if ss.rdStderr != nil {
openOutputStreams.Store(2)
} else {
openOutputStreams.Store(1)
}
wg.Add(1)
go func() {
defer wg.Done()
defer ss.rdStdout.Close()
_, err := io.Copy(rec.writer("o", ss), ss.rdStdout)
if err != nil && !errors.Is(err, io.EOF) {
isErrBecauseProcessExited := processDone.Load() && errors.Is(err, syscall.EIO)
if !isErrBecauseProcessExited {
logf("stdout copy: %v", err)
ss.cancelCtx(err)
}
}
if openOutputStreams.Add(-1) == 0 {
ss.CloseWrite()
close(outputDone)
if _, err := io.Copy(rec.writer("o", ss), ss.rdStdout); err != nil && !errors.Is(err, io.EOF) {
logf("stdout copy: %v", err)
}
ss.CloseWrite()
}()
// rdStderr is nil for ptys.
if ss.rdStderr != nil {
wg.Add(1)
go func() {
defer wg.Done()
defer ss.rdStderr.Close()
_, err := io.Copy(ss.Stderr(), ss.rdStderr)
if err != nil {
if _, err := io.Copy(ss.Stderr(), ss.rdStderr); err != nil {
logf("stderr copy: %v", err)
}
if openOutputStreams.Add(-1) == 0 {
ss.CloseWrite()
close(outputDone)
}
}()
}
err = ss.cmd.Wait()
processDone.Store(true)
if ss.ctx.Err() != nil {
// Context was canceled (e.g., recording upload failure).
// Wait for killProcessOnContextDone to finish writing any
// termination message before we proceed. This must happen
// before closeAll and CloseWrite so the SSH channel is
// still writable.
// before Exit so the SSH channel is still writable.
<-ss.exitHandled
}
@ -1110,31 +1113,26 @@ func (ss *sshSession) run() {
// aforementioned goroutine.
ss.exitOnce.Do(func() {})
// Close the process-side of all pipes to signal the asynchronous
// io.Copy routines reading/writing from the pipes to terminate.
// Block for the io.Copy to finish before calling ss.Exit below.
closeAll(ss.childPipes...)
select {
case <-outputDone:
case <-ss.ctx.Done():
<-ss.exitHandled
}
var exitCode int
if err == nil {
ss.logf("Session complete")
ss.Exit(0)
return
}
if ee, ok := err.(*exec.ExitError); ok {
code := ee.ProcessState.ExitCode()
ss.logf("Wait: code=%v", code)
ss.Exit(code)
return
exitCode = 0
} else if ee, ok := err.(*exec.ExitError); ok {
exitCode = ee.ProcessState.ExitCode()
ss.logf("Wait: code=%v", exitCode)
} else {
ss.logf("Wait: %v", err)
exitCode = 1
}
ss.logf("Wait: %v", err)
ss.Exit(1)
return
// Send exit-status before EOF/Close. Per RFC 4254 section 6.10,
// exit-status should be sent before channel close.
ss.Exit(exitCode)
// Close process-side of pipes to signal io.Copy goroutines
// to finish, then wait for all I/O to complete.
closeAll(ss.childPipes...)
wg.Wait()
}
// recordSSHToLocalDisk is a deprecated dev knob to allow recording SSH sessions