mirror of
https://github.com/tailscale/tailscale.git
synced 2026-05-05 12:16:44 +02:00
ssh/tailssh: fix exit-status ordering and improve session shutdown
Fix a race where CloseWrite (SSH_MSG_CHANNEL_EOF) could be sent before Exit (exit-status), causing SSH clients (especially on macOS) to miss the exit status. The root cause was a select between outputDone and ctx.Done that allowed Exit to be called while stdout was still copying, or after CloseWrite had already been sent. Replace the atomic.Bool/atomic.Int32/channel synchronization with sync.WaitGroup and guarantee the wire ordering required by RFC 4254 section 6.10: exit-status -> EOF -> CHANNEL_CLOSE. The new ordering in run(): 1. cmd.Wait() returns with exit code 2. ss.Exit(exitCode) sends exit-status 3. closeAll(childPipes) signals io.Copy goroutines to finish 4. wg.Wait() waits for stdout goroutine which calls CloseWrite (EOF) 5. defer ss.Close() sends CHANNEL_CLOSE on function return Additional changes: - Send SIGHUP instead of SIGKILL on session termination - Use exit code 255 for SSH protocol/permission errors - Use exit code 127 for command not found - Use exit code 254 for recording failures - Add isNotFoundOrExecutable helper Based on tailscale/tailscale#18331. Fixes #18256 Change-Id: Ib0c984d466c1a96c8f642e94a5dfe60d33d71fd8 Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
parent
290a6cc03c
commit
4fd5bcc2b8
@ -164,4 +164,4 @@
|
||||
});
|
||||
};
|
||||
}
|
||||
# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc=
|
||||
# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk=
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
"sri": "sha256-pCvFNTFuvhSBb5O+PPuilaowP4tXcCOP1NgYUDJTcJU="
|
||||
},
|
||||
"vendor": {
|
||||
"goModSum": "sha256-xjPeSzdlDw247JtuZ9gI/OXh0IYvQV3qN1WNRbSlir8=",
|
||||
"sri": "sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc="
|
||||
"goModSum": "sha256-gyNEKCCFTtvRVkcavNxRd6laPOJ9CvdiTQyL4bLHyco=",
|
||||
"sri": "sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk="
|
||||
}
|
||||
}
|
||||
|
||||
2
go.mod
2
go.mod
@ -92,7 +92,7 @@ require (
|
||||
github.com/studio-b12/gowebdav v0.9.0
|
||||
github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d
|
||||
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed
|
||||
github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41
|
||||
github.com/tailscale/gokrazy-kernel v0.0.0-20240728225134-3d23beabda2e
|
||||
github.com/tailscale/golang-x-crypto v0.0.0-20250404221719-a5573b049869
|
||||
|
||||
4
go.sum
4
go.sum
@ -1131,8 +1131,8 @@ github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d h1:JcGKBZAL7
|
||||
github.com/tailscale/certstore v0.1.1-0.20260409135935-3638fb84b77d/go.mod h1:XrBNfAFN+pwoWuksbFS9Ccxnopa15zJGgXRFN90l3K4=
|
||||
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f h1:PDPGJtm9PFBLNudHGwkfUGp/FWvP+kXXJ0D1pB35F40=
|
||||
github.com/tailscale/depaware v0.0.0-20251001183927-9c2ad255ef3f/go.mod h1:p9lPsd+cx33L3H9nNoecRRxPssFKUwwI50I3pZ0yT+8=
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89 h1:glgVc1ZYMjwN1Q/ITWeuSQyl029uayagaR2sjsifehc=
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260330083525-c1389c70ff89/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw=
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed h1:eM/Z6f15XvHPVmf7RDh0l+6HkjX8ZnO3LvofoaAMdkc=
|
||||
github.com/tailscale/gliderssh v0.3.4-0.20260429115553-434cb5bd3eed/go.mod h1:wn16Km1EZOX4UEAyaZa3dBwfFGOJ7neck40NcwosJUw=
|
||||
github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55 h1:Gzfnfk2TWrk8Jj4P4c1a3CtQyMaTVCznlkLZI++hok4=
|
||||
github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55/go.mod h1:4k4QO+dQ3R5FofL+SanAUZe+/QfeK0+OIuwDIRu2vSg=
|
||||
github.com/tailscale/goexpect v0.0.0-20210902213824-6e8c725cea41 h1:/V2rCMMWcsjYaYO2MeovLw+ClP63OtXgCF2Y1eb8+Ns=
|
||||
|
||||
@ -16,4 +16,4 @@
|
||||
) {
|
||||
src = ./.;
|
||||
}).shellNix
|
||||
# nix-direnv cache busting line: sha256-5zxCDQ12bu8dvJ51RCQk/m07oM2qNNrTB5cbb1Za/sc=
|
||||
# nix-direnv cache busting line: sha256-jdsGXGI6v+uaC9rWZT30WEngV9HHSHbg/kNKje4U0Uk=
|
||||
|
||||
@ -878,11 +878,17 @@ func (ss *sshSession) killProcessOnContextDone() {
|
||||
// We don't need to Process.Wait here, sshSession.run() does
|
||||
// the waiting regardless of termination reason.
|
||||
|
||||
// TODO(maisem): should this be a SIGTERM followed by a SIGKILL?
|
||||
ss.cmd.Process.Kill()
|
||||
// Send SIGHUP like a real terminal disconnect would.
|
||||
ss.cmd.Process.Signal(syscall.SIGHUP)
|
||||
})
|
||||
}
|
||||
|
||||
// isNotFoundOrExecutable reports whether err indicates the command
|
||||
// could not be found or executed.
|
||||
func isNotFoundOrExecutable(err error) bool {
|
||||
return errors.Is(err, exec.ErrNotFound) || errors.Is(err, os.ErrNotExist)
|
||||
}
|
||||
|
||||
// attachSession registers ss as an active session.
|
||||
func (c *conn) attachSession(ss *sshSession) {
|
||||
c.srv.sessionWaitGroup.Add(1)
|
||||
@ -968,10 +974,11 @@ func (ss *sshSession) run() {
|
||||
metricActiveSessions.Add(1)
|
||||
defer metricActiveSessions.Add(-1)
|
||||
defer ss.cancelCtx(errSessionDone)
|
||||
defer ss.Close()
|
||||
|
||||
if attached := ss.conn.srv.attachSessionToConnIfNotShutdown(ss); !attached {
|
||||
fmt.Fprintf(ss, "Tailscale SSH is shutting down\r\n")
|
||||
ss.Exit(1)
|
||||
ss.Exit(255)
|
||||
return
|
||||
}
|
||||
defer ss.conn.detachSession(ss)
|
||||
@ -993,7 +1000,9 @@ func (ss *sshSession) run() {
|
||||
if lu.Uid != fmt.Sprint(euid) {
|
||||
ss.logf("can't switch to user %q from process euid %v", lu.Username, euid)
|
||||
fmt.Fprintf(ss, "can't switch user\r\n")
|
||||
ss.Exit(1)
|
||||
// Exit code 255 indicates SSH protocol/permission error,
|
||||
// matching OpenSSH behavior for fatal errors.
|
||||
ss.Exit(255)
|
||||
return
|
||||
}
|
||||
}
|
||||
@ -1021,7 +1030,9 @@ func (ss *sshSession) run() {
|
||||
fmt.Fprintf(ss, "can't start new recording\r\n")
|
||||
}
|
||||
ss.logf("startNewRecording: %v", err)
|
||||
ss.Exit(1)
|
||||
// Exit code 254 for recording infrastructure failure.
|
||||
// Distinct from 255 (SSH protocol error) and 1 (general failure).
|
||||
ss.Exit(254)
|
||||
return
|
||||
}
|
||||
ss.logf("startNewRecording: <nil>")
|
||||
@ -1034,6 +1045,7 @@ func (ss *sshSession) run() {
|
||||
err := ss.launchProcess()
|
||||
if err != nil {
|
||||
logf("start failed: %v", err.Error())
|
||||
exitCode := 1
|
||||
if errors.Is(err, context.Canceled) {
|
||||
cause := context.Cause(ss.ctx)
|
||||
if serr, ok := cause.(SSHTerminationError); ok {
|
||||
@ -1041,67 +1053,58 @@ func (ss *sshSession) run() {
|
||||
io.WriteString(ss.Stderr(), "\r\n\r\n"+msg+"\r\n\r\n")
|
||||
}
|
||||
}
|
||||
} else if isNotFoundOrExecutable(err) {
|
||||
// Use exit code 127 for "command not found" per shell convention.
|
||||
exitCode = 127
|
||||
}
|
||||
ss.Exit(1)
|
||||
ss.Exit(exitCode)
|
||||
return
|
||||
}
|
||||
ss.exitHandled = make(chan struct{})
|
||||
go ss.killProcessOnContextDone()
|
||||
|
||||
var processDone atomic.Bool
|
||||
// Start goroutines to copy stdin/stdout/stderr.
|
||||
var wg sync.WaitGroup
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer ss.wrStdin.Close()
|
||||
if _, err := io.Copy(rec.writer("i", ss.wrStdin), ss); err != nil {
|
||||
logf("stdin copy: %v", err)
|
||||
ss.cancelCtx(err)
|
||||
}
|
||||
}()
|
||||
outputDone := make(chan struct{})
|
||||
var openOutputStreams atomic.Int32
|
||||
if ss.rdStderr != nil {
|
||||
openOutputStreams.Store(2)
|
||||
} else {
|
||||
openOutputStreams.Store(1)
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer ss.rdStdout.Close()
|
||||
_, err := io.Copy(rec.writer("o", ss), ss.rdStdout)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
isErrBecauseProcessExited := processDone.Load() && errors.Is(err, syscall.EIO)
|
||||
if !isErrBecauseProcessExited {
|
||||
logf("stdout copy: %v", err)
|
||||
ss.cancelCtx(err)
|
||||
}
|
||||
}
|
||||
if openOutputStreams.Add(-1) == 0 {
|
||||
ss.CloseWrite()
|
||||
close(outputDone)
|
||||
if _, err := io.Copy(rec.writer("o", ss), ss.rdStdout); err != nil && !errors.Is(err, io.EOF) {
|
||||
logf("stdout copy: %v", err)
|
||||
}
|
||||
ss.CloseWrite()
|
||||
}()
|
||||
|
||||
// rdStderr is nil for ptys.
|
||||
if ss.rdStderr != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer ss.rdStderr.Close()
|
||||
_, err := io.Copy(ss.Stderr(), ss.rdStderr)
|
||||
if err != nil {
|
||||
if _, err := io.Copy(ss.Stderr(), ss.rdStderr); err != nil {
|
||||
logf("stderr copy: %v", err)
|
||||
}
|
||||
if openOutputStreams.Add(-1) == 0 {
|
||||
ss.CloseWrite()
|
||||
close(outputDone)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
err = ss.cmd.Wait()
|
||||
processDone.Store(true)
|
||||
|
||||
if ss.ctx.Err() != nil {
|
||||
// Context was canceled (e.g., recording upload failure).
|
||||
// Wait for killProcessOnContextDone to finish writing any
|
||||
// termination message before we proceed. This must happen
|
||||
// before closeAll and CloseWrite so the SSH channel is
|
||||
// still writable.
|
||||
// before Exit so the SSH channel is still writable.
|
||||
<-ss.exitHandled
|
||||
}
|
||||
|
||||
@ -1110,31 +1113,26 @@ func (ss *sshSession) run() {
|
||||
// aforementioned goroutine.
|
||||
ss.exitOnce.Do(func() {})
|
||||
|
||||
// Close the process-side of all pipes to signal the asynchronous
|
||||
// io.Copy routines reading/writing from the pipes to terminate.
|
||||
// Block for the io.Copy to finish before calling ss.Exit below.
|
||||
closeAll(ss.childPipes...)
|
||||
select {
|
||||
case <-outputDone:
|
||||
case <-ss.ctx.Done():
|
||||
<-ss.exitHandled
|
||||
}
|
||||
|
||||
var exitCode int
|
||||
if err == nil {
|
||||
ss.logf("Session complete")
|
||||
ss.Exit(0)
|
||||
return
|
||||
}
|
||||
if ee, ok := err.(*exec.ExitError); ok {
|
||||
code := ee.ProcessState.ExitCode()
|
||||
ss.logf("Wait: code=%v", code)
|
||||
ss.Exit(code)
|
||||
return
|
||||
exitCode = 0
|
||||
} else if ee, ok := err.(*exec.ExitError); ok {
|
||||
exitCode = ee.ProcessState.ExitCode()
|
||||
ss.logf("Wait: code=%v", exitCode)
|
||||
} else {
|
||||
ss.logf("Wait: %v", err)
|
||||
exitCode = 1
|
||||
}
|
||||
|
||||
ss.logf("Wait: %v", err)
|
||||
ss.Exit(1)
|
||||
return
|
||||
// Send exit-status before EOF/Close. Per RFC 4254 section 6.10,
|
||||
// exit-status should be sent before channel close.
|
||||
ss.Exit(exitCode)
|
||||
|
||||
// Close process-side of pipes to signal io.Copy goroutines
|
||||
// to finish, then wait for all I/O to complete.
|
||||
closeAll(ss.childPipes...)
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// recordSSHToLocalDisk is a deprecated dev knob to allow recording SSH sessions
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user