mirror of
https://github.com/tailscale/tailscale.git
synced 2026-03-09 23:41:39 +01:00
Experiment with ideal connect flow
This commit is contained in:
parent
61dd2662ec
commit
eb769a2a68
@ -32,6 +32,7 @@ import (
|
||||
"tailscale.com/derp"
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/logtail/backoff"
|
||||
"tailscale.com/net/dnscache"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/net/netns"
|
||||
@ -87,19 +88,20 @@ type Client struct {
|
||||
// Client.conn holds mu.
|
||||
addrFamSelAtomic syncs.AtomicValue[AddressFamilySelector]
|
||||
|
||||
mu sync.Mutex
|
||||
atomicState syncs.AtomicValue[ConnectedState] // hold mu to write
|
||||
started bool // true upon first connect, never transitions to false
|
||||
preferred bool
|
||||
canAckPings bool
|
||||
closed bool
|
||||
netConn io.Closer
|
||||
client *derp.Client
|
||||
connGen int // incremented once per new connection; valid values are >0
|
||||
serverPubKey key.NodePublic
|
||||
tlsState *tls.ConnectionState
|
||||
pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong
|
||||
clock tstime.Clock
|
||||
mu sync.Mutex
|
||||
atomicState syncs.AtomicValue[ConnectedState] // hold mu to write
|
||||
started bool // true upon first connect, never transitions to false
|
||||
preferred bool
|
||||
canAckPings bool
|
||||
closed bool
|
||||
netConn io.Closer
|
||||
client *derp.Client
|
||||
connGen int // incremented once per new connection; valid values are >0
|
||||
serverPubKey key.NodePublic
|
||||
tlsState *tls.ConnectionState
|
||||
pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong
|
||||
clock tstime.Clock
|
||||
attemptingIdealConnection bool
|
||||
}
|
||||
|
||||
// ConnectedState describes the state of a derphttp Client.
|
||||
@ -325,6 +327,7 @@ func useWebsockets() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// TODO: split this into 2 methods so we can do like a try-connect ype thing?
|
||||
func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
@ -460,32 +463,15 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
|
||||
var serverProtoVersion int
|
||||
var tlsState *tls.ConnectionState
|
||||
if c.useHTTPS() {
|
||||
tlsConn := c.tlsClient(tcpConn, node)
|
||||
httpConn = tlsConn
|
||||
|
||||
// Force a handshake now (instead of waiting for it to
|
||||
// be done implicitly on read/write) so we can check
|
||||
// the ConnectionState.
|
||||
if err := tlsConn.Handshake(); err != nil {
|
||||
handshakeData, err := c.tlsHandshake(tcpConn, node)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// We expect to be using TLS 1.3 to our own servers, and only
|
||||
// starting at TLS 1.3 are the server's returned certificates
|
||||
// encrypted, so only look for and use our "meta cert" if we're
|
||||
// using TLS 1.3. If we're not using TLS 1.3, it might be a user
|
||||
// running cmd/derper themselves with a different configuration,
|
||||
// in which case we can avoid this fast-start optimization.
|
||||
// (If a corporate proxy is MITM'ing TLS 1.3 connections with
|
||||
// corp-mandated TLS root certs than all bets are off anyway.)
|
||||
// Note that we're not specifically concerned about TLS downgrade
|
||||
// attacks. TLS handles that fine:
|
||||
// https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html
|
||||
cs := tlsConn.ConnectionState()
|
||||
tlsState = &cs
|
||||
if cs.Version >= tls.VersionTLS13 {
|
||||
serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates)
|
||||
}
|
||||
httpConn = handshakeData.conn
|
||||
serverPub = handshakeData.serverPub
|
||||
serverProtoVersion = handshakeData.serverProtoVersion
|
||||
tlsState = &handshakeData.connectionState
|
||||
} else {
|
||||
httpConn = tcpConn
|
||||
}
|
||||
@ -510,6 +496,34 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
|
||||
// on Client that's like "here's the connect result and connGen for the
|
||||
// next connect that comes in"). Tracking bug for all this is:
|
||||
// https://github.com/tailscale/tailscale/issues/12724
|
||||
if !c.attemptingIdealConnection {
|
||||
c.attemptingIdealConnection = true
|
||||
|
||||
go func() {
|
||||
bo := backoff.NewBackoff("dial-node", c.logf, 24*time.Hour)
|
||||
|
||||
for {
|
||||
bo.BackOff(ctx, err)
|
||||
|
||||
// Exit if we're connected to the preferred
|
||||
c.mu.Lock()
|
||||
if c.preferred {
|
||||
c.mu.Unlock()
|
||||
break
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
err := c.tryConnectIdeal(ctx, node, "test")
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
c.mu.Lock()
|
||||
c.attemptingIdealConnection = false
|
||||
c.mu.Unlock()
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
if !serverPub.IsZero() && serverProtoVersion != 0 {
|
||||
@ -527,6 +541,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
|
||||
// No need to flush the HTTP request. the derp.Client's initial
|
||||
// client auth frame will flush it.
|
||||
} else {
|
||||
// TODO: this is the upgrade
|
||||
if err := req.Write(brw); err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
@ -581,6 +596,199 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
|
||||
return c.client, c.connGen, nil
|
||||
}
|
||||
|
||||
func (c *Client) tryConnectIdeal(ctx context.Context, node *tailcfg.DERPNode, caller string) (err error) {
|
||||
// timeout is the fallback maximum time (if ctx doesn't limit
|
||||
// it further) to do all of: DNS + TCP + TLS + HTTP Upgrade +
|
||||
// DERP upgrade.
|
||||
const timeout = 10 * time.Second
|
||||
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||
go func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// Either timeout fired (handled below), or
|
||||
// we're returning via the defer cancel()
|
||||
// below.
|
||||
case <-c.ctx.Done():
|
||||
// Propagate a Client.Close call into
|
||||
// cancelling this context.
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
defer cancel()
|
||||
|
||||
var tcpConn net.Conn
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
err = fmt.Errorf("%v: %v", ctx.Err(), err)
|
||||
}
|
||||
err = fmt.Errorf("%s connect to %v: %v", caller, node.HostName, err)
|
||||
if tcpConn != nil {
|
||||
go tcpConn.Close()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
tcpConn, err = c.dialNode(ctx, node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
//// More handshake? ////
|
||||
var httpConn net.Conn // a TCP conn or a TLS conn; what we speak HTTP to
|
||||
var serverPub key.NodePublic // or zero if unknown (if not using TLS or TLS middlebox eats it)
|
||||
var serverProtoVersion int
|
||||
var tlsState *tls.ConnectionState
|
||||
if c.useHTTPS() {
|
||||
handshakeData, err := c.tlsHandshake(tcpConn, node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
httpConn = handshakeData.conn
|
||||
serverPub = handshakeData.serverPub
|
||||
serverProtoVersion = handshakeData.serverProtoVersion
|
||||
tlsState = &handshakeData.connectionState
|
||||
} else {
|
||||
httpConn = tcpConn
|
||||
}
|
||||
//// More handshake? ////
|
||||
|
||||
// TODO: drop old here and get mutex
|
||||
c.closeForReconnect(c.client)
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
//// Upgrade ? ////
|
||||
brw := bufio.NewReadWriter(bufio.NewReader(httpConn), bufio.NewWriter(httpConn))
|
||||
var derpClient *derp.Client
|
||||
|
||||
req, err := http.NewRequest("GET", c.urlString(node), nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Upgrade", "DERP")
|
||||
req.Header.Set("Connection", "Upgrade")
|
||||
|
||||
if !serverPub.IsZero() && serverProtoVersion != 0 {
|
||||
// parseMetaCert found the server's public key (no TLS
|
||||
// middlebox was in the way), so skip the HTTP upgrade
|
||||
// exchange. See https://github.com/tailscale/tailscale/issues/693
|
||||
// for an overview. We still send the HTTP request
|
||||
// just to get routed into the server's HTTP Handler so it
|
||||
// can Hijack the request, but we signal with a special header
|
||||
// that we don't want to deal with its HTTP response.
|
||||
req.Header.Set(fastStartHeader, "1") // suppresses the server's HTTP response
|
||||
if err := req.Write(brw); err != nil {
|
||||
return err
|
||||
}
|
||||
// No need to flush the HTTP request. the derp.Client's initial
|
||||
// client auth frame will flush it.
|
||||
} else {
|
||||
// TODO: this is the upgrade
|
||||
if err := req.Write(brw); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := brw.Flush(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp, err := http.ReadResponse(brw.Reader, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if resp.StatusCode != http.StatusSwitchingProtocols {
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
return fmt.Errorf("GET failed: %v: %s", err, b)
|
||||
}
|
||||
}
|
||||
//// Upgrade ? ////
|
||||
|
||||
derpClient, err = derp.NewClient(c.privateKey, httpConn, brw, c.logf,
|
||||
derp.MeshKey(c.MeshKey),
|
||||
derp.ServerPublicKey(serverPub),
|
||||
derp.CanAckPings(c.canAckPings),
|
||||
derp.IsProber(c.IsProber),
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := derpClient.NotePreferred(true); err != nil {
|
||||
go httpConn.Close()
|
||||
return err
|
||||
}
|
||||
|
||||
if c.WatchConnectionChanges {
|
||||
if err := derpClient.WatchConnectionChanges(); err != nil {
|
||||
go httpConn.Close()
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
c.preferred = true
|
||||
|
||||
c.serverPubKey = derpClient.ServerPublicKey()
|
||||
c.client = derpClient
|
||||
c.netConn = tcpConn
|
||||
c.tlsState = tlsState
|
||||
c.connGen++
|
||||
|
||||
localAddr, _ := c.client.LocalAddr()
|
||||
c.atomicState.Store(ConnectedState{
|
||||
Connected: true,
|
||||
LocalAddr: localAddr,
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
type tlsHandshakeData struct {
|
||||
conn *tls.Conn
|
||||
connectionState tls.ConnectionState
|
||||
serverPub key.NodePublic // zero if unknown (if not using TLS or TLS middlebox eats it)
|
||||
serverProtoVersion int
|
||||
}
|
||||
|
||||
// tlsHandshake forces a TLS handshake and returns tlsHandshakeData
|
||||
// containing the TLS connection, connection state, and server metadata
|
||||
// on success.
|
||||
func (c *Client) tlsHandshake(tcpConn net.Conn, node *tailcfg.DERPNode) (*tlsHandshakeData, error) {
|
||||
tlsConn := c.tlsClient(tcpConn, node)
|
||||
|
||||
// Force a handshake now (instead of waiting for it to
|
||||
// be done implicitly on read/write) so we can check
|
||||
// the ConnectionState.
|
||||
if err := tlsConn.Handshake(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// We expect to be using TLS 1.3 to our own servers, and only
|
||||
// starting at TLS 1.3 are the server's returned certificates
|
||||
// encrypted, so only look for and use our "meta cert" if we're
|
||||
// using TLS 1.3. If we're not using TLS 1.3, it might be a user
|
||||
// running cmd/derper themselves with a different configuration,
|
||||
// in which case we can avoid this fast-start optimization.
|
||||
// (If a corporate proxy is MITM'ing TLS 1.3 connections with
|
||||
// corp-mandated TLS root certs than all bets are off anyway.)
|
||||
// Note that we're not specifically concerned about TLS downgrade
|
||||
// attacks. TLS handles that fine:
|
||||
// https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html
|
||||
cs := tlsConn.ConnectionState()
|
||||
var serverPub key.NodePublic
|
||||
var serverProtoVersion int
|
||||
if cs.Version >= tls.VersionTLS13 {
|
||||
serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates)
|
||||
}
|
||||
|
||||
return &tlsHandshakeData{
|
||||
conn: tlsConn,
|
||||
connectionState: cs,
|
||||
serverPub: serverPub,
|
||||
serverProtoVersion: serverProtoVersion,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// SetURLDialer sets the dialer to use for dialing URLs.
|
||||
// This dialer is only use for clients created with NewClient, not NewRegionClient.
|
||||
// If unset or nil, the default dialer is used.
|
||||
@ -1116,6 +1324,7 @@ func (c *Client) Close() error {
|
||||
// time and both calling closeForReconnect and the caller goroutines
|
||||
// forever calling closeForReconnect in lockstep endlessly;
|
||||
// https://github.com/tailscale/tailscale/pull/264)
|
||||
// TODO: this to close existing?
|
||||
func (c *Client) closeForReconnect(brokenClient *derp.Client) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user