diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go index c95d072b1..198bafb5f 100644 --- a/derp/derphttp/derphttp_client.go +++ b/derp/derphttp/derphttp_client.go @@ -32,6 +32,7 @@ import ( "tailscale.com/derp" "tailscale.com/envknob" "tailscale.com/health" + "tailscale.com/logtail/backoff" "tailscale.com/net/dnscache" "tailscale.com/net/netmon" "tailscale.com/net/netns" @@ -87,19 +88,20 @@ type Client struct { // Client.conn holds mu. addrFamSelAtomic syncs.AtomicValue[AddressFamilySelector] - mu sync.Mutex - atomicState syncs.AtomicValue[ConnectedState] // hold mu to write - started bool // true upon first connect, never transitions to false - preferred bool - canAckPings bool - closed bool - netConn io.Closer - client *derp.Client - connGen int // incremented once per new connection; valid values are >0 - serverPubKey key.NodePublic - tlsState *tls.ConnectionState - pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong - clock tstime.Clock + mu sync.Mutex + atomicState syncs.AtomicValue[ConnectedState] // hold mu to write + started bool // true upon first connect, never transitions to false + preferred bool + canAckPings bool + closed bool + netConn io.Closer + client *derp.Client + connGen int // incremented once per new connection; valid values are >0 + serverPubKey key.NodePublic + tlsState *tls.ConnectionState + pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong + clock tstime.Clock + attemptingIdealConnection bool } // ConnectedState describes the state of a derphttp Client. @@ -325,6 +327,7 @@ func useWebsockets() bool { return false } +// TODO: split this into 2 methods so we can do like a try-connect ype thing? func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) { c.mu.Lock() defer c.mu.Unlock() @@ -460,32 +463,15 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien var serverProtoVersion int var tlsState *tls.ConnectionState if c.useHTTPS() { - tlsConn := c.tlsClient(tcpConn, node) - httpConn = tlsConn - - // Force a handshake now (instead of waiting for it to - // be done implicitly on read/write) so we can check - // the ConnectionState. - if err := tlsConn.Handshake(); err != nil { + handshakeData, err := c.tlsHandshake(tcpConn, node) + if err != nil { return nil, 0, err } - // We expect to be using TLS 1.3 to our own servers, and only - // starting at TLS 1.3 are the server's returned certificates - // encrypted, so only look for and use our "meta cert" if we're - // using TLS 1.3. If we're not using TLS 1.3, it might be a user - // running cmd/derper themselves with a different configuration, - // in which case we can avoid this fast-start optimization. - // (If a corporate proxy is MITM'ing TLS 1.3 connections with - // corp-mandated TLS root certs than all bets are off anyway.) - // Note that we're not specifically concerned about TLS downgrade - // attacks. TLS handles that fine: - // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html - cs := tlsConn.ConnectionState() - tlsState = &cs - if cs.Version >= tls.VersionTLS13 { - serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates) - } + httpConn = handshakeData.conn + serverPub = handshakeData.serverPub + serverProtoVersion = handshakeData.serverProtoVersion + tlsState = &handshakeData.connectionState } else { httpConn = tcpConn } @@ -510,6 +496,34 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien // on Client that's like "here's the connect result and connGen for the // next connect that comes in"). Tracking bug for all this is: // https://github.com/tailscale/tailscale/issues/12724 + if !c.attemptingIdealConnection { + c.attemptingIdealConnection = true + + go func() { + bo := backoff.NewBackoff("dial-node", c.logf, 24*time.Hour) + + for { + bo.BackOff(ctx, err) + + // Exit if we're connected to the preferred + c.mu.Lock() + if c.preferred { + c.mu.Unlock() + break + } + c.mu.Unlock() + + err := c.tryConnectIdeal(ctx, node, "test") + if err == nil { + break + } + } + + c.mu.Lock() + c.attemptingIdealConnection = false + c.mu.Unlock() + }() + } } if !serverPub.IsZero() && serverProtoVersion != 0 { @@ -527,6 +541,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien // No need to flush the HTTP request. the derp.Client's initial // client auth frame will flush it. } else { + // TODO: this is the upgrade if err := req.Write(brw); err != nil { return nil, 0, err } @@ -581,6 +596,199 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien return c.client, c.connGen, nil } +func (c *Client) tryConnectIdeal(ctx context.Context, node *tailcfg.DERPNode, caller string) (err error) { + // timeout is the fallback maximum time (if ctx doesn't limit + // it further) to do all of: DNS + TCP + TLS + HTTP Upgrade + + // DERP upgrade. + const timeout = 10 * time.Second + ctx, cancel := context.WithTimeout(ctx, timeout) + go func() { + select { + case <-ctx.Done(): + // Either timeout fired (handled below), or + // we're returning via the defer cancel() + // below. + case <-c.ctx.Done(): + // Propagate a Client.Close call into + // cancelling this context. + cancel() + } + }() + defer cancel() + + var tcpConn net.Conn + + defer func() { + if err != nil { + if ctx.Err() != nil { + err = fmt.Errorf("%v: %v", ctx.Err(), err) + } + err = fmt.Errorf("%s connect to %v: %v", caller, node.HostName, err) + if tcpConn != nil { + go tcpConn.Close() + } + } + }() + + tcpConn, err = c.dialNode(ctx, node) + if err != nil { + return err + } + + //// More handshake? //// + var httpConn net.Conn // a TCP conn or a TLS conn; what we speak HTTP to + var serverPub key.NodePublic // or zero if unknown (if not using TLS or TLS middlebox eats it) + var serverProtoVersion int + var tlsState *tls.ConnectionState + if c.useHTTPS() { + handshakeData, err := c.tlsHandshake(tcpConn, node) + if err != nil { + return err + } + + httpConn = handshakeData.conn + serverPub = handshakeData.serverPub + serverProtoVersion = handshakeData.serverProtoVersion + tlsState = &handshakeData.connectionState + } else { + httpConn = tcpConn + } + //// More handshake? //// + + // TODO: drop old here and get mutex + c.closeForReconnect(c.client) + c.mu.Lock() + defer c.mu.Unlock() + + //// Upgrade ? //// + brw := bufio.NewReadWriter(bufio.NewReader(httpConn), bufio.NewWriter(httpConn)) + var derpClient *derp.Client + + req, err := http.NewRequest("GET", c.urlString(node), nil) + if err != nil { + return err + } + req.Header.Set("Upgrade", "DERP") + req.Header.Set("Connection", "Upgrade") + + if !serverPub.IsZero() && serverProtoVersion != 0 { + // parseMetaCert found the server's public key (no TLS + // middlebox was in the way), so skip the HTTP upgrade + // exchange. See https://github.com/tailscale/tailscale/issues/693 + // for an overview. We still send the HTTP request + // just to get routed into the server's HTTP Handler so it + // can Hijack the request, but we signal with a special header + // that we don't want to deal with its HTTP response. + req.Header.Set(fastStartHeader, "1") // suppresses the server's HTTP response + if err := req.Write(brw); err != nil { + return err + } + // No need to flush the HTTP request. the derp.Client's initial + // client auth frame will flush it. + } else { + // TODO: this is the upgrade + if err := req.Write(brw); err != nil { + return err + } + if err := brw.Flush(); err != nil { + return err + } + + resp, err := http.ReadResponse(brw.Reader, req) + if err != nil { + return err + } + if resp.StatusCode != http.StatusSwitchingProtocols { + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("GET failed: %v: %s", err, b) + } + } + //// Upgrade ? //// + + derpClient, err = derp.NewClient(c.privateKey, httpConn, brw, c.logf, + derp.MeshKey(c.MeshKey), + derp.ServerPublicKey(serverPub), + derp.CanAckPings(c.canAckPings), + derp.IsProber(c.IsProber), + ) + if err != nil { + return err + } + if err := derpClient.NotePreferred(true); err != nil { + go httpConn.Close() + return err + } + + if c.WatchConnectionChanges { + if err := derpClient.WatchConnectionChanges(); err != nil { + go httpConn.Close() + return err + } + } + + c.preferred = true + + c.serverPubKey = derpClient.ServerPublicKey() + c.client = derpClient + c.netConn = tcpConn + c.tlsState = tlsState + c.connGen++ + + localAddr, _ := c.client.LocalAddr() + c.atomicState.Store(ConnectedState{ + Connected: true, + LocalAddr: localAddr, + }) + return nil +} + +type tlsHandshakeData struct { + conn *tls.Conn + connectionState tls.ConnectionState + serverPub key.NodePublic // zero if unknown (if not using TLS or TLS middlebox eats it) + serverProtoVersion int +} + +// tlsHandshake forces a TLS handshake and returns tlsHandshakeData +// containing the TLS connection, connection state, and server metadata +// on success. +func (c *Client) tlsHandshake(tcpConn net.Conn, node *tailcfg.DERPNode) (*tlsHandshakeData, error) { + tlsConn := c.tlsClient(tcpConn, node) + + // Force a handshake now (instead of waiting for it to + // be done implicitly on read/write) so we can check + // the ConnectionState. + if err := tlsConn.Handshake(); err != nil { + return nil, err + } + + // We expect to be using TLS 1.3 to our own servers, and only + // starting at TLS 1.3 are the server's returned certificates + // encrypted, so only look for and use our "meta cert" if we're + // using TLS 1.3. If we're not using TLS 1.3, it might be a user + // running cmd/derper themselves with a different configuration, + // in which case we can avoid this fast-start optimization. + // (If a corporate proxy is MITM'ing TLS 1.3 connections with + // corp-mandated TLS root certs than all bets are off anyway.) + // Note that we're not specifically concerned about TLS downgrade + // attacks. TLS handles that fine: + // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html + cs := tlsConn.ConnectionState() + var serverPub key.NodePublic + var serverProtoVersion int + if cs.Version >= tls.VersionTLS13 { + serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates) + } + + return &tlsHandshakeData{ + conn: tlsConn, + connectionState: cs, + serverPub: serverPub, + serverProtoVersion: serverProtoVersion, + }, nil +} + // SetURLDialer sets the dialer to use for dialing URLs. // This dialer is only use for clients created with NewClient, not NewRegionClient. // If unset or nil, the default dialer is used. @@ -1116,6 +1324,7 @@ func (c *Client) Close() error { // time and both calling closeForReconnect and the caller goroutines // forever calling closeForReconnect in lockstep endlessly; // https://github.com/tailscale/tailscale/pull/264) +// TODO: this to close existing? func (c *Client) closeForReconnect(brokenClient *derp.Client) { c.mu.Lock() defer c.mu.Unlock()