diff --git a/client/tailscale/localclient.go b/client/tailscale/localclient.go index 9c2bcc467..1bed54174 100644 --- a/client/tailscale/localclient.go +++ b/client/tailscale/localclient.go @@ -1327,6 +1327,17 @@ func (lc *LocalClient) SetServeConfig(ctx context.Context, config *ipn.ServeConf return nil } +// LameDuck shuts down all connections to control, thus making control consider this node inactive. This can be run on +// HA subnet router or app connector replicas before shutting them down to ensure peers get told to switch over to +// another replica whilst there is still some grace period for the existing connections to terminate. It issues a POST to the /localapi/v0/lameduck endpoint and returns any failure wrapped in an error. +func (lc *LocalClient) LameDuck(ctx context.Context) error { + _, _, err := lc.sendWithHeaders(ctx, "POST", "/localapi/v0/lameduck", 200, nil, nil) + if err != nil { + return fmt.Errorf("error enabling lameduck mode: %w", err) + } + return nil +} + // NetworkLockDisable shuts down network-lock across the tailnet. func (lc *LocalClient) NetworkLockDisable(ctx context.Context, secret []byte) error { if _, err := lc.send(ctx, "POST", "/localapi/v0/tka/disable", 200, bytes.NewReader(secret)); err != nil { diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index b91f1337a..b0a20cdcb 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -780,6 +780,19 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() { b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest())) } +// LameDuck shuts down the control client. This can be run before node shutdown to force control to consider this node +// inactive. This can be used to ensure that, when nodes that are HA subnet router or app connector replicas are shutting +// down, clients switch over to other replicas whilst the existing connections are kept alive for some period of time. NOTE(review): cc.Shutdown() is invoked while b.mu is still held (deferred unlock) - confirm that resetControlClientLocked permits shutting down under the lock. 
+func (b *LocalBackend) LameDuck() { + b.mu.Lock() + defer b.mu.Unlock() + cc := b.resetControlClientLocked() + if cc == nil { + return + } + cc.Shutdown() +} + // captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken // before running captive portal detection. const captivePortalDetectionInterval = 2 * time.Second diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index 0d41725d8..a73977877 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -108,6 +108,7 @@ var handler = map[string]localAPIHandler{ "goroutines": (*Handler).serveGoroutines, "handle-push-message": (*Handler).serveHandlePushMessage, "id-token": (*Handler).serveIDToken, + "lameduck": (*Handler).lameDuck, "login-interactive": (*Handler).serveLoginInteractive, "logout": (*Handler).serveLogout, "logtap": (*Handler).serveLogTap, @@ -952,6 +953,22 @@ func (h *Handler) servePprof(w http.ResponseWriter, r *http.Request) { servePprofFunc(w, r) } +// lameDuck is the handler for the local API /lameduck endpoint that shuts down the control client, so that the node no longer +// communicates with control. Doing this makes control consider this node inactive. This can be used before shutting +// down a replica of HA subnet router or app connector deployments to ensure that control tells the peers to switch +// over to another replica whilst still maintaining the existing peer connections. It rejects callers without write permission (403) and any method other than POST (405). +func (h *Handler) lameDuck(w http.ResponseWriter, r *http.Request) { + if !h.PermitWrite { + http.Error(w, "access denied", http.StatusForbidden) + return + } + if r.Method != httpm.POST { + http.Error(w, "use POST", http.StatusMethodNotAllowed) + return + } + h.b.LameDuck() +} + func (h *Handler) reloadConfig(w http.ResponseWriter, r *http.Request) { if !h.PermitWrite { http.Error(w, "access denied", http.StatusForbidden)