Tom Meadows 5eb0b4be31
cmd/containerboot,cmd/k8s-proxy,kube: add authkey renewal to k8s-proxy (#19221)
* kube/authkey,cmd/containerboot: extract shared auth key reissue package

Move auth key reissue logic (set marker, wait for new key, clear marker,
read config) into a shared kube/authkey package and update containerboot
to use it. No behaviour change.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* kube/authkey,kube/state,cmd/containerboot: preserve device_id across restarts

Stop clearing device_id, device_fqdn, and device_ips from state on startup.
These keys are now preserved across restarts so the operator can track
device identity. Expand ClearReissueAuthKey to clear device state and
tailscaled profile data when performing a full auth key reissue.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/containerboot: use root context for auth key reissue wait

Pass the root context instead of bootCtx to setAndWaitForAuthKeyReissue.
The 60-second bootCtx timeout was cancelling the reissue wait before the
operator had time to respond, causing the pod to crash-loop.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/k8s-proxy: add auth key renewal support

Add auth key reissue handling to k8s-proxy, mirroring containerboot.
When the proxy detects an auth failure (login-state health warning or
NeedsLogin state), it disconnects from control, signals the operator
via the state Secret, waits for a new key, clears stale state, and
exits so Kubernetes restarts the pod with the new key.

A health watcher goroutine runs alongside ts.Up() to short-circuit
the startup timeout on terminal auth failures.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

---------

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>
2026-04-15 16:13:46 +01:00

162 lines
4.9 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
package main
import (
"context"
"fmt"
"strings"
"sync"
"time"
"go.uber.org/zap"
"tailscale.com/client/local"
"tailscale.com/health"
"tailscale.com/ipn"
"tailscale.com/kube/authkey"
"tailscale.com/kube/k8s-proxy/conf"
"tailscale.com/kube/kubeapi"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/tailcfg"
)
const k8sProxyFieldManager = "tailscale-k8s-proxy"
// resetState wipes stale k8s-proxy state from a previous run and writes
// fresh initial values, so the operator never acts on outdated state when a
// Pod is recreated.
//
// The reissue_authkey marker is removed only once the operator has acted on
// it — that is, when the configured auth key now differs from the key that
// was flagged for reissue.
func resetState(ctx context.Context, kc kubeclient.Client, stateSecretName string, podUID string, configAuthKey string) error {
	existing, err := kc.GetSecret(ctx, stateSecretName)
	if kubeclient.IsNotFoundErr(err) {
		// No prior state Secret; nothing to reset.
		return nil
	}
	if err != nil {
		return fmt.Errorf("failed to read state Secret %q to reset state: %w", stateSecretName, err)
	}
	patch := &kubeapi.Secret{
		Data: map[string][]byte{
			kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
		},
	}
	if podUID != "" {
		patch.Data[kubetypes.KeyPodUID] = []byte(podUID)
	}
	// Clear reissue_authkey only when the operator has already swapped in a
	// key different from the one recorded as broken (a nil value removes the
	// key via the strategic merge patch).
	if broken, present := existing.Data[kubetypes.KeyReissueAuthkey]; present && configAuthKey != "" && string(broken) != configAuthKey {
		patch.Data[kubetypes.KeyReissueAuthkey] = nil
	}
	return kc.StrategicMergePatchSecret(ctx, stateSecretName, patch, k8sProxyFieldManager)
}
// needsAuthKeyReissue reports whether the backend state or any health
// warning indicates a terminal auth failure, meaning a replacement auth key
// must be requested from the operator.
func needsAuthKeyReissue(backendState string, healthWarnings []string) bool {
	// NeedsLogin is an unambiguous terminal auth state.
	if backendState == ipn.NeedsLogin.String() {
		return true
	}
	// Otherwise look for the login-state warnable code embedded anywhere in
	// the warning strings.
	code := string(health.LoginStateWarnable.Code)
	for _, warning := range healthWarnings {
		if !strings.Contains(warning, code) {
			continue
		}
		return true
	}
	return false
}
// checkInitialAuthState queries the local client for its current status and
// reports whether the server came up in an auth-failure state that requires
// an auth key reissue.
func checkInitialAuthState(ctx context.Context, lc *local.Client) (bool, error) {
	st, err := lc.Status(ctx)
	if err != nil {
		return false, fmt.Errorf("error getting status: %w", err)
	}
	return needsAuthKeyReissue(st.BackendState, st.Health), nil
}
// monitorAuthHealth watches the IPN bus for a login-state health warning and
// signals reissueCh when one appears. It returns once a failure has been
// signalled, the context is cancelled, or the bus watcher errors out.
func monitorAuthHealth(ctx context.Context, lc *local.Client, reissueCh chan<- struct{}, logger *zap.SugaredLogger) error {
	watcher, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialHealthState)
	if err != nil {
		return fmt.Errorf("failed to watch IPN bus for auth health: %w", err)
	}
	defer watcher.Close()
	for ctx.Err() == nil {
		notifyMsg, err := watcher.Next()
		if err != nil {
			return err
		}
		if notifyMsg.Health == nil {
			continue
		}
		if _, failing := notifyMsg.Health.Warnings[health.LoginStateWarnable.Code]; !failing {
			continue
		}
		logger.Info("Auth key failed to authenticate (may be expired or single-use), requesting new key from operator")
		// Hand off to the reissue flow, bailing out if the context dies
		// before the receiver picks up the signal.
		select {
		case reissueCh <- struct{}{}:
		case <-ctx.Done():
		}
		return nil
	}
	return ctx.Err()
}
// handleAuthKeyReissue orchestrates the auth key reissue flow:
//  1. Disconnect from control.
//  2. Set the reissue marker in the state Secret.
//  3. Wait for the operator to provide a new key via cfgChan.
//  4. Return so the caller can exit (Kubernetes restarts the pod with the
//     new key).
func handleAuthKeyReissue(ctx context.Context, lc *local.Client, kc kubeclient.Client, stateSecretName string, currentAuthKey string, cfgChan <-chan *conf.Config, logger *zap.SugaredLogger) error {
	if err := lc.DisconnectControl(ctx); err != nil {
		return fmt.Errorf("error disconnecting from control: %w", err)
	}
	if err := authkey.SetReissueAuthKey(ctx, kc, stateSecretName, currentAuthKey, k8sProxyFieldManager); err != nil {
		return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err)
	}
	var (
		keyMu     sync.Mutex
		newestKey string
	)
	keyUpdated := make(chan struct{}, 1)
	// Consume config updates in a goroutine so the shared wait helper stays
	// unaware of conf.Config.
	go func() {
		for c := range cfgChan {
			if c.Parsed.AuthKey == nil {
				continue
			}
			keyMu.Lock()
			newestKey = *c.Parsed.AuthKey
			keyMu.Unlock()
			// Coalesce: one pending notification is sufficient.
			select {
			case keyUpdated <- struct{}{}:
			default:
			}
		}
	}()
	readKey := func() string {
		keyMu.Lock()
		defer keyMu.Unlock()
		return newestKey
	}
	clearState := func(ctx context.Context) error {
		return authkey.ClearReissueAuthKey(ctx, kc, stateSecretName, k8sProxyFieldManager)
	}
	return authkey.WaitForAuthKeyReissue(ctx, currentAuthKey, 10*time.Minute, readKey, clearState, keyUpdated)
}