diff --git a/cmd/containerboot/kube.go b/cmd/containerboot/kube.go index 73f5819b4..3e97710da 100644 --- a/cmd/containerboot/kube.go +++ b/cmd/containerboot/kube.go @@ -21,6 +21,7 @@ import ( "github.com/fsnotify/fsnotify" "tailscale.com/client/local" "tailscale.com/ipn" + "tailscale.com/kube/authkey" "tailscale.com/kube/egressservices" "tailscale.com/kube/ingressservices" "tailscale.com/kube/kubeapi" @@ -32,7 +33,6 @@ import ( ) const fieldManager = "tailscale-container" -const kubeletMountedConfigLn = "..data" // kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use // this rather than any of the upstream Kubernetes client libaries to avoid extra imports. @@ -127,6 +127,9 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error { // resetContainerbootState resets state from previous runs of containerboot to // ensure the operator doesn't use stale state when a Pod is first recreated. +// +// Device identity keys (device_id, device_fqdn, device_ips) are preserved so +// the operator can clean up the old device from the control plane. func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string, tailscaledConfigAuthkey string) error { existingSecret, err := kc.GetSecret(ctx, kc.stateSecret) switch { @@ -139,12 +142,7 @@ func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string s := &kubeapi.Secret{ Data: map[string][]byte{ - kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), - - // TODO(tomhjp): Perhaps shouldn't clear device ID and use a different signal, as this could leak tailnet devices. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -169,47 +167,18 @@ func (kc *kubeClient) setAndWaitForAuthKeyReissue(ctx context.Context, client *l return fmt.Errorf("error disconnecting from control: %w", err) } - err = kc.setReissueAuthKey(ctx, tailscaledConfigAuthKey) + err = authkey.SetReissueAuthKey(ctx, kc.Client, kc.stateSecret, tailscaledConfigAuthKey, authkey.TailscaleContainerFieldManager) if err != nil { return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err) } - err = kc.waitForAuthKeyReissue(ctx, cfg.TailscaledConfigFilePath, tailscaledConfigAuthKey, 10*time.Minute) - if err != nil { - return fmt.Errorf("failed to receive new auth key: %w", err) + clearFn := func(ctx context.Context) error { + return authkey.ClearReissueAuthKey(ctx, kc.Client, kc.stateSecret, authkey.TailscaleContainerFieldManager) } - return nil -} - -func (kc *kubeClient) setReissueAuthKey(ctx context.Context, authKey string) error { - s := &kubeapi.Secret{ - Data: map[string][]byte{ - kubetypes.KeyReissueAuthkey: []byte(authKey), - }, - } - - log.Printf("Requesting a new auth key from operator") - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) -} - -func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath string, oldAuthKey string, maxWait time.Duration) error { - log.Printf("Waiting for operator to provide new auth key (max wait: %v)", maxWait) - - ctx, cancel := context.WithTimeout(ctx, maxWait) - defer cancel() - - tailscaledCfgDir := filepath.Dir(configPath) - toWatch := filepath.Join(tailscaledCfgDir, kubeletMountedConfigLn) - - var ( - pollTicker <-chan time.Time - 
eventChan <-chan fsnotify.Event - ) - - pollInterval := 5 * time.Second - - // Try to use fsnotify for faster notification + getAuthKey := func() string { return authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath) } + tailscaledCfgDir := filepath.Dir(cfg.TailscaledConfigFilePath) + var notify <-chan struct{} if w, err := fsnotify.NewWatcher(); err != nil { log.Printf("auth key reissue: fsnotify unavailable, using polling: %v", err) } else if err := w.Add(tailscaledCfgDir); err != nil { @@ -217,54 +186,28 @@ func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath stri log.Printf("auth key reissue: fsnotify watch failed, using polling: %v", err) } else { defer w.Close() + ch := make(chan struct{}, 1) + toWatch := filepath.Join(tailscaledCfgDir, "..data") + go func() { + for ev := range w.Events { + if ev.Name == toWatch { + select { + case ch <- struct{}{}: + default: + } + } + } + }() + notify = ch log.Printf("auth key reissue: watching for config changes via fsnotify") - eventChan = w.Events } - // still keep polling if using fsnotify, for logging and in case fsnotify fails - pt := time.NewTicker(pollInterval) - defer pt.Stop() - pollTicker = pt.C - - start := time.Now() - - for { - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for auth key reissue after %v", maxWait) - case <-pollTicker: // Waits for polling tick, continues when received - case event := <-eventChan: - if event.Name != toWatch { - continue - } - } - - newAuthKey := authkeyFromTailscaledConfig(configPath) - if newAuthKey != "" && newAuthKey != oldAuthKey { - log.Printf("New auth key received from operator after %v", time.Since(start).Round(time.Second)) - - if err := kc.clearReissueAuthKeyRequest(ctx); err != nil { - log.Printf("Warning: failed to clear reissue request: %v", err) - } - - return nil - } - - if eventChan == nil && pollTicker != nil { - log.Printf("Waiting for new auth key from operator (%v elapsed)", time.Since(start).Round(time.Second)) - } + err = authkey.WaitForAuthKeyReissue(ctx, tailscaledConfigAuthKey, 10*time.Minute, getAuthKey, clearFn, notify) + if err != nil { + return fmt.Errorf("failed to receive new auth key: %w", err) } -} -// clearReissueAuthKeyRequest removes the reissue_authkey marker from the Secret -// to signal to the operator that we've successfully received the new key. -func (kc *kubeClient) clearReissueAuthKeyRequest(ctx context.Context) error { - s := &kubeapi.Secret{ - Data: map[string][]byte{ - kubetypes.KeyReissueAuthkey: nil, - }, - } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) + return nil } // waitForConsistentState waits for tailscaled to finish writing state if it diff --git a/cmd/containerboot/kube_test.go b/cmd/containerboot/kube_test.go index b2e89a36c..fec0b74f7 100644 --- a/cmd/containerboot/kube_test.go +++ b/cmd/containerboot/kube_test.go @@ -257,12 +257,8 @@ func TestResetContainerbootState(t *testing.T) { authkey: "new-authkey", initial: map[string][]byte{}, expected: map[string][]byte{ - kubetypes.KeyCapVer: capver, - kubetypes.KeyPodUID: []byte("1234"), - // Cleared keys. 
- kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: capver, + kubetypes.KeyPodUID: []byte("1234"), kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -271,11 +267,7 @@ func TestResetContainerbootState(t *testing.T) { "empty_initial_no_pod_uid": { initial: map[string][]byte{}, expected: map[string][]byte{ - kubetypes.KeyCapVer: capver, - // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyCapVer: capver, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -303,9 +295,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, kubetypes.KeyPodUID: []byte("1234"), // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -321,9 +310,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, kubetypes.KeyReissueAuthkey: nil, // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -338,9 +324,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, // reissue_authkey not cleared. // Cleared keys. - kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, @@ -355,9 +338,6 @@ func TestResetContainerbootState(t *testing.T) { kubetypes.KeyCapVer: capver, // reissue_authkey not cleared. // Cleared keys. 
- kubetypes.KeyDeviceID: nil, - kubetypes.KeyDeviceFQDN: nil, - kubetypes.KeyDeviceIPs: nil, kubetypes.KeyHTTPSEndpoint: nil, egressservices.KeyEgressServices: nil, ingressservices.IngressConfigKey: nil, diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index e80192a31..12a274507 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -139,8 +139,8 @@ import ( "tailscale.com/health" "tailscale.com/ipn" - "tailscale.com/ipn/conffile" kubeutils "tailscale.com/k8s-operator" + "tailscale.com/kube/authkey" healthz "tailscale.com/kube/health" "tailscale.com/kube/kubetypes" klc "tailscale.com/kube/localclient" @@ -209,7 +209,7 @@ func run() error { var tailscaledConfigAuthkey string if isOneStepConfig(cfg) { - tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath) + tailscaledConfigAuthkey = authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath) } var kc *kubeClient @@ -374,7 +374,7 @@ authLoop: if hasKubeStateStore(cfg) { log.Printf("Auth key missing or invalid (NeedsLogin state), disconnecting from control and requesting new key from operator") - err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey) + err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey) if err != nil { return fmt.Errorf("failed to get a reissued authkey: %w", err) } @@ -414,7 +414,7 @@ authLoop: if isOneStepConfig(cfg) && hasKubeStateStore(cfg) { log.Printf("Auth key failed to authenticate (may be expired or single-use), disconnecting from control and requesting new key from operator") - err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey) + err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey) if err != nil { return fmt.Errorf("failed to get a reissued authkey: %w", err) } @@ -1024,11 +1024,3 @@ func serviceIPsFromNetMap(nm *netmap.NetworkMap, fqdn dnsname.FQDN) []netip.Pref return prefixes } - -func authkeyFromTailscaledConfig(path string) string { - if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil { - return *cfg.Parsed.AuthKey - } - - return "" -} diff --git a/cmd/k8s-proxy/k8s-proxy.go b/cmd/k8s-proxy/k8s-proxy.go index 38a86a5e0..673493f58 100644 --- a/cmd/k8s-proxy/k8s-proxy.go +++ b/cmd/k8s-proxy/k8s-proxy.go @@ -31,6 +31,7 @@ import ( "k8s.io/utils/strings/slices" "tailscale.com/client/local" "tailscale.com/cmd/k8s-proxy/internal/config" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/ipn/store" @@ -41,6 +42,7 @@ import ( "tailscale.com/kube/certs" healthz "tailscale.com/kube/health" "tailscale.com/kube/k8s-proxy/conf" + "tailscale.com/kube/kubeclient" "tailscale.com/kube/kubetypes" klc "tailscale.com/kube/localclient" "tailscale.com/kube/metrics" @@ -171,10 +173,31 @@ func run(logger *zap.SugaredLogger) error { // If Pod UID unset, assume we're running outside of a cluster/not managed // by the operator, so no need to set additional state keys. 
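+	// kc and stateSecretName are populated below when tailscaled state is
+	// stored in a kube Secret. They are later used to reset stale state from
+	// previous runs and to request a reissued auth key from the operator.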
+ var kc kubeclient.Client + var stateSecretName string if podUID != "" { if err := state.SetInitialKeys(st, podUID); err != nil { return fmt.Errorf("error setting initial state: %w", err) } + + if cfg.Parsed.State != nil { + if name, ok := strings.CutPrefix(*cfg.Parsed.State, "kube:"); ok { + stateSecretName = name + + kc, err = kubeclient.New(k8sProxyFieldManager) + if err != nil { + return err + } + + var configAuthKey string + if cfg.Parsed.AuthKey != nil { + configAuthKey = *cfg.Parsed.AuthKey + } + if err := resetState(ctx, kc, stateSecretName, podUID, configAuthKey); err != nil { + return fmt.Errorf("error resetting state: %w", err) + } + } + } } var authKey string @@ -197,23 +220,69 @@ func run(logger *zap.SugaredLogger) error { ts.Hostname = *cfg.Parsed.Hostname } - // Make sure we crash loop if Up doesn't complete in reasonable time. - upCtx, upCancel := context.WithTimeout(ctx, time.Minute) - defer upCancel() - if _, err := ts.Up(upCtx); err != nil { - return fmt.Errorf("error starting tailscale server: %w", err) - } - defer ts.Close() lc, err := ts.LocalClient() if err != nil { return fmt.Errorf("error getting local client: %w", err) } - // Setup for updating state keys. + // Make sure we crash loop if Up doesn't complete in reasonable time. + upCtx, upCancel := context.WithTimeout(ctx, 30*time.Second) + defer upCancel() + + // ts.Up() deliberately ignores NeedsLogin because it fires transiently + // during normal auth-key login. We can watch for the login-state health + // warning here though, which only fires on terminal auth failure, and + // cancel early. + go func() { + w, err := lc.WatchIPNBus(upCtx, ipn.NotifyInitialHealthState) + if err != nil { + return + } + defer w.Close() + for { + n, err := w.Next() + if err != nil { + logger.Debugf("failed to process message from ipn bus: %s", err.Error()) + return + } + if n.Health != nil { + if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok { + upCancel() + return + } + } + } + }() + + if _, err := ts.Up(upCtx); err != nil { + if kc != nil && stateSecretName != "" { + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) + } + return err + } + + defer ts.Close() + + reissueCh := make(chan struct{}, 1) if podUID != "" { group.Go(func() error { return state.KeepKeysUpdated(ctx, st, klc.New(lc)) }) + + if kc != nil && stateSecretName != "" { + needsReissue, err := checkInitialAuthState(ctx, lc) + if err != nil { + return fmt.Errorf("error checking initial auth state: %w", err) + } + if needsReissue { + logger.Info("Auth key missing or invalid after startup, requesting new key from operator") + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) + } + + group.Go(func() error { + return monitorAuthHealth(ctx, lc, reissueCh, logger) + }) + } } if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) { @@ -362,6 +431,8 @@ func run(logger *zap.SugaredLogger) error { } cfgLogger.Infof("Config reloaded") + case <-reissueCh: + return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger) } } } diff --git a/cmd/k8s-proxy/kube.go b/cmd/k8s-proxy/kube.go new file mode 100644 index 000000000..1d9348f1a --- /dev/null +++ b/cmd/k8s-proxy/kube.go @@ -0,0 +1,161 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +package main + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + "go.uber.org/zap" + "tailscale.com/client/local" + 
"tailscale.com/health" + "tailscale.com/ipn" + "tailscale.com/kube/authkey" + "tailscale.com/kube/k8s-proxy/conf" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" + "tailscale.com/tailcfg" +) + +const k8sProxyFieldManager = "tailscale-k8s-proxy" + +// resetState clears k8s-proxy state from previous runs and sets +// initial values. This ensures the operator doesn't use stale state when a Pod +// is first recreated. +// +// It also clears the reissue_authkey marker if the operator has actioned it +// (i.e., the config now has a different auth key than what was marked for +// reissue). +func resetState(ctx context.Context, kc kubeclient.Client, stateSecretName string, podUID string, configAuthKey string) error { + existingSecret, err := kc.GetSecret(ctx, stateSecretName) + switch { + case kubeclient.IsNotFoundErr(err): + return nil + case err != nil: + return fmt.Errorf("failed to read state Secret %q to reset state: %w", stateSecretName, err) + } + + s := &kubeapi.Secret{ + Data: map[string][]byte{ + kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), + }, + } + if podUID != "" { + s.Data[kubetypes.KeyPodUID] = []byte(podUID) + } + + // Only clear reissue_authkey if the operator has actioned it. + brokenAuthkey, ok := existingSecret.Data[kubetypes.KeyReissueAuthkey] + if ok && configAuthKey != "" && string(brokenAuthkey) != configAuthKey { + s.Data[kubetypes.KeyReissueAuthkey] = nil + } + + return kc.StrategicMergePatchSecret(ctx, stateSecretName, s, k8sProxyFieldManager) +} + +// needsAuthKeyReissue reports whether the given backend state and health +// warnings indicate a terminal auth failure requiring a new key from the +// operator. +func needsAuthKeyReissue(backendState string, healthWarnings []string) bool { + if backendState == ipn.NeedsLogin.String() { + return true + } + loginWarnableCode := string(health.LoginStateWarnable.Code) + for _, h := range healthWarnings { + if strings.Contains(h, loginWarnableCode) { + return true + } + } + return false +} + +// checkInitialAuthState checks if the tsnet server is in an auth failure state +// immediately after coming up. Returns true if auth key reissue is needed. +func checkInitialAuthState(ctx context.Context, lc *local.Client) (bool, error) { + status, err := lc.Status(ctx) + if err != nil { + return false, fmt.Errorf("error getting status: %w", err) + } + return needsAuthKeyReissue(status.BackendState, status.Health), nil +} + +// monitorAuthHealth watches the IPN bus for auth failures and triggers reissue +// when needed. Runs until context is cancelled or auth failure is detected. +func monitorAuthHealth(ctx context.Context, lc *local.Client, reissueCh chan<- struct{}, logger *zap.SugaredLogger) error { + w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialHealthState) + if err != nil { + return fmt.Errorf("failed to watch IPN bus for auth health: %w", err) + } + defer w.Close() + + for { + if ctx.Err() != nil { + return ctx.Err() + } + n, err := w.Next() + if err != nil { + return err + } + if n.Health != nil { + if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok { + logger.Info("Auth key failed to authenticate (may be expired or single-use), requesting new key from operator") + select { + case reissueCh <- struct{}{}: + case <-ctx.Done(): + } + return nil + } + } + } +} + +// handleAuthKeyReissue orchestrates the auth key reissue flow: +// 1. Disconnect from control +// 2. Set reissue marker in state Secret +// 3. 
Wait for operator to provide new key +// 4. Exit cleanly (Kubernetes will restart the pod with the new key) +func handleAuthKeyReissue(ctx context.Context, lc *local.Client, kc kubeclient.Client, stateSecretName string, currentAuthKey string, cfgChan <-chan *conf.Config, logger *zap.SugaredLogger) error { + if err := lc.DisconnectControl(ctx); err != nil { + return fmt.Errorf("error disconnecting from control: %w", err) + } + if err := authkey.SetReissueAuthKey(ctx, kc, stateSecretName, currentAuthKey, k8sProxyFieldManager); err != nil { + return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err) + } + + var mu sync.Mutex + var latestAuthKey string + notify := make(chan struct{}, 1) + + // we use this go func to abstract away conf.Config from the shared function + go func() { + for cfg := range cfgChan { + if cfg.Parsed.AuthKey != nil { + mu.Lock() + latestAuthKey = *cfg.Parsed.AuthKey + mu.Unlock() + select { + case notify <- struct{}{}: + default: + } + } + } + }() + + getAuthKey := func() string { + mu.Lock() + defer mu.Unlock() + return latestAuthKey + } + clearFn := func(ctx context.Context) error { + return authkey.ClearReissueAuthKey(ctx, kc, stateSecretName, k8sProxyFieldManager) + } + + return authkey.WaitForAuthKeyReissue(ctx, currentAuthKey, 10*time.Minute, getAuthKey, clearFn, notify) +} diff --git a/cmd/k8s-proxy/kube_test.go b/cmd/k8s-proxy/kube_test.go new file mode 100644 index 000000000..c7e0f33d0 --- /dev/null +++ b/cmd/k8s-proxy/kube_test.go @@ -0,0 +1,141 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +package main + +import ( + "context" + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "tailscale.com/health" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" + "tailscale.com/tailcfg" +) + +func TestResetState(t *testing.T) { + tests := []struct { + name string + existingData map[string][]byte + podUID string + configAuthKey string + wantPatched map[string][]byte + }{ + { + name: "sets_capver_and_pod_uid", + existingData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("device-123"), + kubetypes.KeyDeviceFQDN: []byte("node.tailnet"), + kubetypes.KeyDeviceIPs: []byte(`["100.64.0.1"]`), + }, + podUID: "pod-123", + configAuthKey: "new-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + }, + }, + { + name: "clears_reissue_marker_when_actioned", + existingData: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-key"), + }, + podUID: "pod-123", + configAuthKey: "new-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + kubetypes.KeyReissueAuthkey: nil, + }, + }, + { + name: "keeps_reissue_marker_when_not_actioned", + existingData: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-key"), + }, + podUID: "pod-123", + configAuthKey: "old-key", + wantPatched: map[string][]byte{ + kubetypes.KeyPodUID: []byte("pod-123"), + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.wantPatched[kubetypes.KeyCapVer] = fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion) + + var patched map[string][]byte + kc := &kubeclient.FakeClient{ + GetSecretImpl: func(ctx context.Context, name string) (*kubeapi.Secret, error) { + return &kubeapi.Secret{Data: tt.existingData}, nil + }, + StrategicMergePatchSecretImpl: func(ctx context.Context, name string, s *kubeapi.Secret, fm string) error { + patched = s.Data + return nil + }, + } + + 
err := resetState(context.Background(), kc, "test-secret", tt.podUID, tt.configAuthKey) + if err != nil { + t.Fatalf("resetState() error = %v", err) + } + + if diff := cmp.Diff(tt.wantPatched, patched); diff != "" { + t.Errorf("resetState() mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestNeedsAuthKeyReissue(t *testing.T) { + loginWarnableCode := string(health.LoginStateWarnable.Code) + + tests := []struct { + name string + backendState string + health []string + want bool + }{ + { + name: "running_healthy", + backendState: "Running", + want: false, + }, + { + name: "needs_login", + backendState: "NeedsLogin", + want: true, + }, + { + name: "running_with_login_warning", + backendState: "Running", + health: []string{"warning: " + loginWarnableCode + ": you are logged out"}, + want: true, + }, + { + name: "running_with_unrelated_warning", + backendState: "Running", + health: []string{"dns-not-working"}, + want: false, + }, + { + name: "running_no_warnings", + backendState: "Running", + health: nil, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := needsAuthKeyReissue(tt.backendState, tt.health) + if got != tt.want { + t.Errorf("needsAuthKeyReissue() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/kube/authkey/authkey.go b/kube/authkey/authkey.go new file mode 100644 index 000000000..f544a0c81 --- /dev/null +++ b/kube/authkey/authkey.go @@ -0,0 +1,122 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +// Package authkey provides shared logic for handling auth key reissue +// requests between tailnet clients (containerboot, k8s-proxy) and the +// operator. +// +// When a client fails to authenticate (expired key, single-use key already +// used), it signals the operator by setting a marker in its state Secret. +// The operator responds by deleting the old device and issuing a new auth +// key. The client watches for the new key and restarts to apply it. +package authkey + +import ( + "context" + "fmt" + "log" + "time" + + "tailscale.com/ipn" + "tailscale.com/ipn/conffile" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" +) + +const ( + TailscaleContainerFieldManager = "tailscale-container" +) + +// SetReissueAuthKey sets the reissue_authkey marker in the state Secret to +// signal to the operator that a new auth key is needed. The marker value is +// the auth key that failed to authenticate. +func SetReissueAuthKey(ctx context.Context, kc kubeclient.Client, stateSecretName string, authKey string, fieldManager string) error { + s := &kubeapi.Secret{ + Data: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte(authKey), + }, + } + + log.Printf("Requesting a new auth key from operator") + return kc.StrategicMergePatchSecret(ctx, stateSecretName, s, fieldManager) +} + +// ClearReissueAuthKey removes the reissue_authkey marker from the state Secret +// to signal to the operator that we've successfully received the new key. 
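+//
+// It also clears the stored device identity keys (device_id, device_fqdn,
+// device_ips) and the tailscaled machine key and profile state, so the node
+// logs in fresh with the reissued key rather than reusing state from the
+// failed registration.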
+func ClearReissueAuthKey(ctx context.Context, kc kubeclient.Client, stateSecretName string, fieldManager string) error { + existing, err := kc.GetSecret(ctx, stateSecretName) + if err != nil { + return fmt.Errorf("error getting state secret: %w", err) + } + + s := &kubeapi.Secret{ + Data: map[string][]byte{ + kubetypes.KeyReissueAuthkey: nil, + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + string(ipn.MachineKeyStateKey): nil, + string(ipn.CurrentProfileStateKey): nil, + string(ipn.KnownProfilesStateKey): nil, + }, + } + + if profileKey := string(existing.Data[string(ipn.CurrentProfileStateKey)]); profileKey != "" { + s.Data[profileKey] = nil + } + + return kc.StrategicMergePatchSecret(ctx, stateSecretName, s, fieldManager) +} + +// WaitForAuthKeyReissue polls getAuthKey for a new auth key different from +// oldAuthKey, returning when one is found or maxWait expires. If notify is +// non-nil, it is used to wake the loop on config changes; otherwise it falls +// back to periodic polling. The clearFn callback is called when a new key is +// detected, to clear the reissue marker from the state Secret. +func WaitForAuthKeyReissue(ctx context.Context, oldAuthKey string, maxWait time.Duration, getAuthKey func() string, clearFn func(context.Context) error, notify <-chan struct{}) error { + log.Printf("Waiting for operator to provide new auth key (max wait: %v)", maxWait) + + ctx, cancel := context.WithTimeout(ctx, maxWait) + defer cancel() + + pollInterval := 5 * time.Second + pt := time.NewTicker(pollInterval) + defer pt.Stop() + + start := time.Now() + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for auth key reissue after %v", maxWait) + case <-pt.C: + case <-notify: + } + + newAuthKey := getAuthKey() + if newAuthKey != "" && newAuthKey != oldAuthKey { + log.Printf("New auth key received from operator after %v", time.Since(start).Round(time.Second)) + if err := clearFn(ctx); err != nil { + log.Printf("Warning: failed to clear reissue request: %v", err) + } + return nil + } + + if notify == nil { + log.Printf("Waiting for new auth key from operator (%v elapsed)", time.Since(start).Round(time.Second)) + } + } +} + +// AuthKeyFromConfig extracts the auth key from a tailscaled config file. +// Returns empty string if the file cannot be read or contains no auth key. 
+func AuthKeyFromConfig(path string) string { + if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil { + return *cfg.Parsed.AuthKey + } + + return "" +} diff --git a/kube/authkey/authkey_test.go b/kube/authkey/authkey_test.go new file mode 100644 index 000000000..268bc46d6 --- /dev/null +++ b/kube/authkey/authkey_test.go @@ -0,0 +1,124 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +package authkey + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" + "tailscale.com/ipn" + "tailscale.com/kube/kubeapi" + "tailscale.com/kube/kubeclient" + "tailscale.com/kube/kubetypes" +) + +func TestSetReissueAuthKey(t *testing.T) { + var patched map[string][]byte + kc := &kubeclient.FakeClient{ + StrategicMergePatchSecretImpl: func(ctx context.Context, name string, secret *kubeapi.Secret, _ string) error { + patched = secret.Data + return nil + }, + } + + err := SetReissueAuthKey(context.Background(), kc, "test-secret", "old-auth-key", TailscaleContainerFieldManager) + if err != nil { + t.Fatalf("SetReissueAuthKey() error = %v", err) + } + + want := map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-auth-key"), + } + if diff := cmp.Diff(want, patched); diff != "" { + t.Errorf("SetReissueAuthKey() mismatch (-want +got):\n%s", diff) + } +} + +func TestClearReissueAuthKey(t *testing.T) { + var patched map[string][]byte + kc := &kubeclient.FakeClient{ + GetSecretImpl: func(ctx context.Context, name string) (*kubeapi.Secret, error) { + return &kubeapi.Secret{ + Data: map[string][]byte{ + "_current-profile": []byte("profile-abc1"), + "profile-abc1": []byte("some-profile-data"), + "_machinekey": []byte("machine-key-data"), + }, + }, nil + }, + StrategicMergePatchSecretImpl: func(ctx context.Context, name string, secret *kubeapi.Secret, _ string) error { + patched = secret.Data + return nil + }, + } + + err := ClearReissueAuthKey(context.Background(), kc, "test-secret", TailscaleContainerFieldManager) + if err != nil { + t.Fatalf("ClearReissueAuthKey() error = %v", err) + } + + want := map[string][]byte{ + kubetypes.KeyReissueAuthkey: nil, + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + string(ipn.MachineKeyStateKey): nil, + string(ipn.CurrentProfileStateKey): nil, + string(ipn.KnownProfilesStateKey): nil, + "profile-abc1": nil, + } + if diff := cmp.Diff(want, patched); diff != "" { + t.Errorf("ClearReissueAuthKey() mismatch (-want +got):\n%s", diff) + } +} + +func TestAuthKeyFromConfig(t *testing.T) { + for name, tc := range map[string]struct { + configContent string + want string + }{ + "valid_config_with_authkey": { + configContent: `{"Version":"alpha0","AuthKey":"test-auth-key"}`, + want: "test-auth-key", + }, + "valid_config_without_authkey": { + configContent: `{"Version":"alpha0"}`, + want: "", + }, + "invalid_config": { + configContent: `not valid json`, + want: "", + }, + "empty_config": { + configContent: ``, + want: "", + }, + } { + t.Run(name, func(t *testing.T) { + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.json") + + if err := os.WriteFile(configPath, []byte(tc.configContent), 0600); err != nil { + t.Fatalf("failed to write config file: %v", err) + } + + got := AuthKeyFromConfig(configPath) + if got != tc.want { + t.Errorf("AuthKeyFromConfig() = %q, want %q", got, tc.want) + } + }) + } + + t.Run("nonexistent_file", func(t *testing.T) { + got := 
AuthKeyFromConfig("/nonexistent/path/config.json") + if got != "" { + t.Errorf("AuthKeyFromConfig() = %q, want empty string for nonexistent file", got) + } + }) +} diff --git a/kube/state/state.go b/kube/state/state.go index ebedb2f72..a7f00b7f2 100644 --- a/kube/state/state.go +++ b/kube/state/state.go @@ -30,19 +30,8 @@ const ( keyDeviceFQDN = ipn.StateKey(kubetypes.KeyDeviceFQDN) ) -// SetInitialKeys sets Pod UID and cap ver and clears tailnet device state -// keys to help stop the operator using stale tailnet device state. +// SetInitialKeys sets Pod UID and cap ver. func SetInitialKeys(store ipn.StateStore, podUID string) error { - // Clear device state keys first so the operator knows if the pod UID - // matches, the other values are definitely not stale. - for _, key := range []ipn.StateKey{keyDeviceID, keyDeviceFQDN, keyDeviceIPs} { - if _, err := store.ReadState(key); err == nil { - if err := store.WriteState(key, nil); err != nil { - return fmt.Errorf("error writing %q to state store: %w", key, err) - } - } - } - if err := store.WriteState(keyPodUID, []byte(podUID)); err != nil { return fmt.Errorf("error writing pod UID to state store: %w", err) } diff --git a/kube/state/state_test.go b/kube/state/state_test.go index 9b2ce69be..b5603acb5 100644 --- a/kube/state/state_test.go +++ b/kube/state/state_test.go @@ -58,9 +58,9 @@ func TestSetInitialStateKeys(t *testing.T) { expected: map[ipn.StateKey][]byte{ keyPodUID: podUID, keyCapVer: expectedCapVer, - keyDeviceID: nil, - keyDeviceFQDN: nil, - keyDeviceIPs: nil, + keyDeviceID: []byte("existing-device-id"), + keyDeviceFQDN: []byte("existing-device-fqdn"), + keyDeviceIPs: []byte(`["1.2.3.4"]`), }, }, } {