David Bond 85d6ba9473
cmd/k8s-operator: migrate to tailscale-client-go-v2 (#19010)
This commit modifies the kubernetes operator to use the `tailscale-client-go-v2`
package instead of the internal tailscale client it was previously using. This
now gives us the ability to expand out custom resources and features as they
become available via the API module.

The tailnet reconciler has also been modified to manage clients as tailnets
are created and removed, providing each subsequent reconciler with a single
`ClientProvider` that obtains a tailscale client for the respective tailnet
by name, or the operator's default when presented with a blank string.

Fixes: https://github.com/tailscale/corp/issues/38418

Signed-off-by: David Bond <davidsbond93@gmail.com>
2026-04-09 14:39:46 +01:00

340 lines
11 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
// Package tailnet provides reconciliation logic for the Tailnet custom resource definition. It is responsible for
// ensuring the referenced OAuth credentials are valid and have the required scopes to be able to generate authentication
// keys, manage devices & manage VIP services.
package tailnet
import (
"context"
"errors"
"fmt"
"net/url"
"sync"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"tailscale.com/client/tailscale/v2"
"tailscale.com/ipn"
operatorutils "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/k8s-operator/reconciler"
"tailscale.com/k8s-operator/tsclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/tstime"
"tailscale.com/util/clientmetric"
"tailscale.com/util/set"
)
// Reconciler is a reconcile.TypedReconciler implementation that drives the reconciliation of Tailnet custom
// resources.
type Reconciler struct {
	client.Client

	tailscaleNamespace string
	clock              tstime.Clock
	logger             *zap.SugaredLogger
	clientFunc         func(*tsapi.Tailnet, *corev1.Secret) tsclient.Client
	registry           ClientRegistry

	// Metrics bookkeeping: mu guards tailnets, the set of Tailnet UIDs this reconciler has seen and not yet
	// deleted. Its size is what gets published on the Tailnet count gauge.
	mu       sync.Mutex
	tailnets set.Slice[types.UID]
}

// ReconcilerOptions holds the configuration values used to construct a Reconciler.
type ReconcilerOptions struct {
	// Client is used to interact with the Kubernetes API.
	Client client.Client
	// TailscaleNamespace is the namespace the operator is installed in. Secrets holding Tailnet OAuth
	// credentials are looked up within this namespace.
	TailscaleNamespace string
	// Clock selects the clock used for time-based functions. It is typically only replaced in tests.
	Clock tstime.Clock
	// Logger is the logger the Reconciler derives its own named logger from.
	Logger *zap.SugaredLogger
	// ClientFunc, when non-nil, is called with the Tailnet and its credentials Secret to obtain the Tailscale
	// HTTP API implementation. It should generally be left nil unless needed for testing.
	ClientFunc func(*tsapi.Tailnet, *corev1.Secret) tsclient.Client
	// Registry stores initialized tailscale clients so that other reconcilers can share them.
	Registry ClientRegistry
}

// ClientRegistry describes types that can store initialized tailscale clients for use by other reconcilers.
type ClientRegistry interface {
	// Add should store the given tsclient.Client implementation for the named tailnet. The Reconciler passes
	// ready=true once the Tailnet has been fully validated, and ready=false when the permission checks failed.
	Add(tailnet string, client tsclient.Client, ready bool)
	// Remove should drop any tsclient.Client implementation stored for the named tailnet.
	Remove(tailnet string)
}
// reconcilerName is the name given to both the controller registered with the manager and the Reconciler's
// named logger.
const reconcilerName = "tailnet-reconciler"
// NewReconciler builds a Reconciler for Tailnet custom resources from the supplied ReconcilerOptions. The
// logger in the options is scoped with the reconciler's name; every other option is copied across as-is.
func NewReconciler(options ReconcilerOptions) *Reconciler {
	r := new(Reconciler)

	r.Client = options.Client
	r.tailscaleNamespace = options.TailscaleNamespace
	r.clock = options.Clock
	r.logger = options.Logger.Named(reconcilerName)
	r.clientFunc = options.ClientFunc
	r.registry = options.Registry

	return r
}
// Register attaches the Reconciler to the given manager.Manager as a named controller for Tailnet resources.
// Any error from building the controller is returned to the caller.
func (r *Reconciler) Register(mgr manager.Manager) error {
	controller := builder.ControllerManagedBy(mgr)
	controller = controller.For(&tsapi.Tailnet{})
	controller = controller.Named(reconcilerName)

	return controller.Complete(r)
}
// gaugeTailnetResources reports the overall number of Tailnet resources currently managed by this operator
// instance.
var gaugeTailnetResources = clientmetric.NewGauge(kubetypes.MetricTailnetCount)
// Reconcile handles a change to a Tailnet resource within the cluster. A Tailnet that no longer exists is
// ignored. A Tailnet that is being deleted is handed to delete, and anything else goes through createOrUpdate,
// which validates the referenced Secret and the OAuth credentials it holds.
func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	tailnet := new(tsapi.Tailnet)

	if err := r.Get(ctx, req.NamespacedName, tailnet); err != nil {
		if apierrors.IsNotFound(err) {
			// Nothing to do, the resource is already gone.
			return reconcile.Result{}, nil
		}

		return reconcile.Result{}, fmt.Errorf("failed to get Tailnet %q: %w", req.NamespacedName, err)
	}

	if tailnet.DeletionTimestamp.IsZero() {
		return r.createOrUpdate(ctx, tailnet)
	}

	return r.delete(ctx, tailnet)
}
// delete handles a Tailnet that has a deletion timestamp. It removes the finalizer from the resource and, once
// that update has succeeded, stops counting the Tailnet in the gauge and removes its client from the registry.
// An error is returned (and no local state is touched) if the finalizer could not be removed.
func (r *Reconciler) delete(ctx context.Context, tailnet *tsapi.Tailnet) (reconcile.Result, error) {
	reconciler.RemoveFinalizer(tailnet)
	if err := r.Update(ctx, tailnet); err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to remove finalizer from Tailnet %q: %w", tailnet.Name, err)
	}

	// The gauge is published while the lock is still held: reading the set's length after unlocking would race
	// with concurrent mutations, and could let an older count overwrite a newer one.
	r.mu.Lock()
	r.tailnets.Remove(tailnet.UID)
	gaugeTailnetResources.Set(int64(r.tailnets.Len()))
	r.mu.Unlock()

	r.registry.Remove(tailnet.Name)

	return reconcile.Result{}, nil
}
// Constants for condition reasons.
const (
	// ReasonInvalidOAuth is the TailnetReady reason used when the API calls made with the Tailnet's OAuth
	// credentials fail.
	ReasonInvalidOAuth = "InvalidOAuth"
	// ReasonInvalidSecret is the TailnetReady reason used when the referenced Secret does not exist or is
	// missing required fields.
	ReasonInvalidSecret = "InvalidSecret"
	// ReasonValid is the TailnetReady reason (and message) used once the Tailnet has passed validation.
	ReasonValid = "TailnetValid"
)
// createOrUpdate validates a Tailnet that is not being deleted and records the outcome on its TailnetReady
// condition. The steps are, in order:
//
//  1. Count the Tailnet in the managed-resources gauge.
//  2. Fetch the credentials Secret from the operator namespace and check it has the required fields.
//  3. Build a tailscale client and check the credentials can reach the API endpoints the operator uses.
//  4. Mark the Tailnet ready, add the finalizer and register the client as ready.
//
// When the Tailnet is not usable because of something a user can fix (missing Secret, incomplete Secret,
// insufficient OAuth scopes), the status is updated and the request is requeued after a short delay rather than
// returning an error. Errors are returned only for failed Kubernetes API calls or client construction.
func (r *Reconciler) createOrUpdate(ctx context.Context, tailnet *tsapi.Tailnet) (reconcile.Result, error) {
	// Delay before re-validating a Tailnet that is currently unusable, giving the user time to fix it without
	// having to delete and recreate the resource.
	const retryAfter = time.Minute / 2

	// The gauge is published while the lock is still held: reading the set's length after unlocking would race
	// with concurrent mutations, and could let an older count overwrite a newer one.
	r.mu.Lock()
	r.tailnets.Add(tailnet.UID)
	gaugeTailnetResources.Set(int64(r.tailnets.Len()))
	r.mu.Unlock()

	name := types.NamespacedName{Name: tailnet.Spec.Credentials.SecretName, Namespace: r.tailscaleNamespace}

	var secret corev1.Secret
	err := r.Get(ctx, name, &secret)

	// The referenced Secret does not exist within the tailscale namespace, so we'll mark the Tailnet as not ready
	// for use.
	if apierrors.IsNotFound(err) {
		operatorutils.SetTailnetCondition(
			tailnet,
			tsapi.TailnetReady,
			metav1.ConditionFalse,
			ReasonInvalidSecret,
			fmt.Sprintf("referenced secret %q does not exist in namespace %q", name.Name, r.tailscaleNamespace),
			r.clock,
			r.logger,
		)

		if err = r.Status().Update(ctx, tailnet); err != nil {
			return reconcile.Result{}, fmt.Errorf("failed to update Tailnet status for %q: %w", tailnet.Name, err)
		}

		// The controller is only registered for Tailnet resources, so creating the Secret later does not trigger
		// a reconcile on its own. Requeue, as the other recoverable failures below do, so the Tailnet becomes
		// ready once the Secret appears.
		// NOTE(review): a client registered by an earlier successful reconcile is left in the registry here and
		// in the invalid-Secret branch below; confirm whether it should be removed or marked not ready.
		return reconcile.Result{RequeueAfter: retryAfter}, nil
	}

	if err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to get secret %q: %w", name, err)
	}

	// We first ensure that the referenced secret contains the required fields. Otherwise, we set the Tailnet as
	// invalid. The operator will not allow the use of this Tailnet while it is in an invalid state.
	if ok := r.ensureSecret(tailnet, &secret); !ok {
		if err = r.Status().Update(ctx, tailnet); err != nil {
			return reconcile.Result{}, fmt.Errorf("failed to update Tailnet status for %q: %w", tailnet.Name, err)
		}

		return reconcile.Result{RequeueAfter: retryAfter}, nil
	}

	tsClient, err := r.createClient(tailnet, &secret)
	if err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to create tailnet client: %w", err)
	}

	// Second, we ensure the OAuth credentials supplied in the secret are valid and have the required scopes to access
	// the various API endpoints required by the operator.
	if ok := r.ensurePermissions(ctx, tsClient, tailnet); !ok {
		// The client is still registered, but flagged as not ready.
		r.registry.Add(tailnet.Name, tsClient, false)

		if err = r.Status().Update(ctx, tailnet); err != nil {
			return reconcile.Result{}, fmt.Errorf("failed to update Tailnet status for %q: %w", tailnet.Name, err)
		}

		// We provide a requeue duration here as a user will likely want to go and modify their scopes and come back.
		// This should save them having to delete and recreate the resource.
		return reconcile.Result{RequeueAfter: retryAfter}, nil
	}

	operatorutils.SetTailnetCondition(
		tailnet,
		tsapi.TailnetReady,
		metav1.ConditionTrue,
		ReasonValid,
		ReasonValid,
		r.clock,
		r.logger,
	)

	if err = r.Status().Update(ctx, tailnet); err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to update Tailnet status for %q: %w", tailnet.Name, err)
	}

	reconciler.SetFinalizer(tailnet)
	if err = r.Update(ctx, tailnet); err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to add finalizer to Tailnet %q: %w", tailnet.Name, err)
	}

	// Only publish the client as ready once both the status and the finalizer have been persisted.
	r.registry.Add(tailnet.Name, tsClient, true)

	return reconcile.Result{}, nil
}
// Constants for OAuth credential fields within the Secret referenced by the Tailnet.
const (
	// clientIDKey is the Secret data key holding the OAuth client ID.
	clientIDKey = "client_id"
	// clientSecretKey is the Secret data key holding the OAuth client secret.
	clientSecretKey = "client_secret"
)
// createClient returns the tsclient.Client to use for the given Tailnet. When a client function was configured
// on the Reconciler, it is used to produce the client. Otherwise an OAuth-authenticated client is built from
// the client ID and client secret stored in the Secret, targeting the Tailnet's login URL, or the default
// control URL when none is set. An error is returned if the URL cannot be parsed.
func (r *Reconciler) createClient(tailnet *tsapi.Tailnet, secret *corev1.Secret) (tsclient.Client, error) {
	if factory := r.clientFunc; factory != nil {
		return factory(tailnet, secret), nil
	}

	rawURL := tailnet.Spec.LoginURL
	if rawURL == "" {
		rawURL = ipn.DefaultControlURL
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse base URL %q: %w", rawURL, err)
	}

	credentials := &tailscale.OAuth{
		ClientID:     string(secret.Data[clientIDKey]),
		ClientSecret: string(secret.Data[clientSecretKey]),
	}

	api := &tailscale.Client{
		BaseURL:   parsed,
		UserAgent: "tailscale-k8s-operator",
		Auth:      credentials,
	}

	return tsclient.Wrap(api), nil
}
// ensurePermissions checks that the given client can list devices, auth keys and tailscale services. All three
// requests are always attempted and their failures are combined. If any of them failed, the Tailnet's
// TailnetReady condition is set to false with the combined error as its message and false is returned.
// Otherwise the Tailnet is left untouched and true is returned.
//
// Only read requests are made, as we don't want to create arbitrary keys and VIP services. The check therefore
// catches an entirely missing scope, but cannot prove that write access has been granted.
func (r *Reconciler) ensurePermissions(ctx context.Context, tsClient tsclient.Client, tailnet *tsapi.Tailnet) bool {
	probes := []struct {
		resource string
		list     func() error
	}{
		{
			resource: "devices",
			list: func() error {
				_, err := tsClient.Devices().List(ctx)
				return err
			},
		},
		{
			resource: "auth keys",
			list: func() error {
				_, err := tsClient.Keys().List(ctx, false)
				return err
			},
		},
		{
			resource: "tailscale services",
			list: func() error {
				_, err := tsClient.VIPServices().List(ctx)
				return err
			},
		},
	}

	var failures error
	for _, probe := range probes {
		if err := probe.list(); err != nil {
			failures = errors.Join(failures, fmt.Errorf("failed to list %s: %w", probe.resource, err))
		}
	}

	if failures == nil {
		return true
	}

	operatorutils.SetTailnetCondition(
		tailnet,
		tsapi.TailnetReady,
		metav1.ConditionFalse,
		ReasonInvalidOAuth,
		failures.Error(),
		r.clock,
		r.logger,
	)

	return false
}
// ensureSecret checks that the Secret carries non-empty client_id and client_secret fields. It returns true
// when both are present. Otherwise it sets the Tailnet's TailnetReady condition to false with a message naming
// the first problem found (empty Secret, then client_id, then client_secret) and returns false.
func (r *Reconciler) ensureSecret(tailnet *tsapi.Tailnet, secret *corev1.Secret) bool {
	problem := func() string {
		if len(secret.Data) == 0 {
			return fmt.Sprintf("Secret %q is empty", secret.Name)
		}

		if len(secret.Data[clientIDKey]) == 0 {
			return fmt.Sprintf("Secret %q is missing the client_id field", secret.Name)
		}

		if len(secret.Data[clientSecretKey]) == 0 {
			return fmt.Sprintf("Secret %q is missing the client_secret field", secret.Name)
		}

		return ""
	}()

	if problem != "" {
		operatorutils.SetTailnetCondition(
			tailnet,
			tsapi.TailnetReady,
			metav1.ConditionFalse,
			ReasonInvalidSecret,
			problem,
			r.clock,
			r.logger,
		)

		return false
	}

	return true
}