mirror of
				https://github.com/tailscale/tailscale.git
				synced 2025-10-24 22:02:04 +02:00 
			
		
		
		
	cmd/{containerboot,k8s-operator},kube: add preshutdown hook for egress PG proxies
This change is part of work towards minimizing downtime during update
rollouts of egress ProxyGroup replicas.
This change:
- updates the containerboot health check logic to return Pod IP in headers,
if set
- always runs the health check for egress PG proxies
- updates ClusterIP Services created for PG egress endpoints to include
the health check endpoint
- implements preshutdown endpoint in proxies. The preshutdown endpoint
logic waits till, for all currently configured egress services, the ClusterIP
Service health check endpoint is no longer returned by the shutting-down Pod
(by looking at the new Pod IP header).
- ensures that kubelet is configured to call the preshutdown endpoint
This reduces the possibility that, as replicas are terminated during an update,
a replica gets terminated to which cluster traffic is still being routed via
the ClusterIP Service because kube proxy has not yet updated routig rules.
This is not a perfect check as in practice, it only checks that the kube
proxy on the node on which the proxy runs has updated rules. However, overall
this might be good enough.
The preshutdown logic is disabled if users have configured a custom health check
port via TS_LOCAL_ADDR_PORT env var. This change throws a warnign if so and in
future setting of that env var for operator proxies might be disallowed (as users
shouldn't need to configure this for a Pod directly).
This is backwards compatible with earlier proxy versions.
Updates tailscale/tailscale#14326
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
		
	
			
		
			
				
	
	
		
			215 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			215 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright (c) Tailscale Inc & AUTHORS
 | |
| // SPDX-License-Identifier: BSD-3-Clause
 | |
| 
 | |
| //go:build !plan9
 | |
| 
 | |
| package main
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"net/netip"
 | |
| 	"reflect"
 | |
| 	"strings"
 | |
| 
 | |
| 	"go.uber.org/zap"
 | |
| 	corev1 "k8s.io/api/core/v1"
 | |
| 	discoveryv1 "k8s.io/api/discovery/v1"
 | |
| 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 | |
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 | |
| 	"sigs.k8s.io/controller-runtime/pkg/client"
 | |
| 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 | |
| 	"tailscale.com/kube/egressservices"
 | |
| 	"tailscale.com/types/ptr"
 | |
| )
 | |
| 
 | |
| // egressEpsReconciler reconciles EndpointSlices for tailnet services exposed to cluster via egress ProxyGroup proxies.
 | |
| type egressEpsReconciler struct {
 | |
| 	client.Client
 | |
| 	logger      *zap.SugaredLogger
 | |
| 	tsNamespace string
 | |
| }
 | |
| 
 | |
| // Reconcile reconciles an EndpointSlice for a tailnet service. It updates the EndpointSlice with the endpoints of
 | |
| // those ProxyGroup Pods that are ready to route traffic to the tailnet service.
 | |
| // It compares tailnet service state stored in egress proxy state Secrets by containerboot with the desired
 | |
| // configuration stored in proxy-cfg ConfigMap to determine if the endpoint is ready.
 | |
| func (er *egressEpsReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
 | |
| 	l := er.logger.With("Service", req.NamespacedName)
 | |
| 	l.Debugf("starting reconcile")
 | |
| 	defer l.Debugf("reconcile finished")
 | |
| 
 | |
| 	eps := new(discoveryv1.EndpointSlice)
 | |
| 	err = er.Get(ctx, req.NamespacedName, eps)
 | |
| 	if apierrors.IsNotFound(err) {
 | |
| 		l.Debugf("EndpointSlice not found")
 | |
| 		return reconcile.Result{}, nil
 | |
| 	}
 | |
| 	if err != nil {
 | |
| 		return reconcile.Result{}, fmt.Errorf("failed to get EndpointSlice: %w", err)
 | |
| 	}
 | |
| 	if !eps.DeletionTimestamp.IsZero() {
 | |
| 		l.Debugf("EnpointSlice is being deleted")
 | |
| 		return res, nil
 | |
| 	}
 | |
| 
 | |
| 	// Get the user-created ExternalName Service and use its status conditions to determine whether cluster
 | |
| 	// resources are set up for this tailnet service.
 | |
| 	svc := &corev1.Service{
 | |
| 		ObjectMeta: metav1.ObjectMeta{
 | |
| 			Name:      eps.Labels[LabelParentName],
 | |
| 			Namespace: eps.Labels[LabelParentNamespace],
 | |
| 		},
 | |
| 	}
 | |
| 	err = er.Get(ctx, client.ObjectKeyFromObject(svc), svc)
 | |
| 	if apierrors.IsNotFound(err) {
 | |
| 		l.Infof("ExternalName Service %s/%s not found, perhaps it was deleted", svc.Namespace, svc.Name)
 | |
| 		return res, nil
 | |
| 	}
 | |
| 	if err != nil {
 | |
| 		return res, fmt.Errorf("error retrieving ExternalName Service: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	// TODO(irbekrm): currently this reconcile loop runs all the checks every time it's triggered, which is
 | |
| 	// wasteful. Once we have a Ready condition for ExternalName Services for ProxyGroup, use the condition to
 | |
| 	// determine if a reconcile is needed.
 | |
| 
 | |
| 	oldEps := eps.DeepCopy()
 | |
| 	tailnetSvc := tailnetSvcName(svc)
 | |
| 	l = l.With("tailnet-service-name", tailnetSvc)
 | |
| 
 | |
| 	// Retrieve the desired tailnet service configuration from the ConfigMap.
 | |
| 	proxyGroupName := eps.Labels[labelProxyGroup]
 | |
| 	_, cfgs, err := egressSvcsConfigs(ctx, er.Client, proxyGroupName, er.tsNamespace)
 | |
| 	if err != nil {
 | |
| 		return res, fmt.Errorf("error retrieving tailnet services configuration: %w", err)
 | |
| 	}
 | |
| 	if cfgs == nil {
 | |
| 		// TODO(irbekrm): this path would be hit if egress service was once exposed on a ProxyGroup that later
 | |
| 		// got deleted. Probably the EndpointSlices then need to be deleted too- need to rethink this flow.
 | |
| 		l.Debugf("No egress config found, likely because ProxyGroup has not been created")
 | |
| 		return res, nil
 | |
| 	}
 | |
| 	cfg, ok := (*cfgs)[tailnetSvc]
 | |
| 	if !ok {
 | |
| 		l.Infof("[unexpected] configuration for tailnet service %s not found", tailnetSvc)
 | |
| 		return res, nil
 | |
| 	}
 | |
| 
 | |
| 	// Check which Pods in ProxyGroup are ready to route traffic to this
 | |
| 	// egress service.
 | |
| 	podList := &corev1.PodList{}
 | |
| 	if err := er.List(ctx, podList, client.MatchingLabels(pgLabels(proxyGroupName, nil))); err != nil {
 | |
| 		return res, fmt.Errorf("error listing Pods for ProxyGroup %s: %w", proxyGroupName, err)
 | |
| 	}
 | |
| 	newEndpoints := make([]discoveryv1.Endpoint, 0)
 | |
| 	for _, pod := range podList.Items {
 | |
| 		ready, err := er.podIsReadyToRouteTraffic(ctx, pod, &cfg, tailnetSvc, l)
 | |
| 		if err != nil {
 | |
| 			return res, fmt.Errorf("error verifying if Pod is ready to route traffic: %w", err)
 | |
| 		}
 | |
| 		if !ready {
 | |
| 			continue // maybe next time
 | |
| 		}
 | |
| 		podIP, err := podIPv4(&pod) // we currently only support IPv4
 | |
| 		if err != nil {
 | |
| 			return res, fmt.Errorf("error determining IPv4 address for Pod: %w", err)
 | |
| 		}
 | |
| 		newEndpoints = append(newEndpoints, discoveryv1.Endpoint{
 | |
| 			Hostname:  (*string)(&pod.UID),
 | |
| 			Addresses: []string{podIP},
 | |
| 			Conditions: discoveryv1.EndpointConditions{
 | |
| 				Ready:       ptr.To(true),
 | |
| 				Serving:     ptr.To(true),
 | |
| 				Terminating: ptr.To(false),
 | |
| 			},
 | |
| 		})
 | |
| 	}
 | |
| 	// Note that Endpoints are being overwritten with the currently valid endpoints so we don't need to explicitly
 | |
| 	// run a cleanup for deleted Pods etc.
 | |
| 	eps.Endpoints = newEndpoints
 | |
| 	if !reflect.DeepEqual(eps, oldEps) {
 | |
| 		l.Infof("Updating EndpointSlice to ensure traffic is routed to ready proxy Pods")
 | |
| 		if err := er.Update(ctx, eps); err != nil {
 | |
| 			return res, fmt.Errorf("error updating EndpointSlice: %w", err)
 | |
| 		}
 | |
| 	}
 | |
| 	return res, nil
 | |
| }
 | |
| 
 | |
| func podIPv4(pod *corev1.Pod) (string, error) {
 | |
| 	for _, ip := range pod.Status.PodIPs {
 | |
| 		parsed, err := netip.ParseAddr(ip.IP)
 | |
| 		if err != nil {
 | |
| 			return "", fmt.Errorf("error parsing IP address %s: %w", ip, err)
 | |
| 		}
 | |
| 		if parsed.Is4() {
 | |
| 			return parsed.String(), nil
 | |
| 		}
 | |
| 	}
 | |
| 	return "", nil
 | |
| }
 | |
| 
 | |
| // podIsReadyToRouteTraffic returns true if it appears that the proxy Pod has configured firewall rules to be able to
 | |
| // route traffic to the given tailnet service. It retrieves the proxy's state Secret and compares the tailnet service
 | |
| // status written there to the desired service configuration.
 | |
| func (er *egressEpsReconciler) podIsReadyToRouteTraffic(ctx context.Context, pod corev1.Pod, cfg *egressservices.Config, tailnetSvcName string, l *zap.SugaredLogger) (bool, error) {
 | |
| 	l = l.With("proxy_pod", pod.Name)
 | |
| 	l.Debugf("checking whether proxy is ready to route to egress service")
 | |
| 	if !pod.DeletionTimestamp.IsZero() {
 | |
| 		l.Debugf("proxy Pod is being deleted, ignore")
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	podIP, err := podIPv4(&pod)
 | |
| 	if err != nil {
 | |
| 		return false, fmt.Errorf("error determining Pod IP address: %v", err)
 | |
| 	}
 | |
| 	if podIP == "" {
 | |
| 		l.Infof("[unexpected] Pod does not have an IPv4 address, and IPv6 is not currently supported")
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	stateS := &corev1.Secret{
 | |
| 		ObjectMeta: metav1.ObjectMeta{
 | |
| 			Name:      pod.Name,
 | |
| 			Namespace: pod.Namespace,
 | |
| 		},
 | |
| 	}
 | |
| 	err = er.Get(ctx, client.ObjectKeyFromObject(stateS), stateS)
 | |
| 	if apierrors.IsNotFound(err) {
 | |
| 		l.Debugf("proxy does not have a state Secret, waiting...")
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	if err != nil {
 | |
| 		return false, fmt.Errorf("error getting state Secret: %w", err)
 | |
| 	}
 | |
| 	svcStatusBS := stateS.Data[egressservices.KeyEgressServices]
 | |
| 	if len(svcStatusBS) == 0 {
 | |
| 		l.Debugf("proxy's state Secret does not contain egress services status, waiting...")
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	svcStatus := &egressservices.Status{}
 | |
| 	if err := json.Unmarshal(svcStatusBS, svcStatus); err != nil {
 | |
| 		return false, fmt.Errorf("error unmarshalling egress service status: %w", err)
 | |
| 	}
 | |
| 	if !strings.EqualFold(podIP, svcStatus.PodIPv4) {
 | |
| 		l.Infof("proxy's egress service status is for Pod IP %s, current proxy's Pod IP %s, waiting for the proxy to reconfigure...", svcStatus.PodIPv4, podIP)
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	st, ok := (*svcStatus).Services[tailnetSvcName]
 | |
| 	if !ok {
 | |
| 		l.Infof("proxy's state Secret does not have egress service status, waiting...")
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	if !reflect.DeepEqual(cfg.TailnetTarget, st.TailnetTarget) {
 | |
| 		l.Infof("proxy has configured egress service for tailnet target %v, current target is %v, waiting for proxy to reconfigure...", st.TailnetTarget, cfg.TailnetTarget)
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	if !reflect.DeepEqual(cfg.Ports, st.Ports) {
 | |
| 		l.Debugf("proxy has configured egress service for ports %#+v, wants ports %#+v, waiting for proxy to reconfigure", st.Ports, cfg.Ports)
 | |
| 		return false, nil
 | |
| 	}
 | |
| 	l.Debugf("proxy is ready to route traffic to egress service")
 | |
| 	return true, nil
 | |
| }
 |