Aaron U'Ren a0fe844a93 feat(NSC): honor service-proxy-name label
Abide the service.kubernetes.io/service-proxy-name label as defined by
the upstream standard here:
https://github.com/kubernetes-sigs/kpng/blob/master/doc/service-proxy.md#ignored-servicesendpoints

Resolves the failing e2e test:
should implement service.kubernetes.io/service-proxy-name

Fixes: #979
2024-01-05 10:27:23 -06:00

545 lines
17 KiB
Go

package proxy
import (
"fmt"
"hash/fnv"
"net"
"os/exec"
"runtime"
"strconv"
"strings"
"syscall"
"time"
"github.com/cloudnativelabs/kube-router/v2/pkg/cri"
"github.com/cloudnativelabs/kube-router/v2/pkg/utils"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
netutils "k8s.io/utils/net"
)
// Constants shared by the proxy helpers.
const (
// interfaceWaitSleepTime is a 100ms pause; presumably the delay between polls while waiting for a network
// interface to appear (usage is outside this view - TODO confirm).
interfaceWaitSleepTime = 100 * time.Millisecond
// sysFSVirtualNetPath is the sysfs directory for virtual network devices.
sysFSVirtualNetPath = "/sys/devices/virtual/net"
// sysFSHairpinRelPath is a relative sysfs path to a bridge port's hairpin_mode file; presumably joined under a
// device directory below sysFSVirtualNetPath (verify against callers).
sysFSHairpinRelPath = "brport/hairpin_mode"
// hairpinEnable is presumably the value written to the hairpin_mode file to turn hairpin mode on (TODO confirm).
hairpinEnable = "1"
)
// attemptNamespaceResetAfterError is a best-effort recovery helper: it tries to switch back to the network namespace
// identified by hostNSHandle and then logs which namespace is active afterwards. Failures are only logged; nothing
// is returned to the caller.
func attemptNamespaceResetAfterError(hostNSHandle netns.NsHandle) {
	// A failed switch is logged but does not stop us from reporting the namespace we actually ended up in.
	if setErr := netns.Set(hostNSHandle); setErr != nil {
		klog.Errorf("failed to set hostNetworkNamespace while resetting namespace after a previous error due to %v",
			setErr)
	}

	currentNS, getErr := netns.Get()
	if getErr != nil {
		klog.Errorf("failed to confirm activeNetworkNamespace while resetting namespace after a previous error due to %v",
			getErr)
		return
	}
	// The handle is only needed for the log line below; the close error is deliberately disregarded.
	defer func() { _ = currentNS.Close() }()

	klog.V(2).Infof("Current network namespace after revert namespace to host network namespace: %s",
		currentNS.String())
}
// generateUniqueFWMark generates a unique uint32 hash value using the IP address, port, and protocol. This can then
// be used in IPVS, ip rules, and iptables to mark and later identify packets. The chosen FW mark is recorded in
// nsc.fwMarkMap (mark -> "ip-protocol-port") so that it can be used later for lookup and as a general translation
// layer. Calling this again for a service that already owns a mark returns that same mark.
//
// When a candidate mark is already owned by a different service, the hash is re-salted with an increasing counter.
// At most maxUniqueFWMarkInc candidates are tried; if none of them is free an error is returned. The limit is
// enforced on every iteration, so the search always terminates even when the mark space is exhausted.
func (nsc *NetworkServicesController) generateUniqueFWMark(ip, protocol, port string) (uint32, error) {
	const (
		// maxFwMarkBitSize masks the 32-bit hash down to its lowest 14 bits.
		maxFwMarkBitSize = 0x3FFF
		// maxUniqueFWMarkInc is the maximum number of candidate marks tried before giving up.
		maxUniqueFWMarkInc = 16380
	)

	serviceKey := ip + "-" + protocol + "-" + port

	// hashFWMark produces the candidate mark for a given attempt. It is kept as an anonymous function since using
	// its result without the uniqueness check below is unsafe. Attempt 0 hashes the plain service key; later
	// attempts append "-<attempt>" so that a different candidate is produced.
	hashFWMark := func(attempt int) (uint32, error) {
		input := serviceKey
		if attempt != 0 {
			input += "-" + strconv.Itoa(attempt)
		}
		h := fnv.New32a()
		if _, err := h.Write([]byte(input)); err != nil {
			return 0, err
		}
		return h.Sum32() & maxFwMarkBitSize, nil
	}

	for attempt := 0; attempt < maxUniqueFWMarkInc; attempt++ {
		potentialFWMark, err := hashFWMark(attempt)
		if err != nil {
			return 0, err
		}
		// A mark owned by another service is a collision: move on to the next candidate.
		if foundServiceKey, ok := nsc.fwMarkMap[potentialFWMark]; ok && foundServiceKey != serviceKey {
			continue
		}
		nsc.fwMarkMap[potentialFWMark] = serviceKey
		return potentialFWMark, nil
	}

	return 0, fmt.Errorf("could not obtain a unique FWMark for %s:%s:%s after %d tries",
		protocol, ip, port, maxUniqueFWMarkInc)
}
// lookupFWMarkByService finds the related FW mark from the internal fwMarkMap kept by the NetworkServiceController
// given the related ip, protocol, and port. The map is keyed by FW mark, so this is a linear scan over its values.
// If no entry matches, 0 is returned (no error is reported).
func (nsc *NetworkServicesController) lookupFWMarkByService(ip, protocol, port string) uint32 {
	wantedKey := fmt.Sprintf("%s-%s-%s", ip, protocol, port)

	for mark, storedKey := range nsc.fwMarkMap {
		if storedKey != wantedKey {
			continue
		}
		return mark
	}

	// Nothing matched.
	return 0
}
// lookupServiceByFWMark resolves a FW mark back to the service it was generated for (the reverse of
// lookupFWMarkByService). It returns the ip, protocol, and port parsed out of the "ip-protocol-port" key stored in
// fwMarkMap. An error is returned when the mark is unknown, when the stored key does not split into exactly three
// "-" separated parts, or when the port part is not a base-10 integer that fits in 32 bits.
func (nsc *NetworkServicesController) lookupServiceByFWMark(fwMark uint32) (string, string, int, error) {
	const expectedKeyParts = 3

	storedKey, known := nsc.fwMarkMap[fwMark]
	if !known {
		return "", "", 0, fmt.Errorf("could not find service matching the given FW mark")
	}

	parts := strings.Split(storedKey, "-")
	if len(parts) != expectedKeyParts {
		return "", "", 0, fmt.Errorf("service key for found FW mark did not have 3 parts, this shouldn't be possible")
	}
	svcIP, svcProtocol, rawPort := parts[0], parts[1], parts[2]

	parsedPort, err := strconv.ParseInt(rawPort, 10, 32)
	if err != nil {
		return "", "", 0, fmt.Errorf("port number for service key for found FW mark was not a 32-bit int: %v", err)
	}

	return svcIP, svcProtocol, int(parsedPort), nil
}
// unsortedListsEquivalent reports whether two lists of endpointSliceInfo hold the same contents regardless of order.
// Lists of different length are never equivalent. For lists of equal length, every distinct value has to appear in
// both lists; how many times a value is repeated within a list is not compared.
func unsortedListsEquivalent(a, b []endpointSliceInfo) bool {
	if len(a) != len(b) {
		return false
	}

	// unmatched is the tally of a value that was seen in only one of the two lists: values from a start at this
	// tally, and a value that occurs just once in b (and never in a) ends up on it as well.
	const unmatched = 1

	tally := make(map[interface{}]int)
	for i := range a {
		tally[a[i]] = unmatched
	}
	for i := range b {
		tally[b[i]]++
	}

	for _, count := range tally {
		if count == unmatched {
			return false
		}
	}
	return true
}
// endpointsMapsEquivalent reports whether two endpointSliceInfoMap values have the same keys and, for every key,
// lists that unsortedListsEquivalent considers the same (i.e. order within a list does not matter).
func endpointsMapsEquivalent(a, b endpointSliceInfoMap) bool {
	// With equal sizes it is enough to check that every key of a is present in b with an equivalent list.
	if len(a) != len(b) {
		return false
	}

	for key, listA := range a {
		listB, present := b[key]
		if !present || !unsortedListsEquivalent(listA, listB) {
			return false
		}
	}
	return true
}
// convertSvcProtoToSysCallProto converts a string based protocol that we receive from Kubernetes via something like
// the serviceInfo object into the uint16 syscall version of the protocol that is capable of interfacing with aspects
// of the Linux sub-system like IPVS. Anything other than tcpProtocol or udpProtocol maps to syscall.IPPROTO_NONE.
func convertSvcProtoToSysCallProto(svcProtocol string) uint16 {
	if svcProtocol == tcpProtocol {
		return syscall.IPPROTO_TCP
	}
	if svcProtocol == udpProtocol {
		return syscall.IPPROTO_UDP
	}
	return syscall.IPPROTO_NONE
}
// convertSysCallProtoToSvcProto converts a syscall based protocol version to a string representation that Kubernetes
// and other parts of kube-router understand. Anything other than syscall.IPPROTO_TCP or syscall.IPPROTO_UDP maps to
// noneProtocol.
func convertSysCallProtoToSvcProto(sysProtocol uint16) string {
	if sysProtocol == syscall.IPPROTO_TCP {
		return tcpProtocol
	}
	if sysProtocol == syscall.IPPROTO_UDP {
		return udpProtocol
	}
	return noneProtocol
}
// findContainerRuntimeReferences finds the container runtime and container ID for a given endpoint IP. It does this
// by:
//   - Resolving the endpoint IP to a pod
//   - Ensuring that the pod actually exists on the node in question
//   - Getting the container ID of the first reported container (since this function primarily allows us to enter the
//     pod's namespace, it doesn't really matter which container we choose here, if this function gets used for
//     something else in the future, this might have to be re-evaluated)
//
// It returns the runtime name and the container ID. When the pod is not running on this node, both are returned
// empty together with a nil error. An error is returned when the pod cannot be resolved, when it has not reported
// any container statuses yet, or when no container ID can be parsed out of the first status.
func (nsc *NetworkServicesController) findContainerRuntimeReferences(endpointIP string) (string, string, error) {
	podObj, err := nsc.getPodObjectForEndpoint(endpointIP)
	if err != nil {
		return "", "", fmt.Errorf("failed to find endpoint with ip: %s. so skipping preparing endpoint for DSR: %v",
			endpointIP, err)
	}

	// we are only concerned with endpoint pod running on current node
	if podObj.Status.HostIP != nsc.primaryIP.String() {
		return "", "", nil
	}

	// A pod whose containers have not been reported yet has no statuses; indexing [0] below would panic.
	if len(podObj.Status.ContainerStatuses) == 0 {
		return "", "", fmt.Errorf("pod %s for endpoint with ip: %s has no container statuses yet so skipping "+
			"preparing endpoint for DSR", podObj.Name, endpointIP)
	}
	containerStatus := podObj.Status.ContainerStatuses[0]

	// The local is named crRuntime so that it does not shadow the imported runtime package.
	crRuntime, containerID, err := cri.EndpointParser(containerStatus.ContainerID)
	if err != nil {
		return "", "", fmt.Errorf("couldn't get containerID (container=%s, pod=%s). Skipping DSR endpoint set up: %v",
			containerStatus.Name, podObj.Name, err)
	}

	if containerID == "" {
		return "", "", fmt.Errorf("failed to find container id for the endpoint with ip: %s so skipping preparing "+
			"endpoint for DSR", endpointIP)
	}

	return crRuntime, containerID, nil
}
// addDSRIPInsidePodNetNamespace takes a given external IP and endpoint IP for a DSR service and then uses the
// container runtime to add the external IP to a virtual interface inside the pod so that it can receive DSR traffic
// inside its network namespace.
//
// The steps are: resolve the runtime and container ID for the endpoint, pin the goroutine to its OS thread, capture
// a handle to the current network namespace, resolve the container's PID (through docker or through CRI) and then
// hand everything to nsc.ln.configureContainerForDSR.
//
// NOTE(review): findContainerRuntimeReferences returns empty values with a nil error for an endpoint that is not on
// this node, and this function still proceeds in that case - confirm that callers only pass local endpoints.
func (nsc *NetworkServicesController) addDSRIPInsidePodNetNamespace(externalIP, endpointIP string) error {
	crRuntime, containerID, err := nsc.findContainerRuntimeReferences(endpointIP)
	if err != nil {
		return err
	}

	// The thread stays locked until this function returns.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	hostNS, err := netns.Get()
	if err != nil {
		return fmt.Errorf("failed to get namespace due to %v", err)
	}
	defer utils.CloseCloserDisregardError(&hostNS)

	var pid int
	switch crRuntime {
	case "docker":
		// WARN: This method is deprecated and will be removed once docker-shim is removed from kubelet.
		pid, err = nsc.ln.getContainerPidWithDocker(containerID)
		if err != nil {
			return fmt.Errorf("failed to prepare endpoint %s to do direct server return due to %v",
				endpointIP, err)
		}
	default:
		// We expect CRI compliant runtimes here
		// ugly workaround, refactoring of pkg/Proxy is required
		pid, err = nsc.ln.getContainerPidWithCRI(nsc.dsr.runtimeEndpoint, containerID)
		if err != nil {
			return fmt.Errorf("failed to prepare endpoint %s to do DSR due to: %v", endpointIP, err)
		}
	}

	return nsc.ln.configureContainerForDSR(externalIP, endpointIP, containerID, pid, hostNS)
}
// getPrimaryAndCIDRsByFamily returns the best primary nodeIP and a slice of all of the relevant podCIDRs based upon a
// given IP family. When the node is not capable of the requested family, ("", nil) is returned. The CIDRs are
// de-duplicated through a map, so their order in the returned slice is not defined. For a family that is neither
// IPv4 nor IPv6 the primary IP is empty and the slice has no entries.
func (nsc *NetworkServicesController) getPrimaryAndCIDRsByFamily(ipFamily v1.IPFamily) (string, []string) {
	var (
		primaryIP string
		// podCidrMatches records whether nsc.podCidr is set and belongs to the requested family.
		podCidrMatches bool
		// familyCIDRs is the family specific list of pod CIDRs kept on the controller.
		familyCIDRs []string
	)

	switch ipFamily {
	case v1.IPv4Protocol:
		// If we're not detected to be IPv4 capable break early
		if !nsc.isIPv4Capable {
			return "", nil
		}
		primaryIP = utils.FindBestIPv4NodeAddress(nsc.primaryIP, nsc.nodeIPv4Addrs).String()
		podCidrMatches = len(nsc.podCidr) > 0 && netutils.IsIPv4CIDRString(nsc.podCidr)
		familyCIDRs = nsc.podIPv4CIDRs
	case v1.IPv6Protocol:
		// If we're not detected to be IPv6 capable break early
		if !nsc.isIPv6Capable {
			return "", nil
		}
		primaryIP = utils.FindBestIPv6NodeAddress(nsc.primaryIP, nsc.nodeIPv6Addrs).String()
		podCidrMatches = len(nsc.podCidr) > 0 && netutils.IsIPv6CIDRString(nsc.podCidr)
		familyCIDRs = nsc.podIPv6CIDRs
	}

	// A map is used so that a CIDR that shows up more than once is only returned once.
	uniqueCIDRs := make(map[string]bool)
	if podCidrMatches {
		uniqueCIDRs[nsc.podCidr] = true
	}
	for _, cidr := range familyCIDRs {
		uniqueCIDRs[cidr] = true
	}

	cidrs := make([]string, 0, len(uniqueCIDRs))
	for cidr := range uniqueCIDRs {
		cidrs = append(cidrs, cidr)
	}

	return primaryIP, cidrs
}
// getAllClusterIPs returns all of the cluster IPs on a service (svc.clusterIP plus every parsable entry of
// svc.clusterIPs) separated into IPv4 and IPv6 lists. Repeated addresses are only returned once and entries that
// fail to parse as an IP are skipped.
func getAllClusterIPs(svc *serviceInfo) map[v1.IPFamily][]net.IP {
	// Sets keyed by the textual IP so that repeated addresses collapse into one entry
	v4Set := make(map[string]bool)
	v6Set := make(map[string]bool)

	// record files ip under IPv4 when it has a 4-byte form and under IPv6 otherwise.
	record := func(ip net.IP) {
		target := v6Set
		if ip.To4() != nil {
			target = v4Set
		}
		target[ip.String()] = true
	}

	if svc.clusterIP != nil {
		record(svc.clusterIP)
	}
	for _, rawIP := range svc.clusterIPs {
		if parsed := net.ParseIP(rawIP); parsed != nil {
			record(parsed)
		}
	}

	return convertIPMapsToFamilyMap(v4Set, v6Set)
}
// getAllExternalIPs returns all of the external IPs on a service separated into IPv4 and IPv6 lists. When
// includeLBIPs is true, the service's load balancer IPs are included as well. Repeated addresses are only returned
// once and entries that fail to parse as an IP are skipped.
func getAllExternalIPs(svc *serviceInfo, includeLBIPs bool) map[v1.IPFamily][]net.IP {
	// Sets keyed by the textual IP so that repeated addresses collapse into one entry
	v4Set := make(map[string]bool)
	v6Set := make(map[string]bool)

	// collect parses every entry and files it under IPv4 when it has a 4-byte form and under IPv6 otherwise.
	collect := func(rawIPs []string) {
		for _, rawIP := range rawIPs {
			parsed := net.ParseIP(rawIP)
			switch {
			case parsed == nil:
				// not a valid IP, nothing to record
			case parsed.To4() != nil:
				v4Set[parsed.String()] = true
			default:
				v6Set[parsed.String()] = true
			}
		}
	}

	collect(svc.externalIPs)
	if includeLBIPs {
		collect(svc.loadBalancerIPs)
	}

	return convertIPMapsToFamilyMap(v4Set, v6Set)
}
// convertIPMapsToFamilyMap converts family specific maps of string IPs to a single map that is keyed by IPFamily and
// has parsed net.IPs. Both the IPv4 and the IPv6 key are always present in the result, even when their list is
// empty. Only the keys of the passed maps are read, and the order of the resulting lists is not defined.
func convertIPMapsToFamilyMap(v4Map map[string]bool, v6Map map[string]bool) map[v1.IPFamily][]net.IP {
	// parseAll turns the keys of one set into a list of parsed IPs.
	parseAll := func(ipSet map[string]bool) []net.IP {
		parsed := make([]net.IP, 0, len(ipSet))
		for rawIP := range ipSet {
			parsed = append(parsed, net.ParseIP(rawIP))
		}
		return parsed
	}

	return map[v1.IPFamily][]net.IP{
		v1.IPv4Protocol: parseAll(v4Map),
		v1.IPv6Protocol: parseAll(v6Map),
	}
}
// hairpinRuleFrom creates hairpin SNAT rules for a given set of parameters and stores them in ruleMap. One rule is
// produced per service IP: traffic going from the endpoint back to itself through the IPVS virtual service
// (svcIP:servicePort) is SNAT'ed to the service IP.
//
// ruleMap is keyed by the textual form of the rule (written to match what iptables.List() prints) and holds the
// argument list for that rule as the value. The endpoint is always matched as a single host, so the prefix length
// follows endpointFamily (/32 for IPv4, /128 for IPv6) for both the source and the destination match. For any other
// family no prefix length is appended.
func hairpinRuleFrom(serviceIPs []net.IP, endpointIP string, endpointFamily v1.IPFamily, servicePort int,
	ruleMap map[string][]string) {
	var vipSubnet string

	switch endpointFamily {
	case v1.IPv4Protocol:
		vipSubnet = "/32"
	case v1.IPv6Protocol:
		vipSubnet = "/128"
	}

	endpointCIDR := endpointIP + vipSubnet
	vport := strconv.Itoa(servicePort)

	for _, svcIP := range serviceIPs {
		vaddr := svcIP.String()

		// Source and destination both use the family specific host prefix so that the arguments agree with the
		// rule string below.
		ruleArgs := []string{"-s", endpointCIDR, "-d", endpointCIDR,
			"-m", "ipvs", "--vaddr", vaddr, "--vport", vport,
			"-j", "SNAT", "--to-source", vaddr}

		// Trying to ensure this matches iptables.List()
		ruleString := "-A " + ipvsHairpinChainName + " -s " + endpointCIDR + " -d " + endpointCIDR +
			" -m ipvs" + " --vaddr " + vaddr + " --vport " + vport +
			" -j SNAT" + " --to-source " + vaddr

		ruleMap[ruleString] = ruleArgs
	}
}
// ensureHairpinChain makes sure that the hairpin chain in the nat table exists and that it has a jump rule from the
// POSTROUTING chain. The chain is only created when it is not listed yet, and the jump rule is added through
// AppendUnique.
func ensureHairpinChain(iptablesCmdHandler utils.IPTablesHandler) error {
	existingChains, err := iptablesCmdHandler.ListChains("nat")
	if err != nil {
		return fmt.Errorf("failed to list iptables chains: %v", err)
	}

	chainExists := false
	for _, name := range existingChains {
		if name == ipvsHairpinChainName {
			chainExists = true
			break
		}
	}

	// Create a chain for hairpin rules, if needed
	if !chainExists {
		if err = iptablesCmdHandler.NewChain("nat", ipvsHairpinChainName); err != nil {
			return fmt.Errorf("failed to create iptables chain \"%s\": %v", ipvsHairpinChainName, err)
		}
	}

	// Create a jump that points to our hairpin chain
	jumpRule := []string{"-m", "ipvs", "--vdir", "ORIGINAL", "-j", ipvsHairpinChainName}
	if err = iptablesCmdHandler.AppendUnique("nat", "POSTROUTING", jumpRule...); err != nil {
		return fmt.Errorf("failed to add hairpin iptables jump rule: %v", err)
	}

	return nil
}
// getAllLocalIPs returns all IP addresses found on any network interface in the system, excluding interfaces whose
// name contains "dummy", "kube", or "docker", in a map that distinguishes between IPv4 and IPv6 addresses by
// v1.IPFamily. Repeated addresses are only returned once. An error is returned when the interfaces or the addresses
// of an interface cannot be listed.
func getAllLocalIPs() (map[v1.IPFamily][]net.IP, error) {
	// We use maps here so that we can de-duplicate repeat IP addresses
	v4Map := make(map[string]bool)
	v6Map := make(map[string]bool)

	links, err := netlink.LinkList()
	if err != nil {
		return nil, fmt.Errorf("could not load list of net interfaces: %v", err)
	}

	for _, link := range links {
		// do not include IPs for any interface that calls itself "dummy", "kube", or "docker"
		linkName := link.Attrs().Name
		if strings.Contains(linkName, "dummy") ||
			strings.Contains(linkName, "kube") ||
			strings.Contains(linkName, "docker") {
			continue
		}

		// Both families have to be requested here, otherwise the IPv6 list would always come back empty.
		linkAddrs, err := netlink.AddrList(link, netlink.FAMILY_ALL)
		if err != nil {
			return nil, fmt.Errorf("failed to get IPs for interface: %v", err)
		}

		for _, addr := range linkAddrs {
			if addr.IP.To4() != nil {
				v4Map[addr.IP.String()] = true
			} else {
				v6Map[addr.IP.String()] = true
			}
		}
	}

	return convertIPMapsToFamilyMap(v4Map, v6Map), nil
}
// getIPSetName formulates an IP Family specific ipset name based upon the prefix and IPFamily passed. For IPv6 the
// name is nameBase prefixed with "inet6:"; for every other family nameBase is returned unchanged.
func getIPSetName(nameBase string, family v1.IPFamily) string {
	if family != v1.IPv6Protocol {
		return nameBase
	}
	return "inet6:" + nameBase
}
// getIPVSFirewallInputChainRule creates the IPVS firewall input chain rule based upon the family that is passed. The
// rule matches traffic whose destination is in the family specific service IPs ipset and jumps to the IPVS firewall
// chain. It is the iptables rule for use in {setup,cleanup}IpvsFirewall.
func getIPVSFirewallInputChainRule(family v1.IPFamily) []string {
	const ruleComment = "handle traffic to IPVS service IPs in custom chain"

	serviceIPSet := getIPSetName(serviceIPsIPSetName, family)

	rule := []string{"-m", "comment", "--comment", ruleComment}
	rule = append(rule, "-m", "set", "--match-set", serviceIPSet, "dst")
	rule = append(rule, "-j", ipvsFirewallChainName)
	return rule
}
// runIPCommandsWithArgs builds (but does not start) an exec.Cmd for the "ip" binary whose arguments are ipArgs
// followed by additionalArgs. The arguments are copied into a fresh slice, so neither input is modified.
func runIPCommandsWithArgs(ipArgs []string, additionalArgs ...string) *exec.Cmd {
	combined := make([]string, 0, len(ipArgs)+len(additionalArgs))
	for _, group := range [][]string{ipArgs, additionalArgs} {
		combined = append(combined, group...)
	}
	return exec.Command("ip", combined...)
}
// getLabelFromMap looks up label (for example service.kubernetes.io/service-proxy-name) in the passed labels and
// returns its value when it exists, otherwise it returns an error. A label that is present with an empty value is
// returned as "" with a nil error. A nil labels map is treated as having no labels.
func getLabelFromMap(label string, labels map[string]string) (string, error) {
	// A direct lookup is enough; there is no need to scan every entry of the map.
	if val, ok := labels[label]; ok {
		return val, nil
	}
	return "", fmt.Errorf("label doesn't exist in map")
}