mirror of
https://github.com/cloudnativelabs/kube-router.git
synced 2025-09-26 02:21:03 +02:00
Abide by the service.kubernetes.io/service-proxy-name label as defined by the upstream standard here: https://github.com/kubernetes-sigs/kpng/blob/master/doc/service-proxy.md#ignored-servicesendpoints. Resolves the failing e2e test "should implement service.kubernetes.io/service-proxy-name". Fixes: #979
545 lines
17 KiB
Go
545 lines
17 KiB
Go
package proxy
|
|
|
|
import (
|
|
"fmt"
|
|
"hash/fnv"
|
|
"net"
|
|
"os/exec"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/cloudnativelabs/kube-router/v2/pkg/cri"
|
|
"github.com/cloudnativelabs/kube-router/v2/pkg/utils"
|
|
"github.com/vishvananda/netlink"
|
|
"github.com/vishvananda/netns"
|
|
v1 "k8s.io/api/core/v1"
|
|
"k8s.io/klog/v2"
|
|
netutils "k8s.io/utils/net"
|
|
)
|
|
|
|
// interfaceWaitSleepTime is a 100ms pause. Presumably it is the poll interval used while waiting for a network
// interface to show up; its consumer is not visible in this part of the file - confirm against callers.
const interfaceWaitSleepTime = 100 * time.Millisecond

// sysfs locations and values related to bridge port hairpin mode.
const (
	// sysFSVirtualNetPath is the sysfs directory for virtual network devices.
	sysFSVirtualNetPath = "/sys/devices/virtual/net"
	// sysFSHairpinRelPath is the hairpin mode file path, relative to a device directory.
	sysFSHairpinRelPath = "brport/hairpin_mode"
	// hairpinEnable is presumably the value written to the hairpin mode file to switch hairpin mode on.
	hairpinEnable = "1"
)
|
|
|
|
func attemptNamespaceResetAfterError(hostNSHandle netns.NsHandle) {
|
|
err := netns.Set(hostNSHandle)
|
|
if err != nil {
|
|
klog.Errorf("failed to set hostNetworkNamespace while resetting namespace after a previous error due to %v",
|
|
err)
|
|
}
|
|
activeNetworkNamespaceHandle, err := netns.Get()
|
|
if err != nil {
|
|
klog.Errorf("failed to confirm activeNetworkNamespace while resetting namespace after "+
|
|
"a previous error due to %v", err)
|
|
return
|
|
}
|
|
klog.V(2).Infof("Current network namespace after revert namespace to host network namespace: %s",
|
|
activeNetworkNamespaceHandle.String())
|
|
_ = activeNetworkNamespaceHandle.Close()
|
|
}
|
|
|
|
// generateUniqueFWMark generates a unique uint32 hash value using the IP address, port, and protocol. This can then
|
|
// be used in IPVS, ip rules, and iptables to mark and later identify packets. FWMarks along with ip, port, and protocol
|
|
// are then stored in a map on the NSC and can be used later for lookup and as a general translation layer. If after
|
|
// maxUniqueFWMarkInc tries, generateUniqueFWMark is not able to find a unique permutation to use, an error is returned.
|
|
func (nsc *NetworkServicesController) generateUniqueFWMark(ip, protocol, port string) (uint32, error) {
|
|
// Generate a unit32 hash value using the IP address, port and protocol. This has been moved to an anonymous
|
|
// function since calling this without guarantees of uniqueness is unsafe.
|
|
generateFWMark := func(ip, protocol, port string, increment int) (uint32, error) {
|
|
const maxFwMarkBitSize = 0x3FFF
|
|
var err error
|
|
h := fnv.New32a()
|
|
if increment == 0 {
|
|
_, err = h.Write([]byte(ip + "-" + protocol + "-" + port))
|
|
} else {
|
|
_, err = h.Write([]byte(ip + "-" + protocol + "-" + port + "-" + fmt.Sprintf("%d", increment)))
|
|
}
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return h.Sum32() & maxFwMarkBitSize, err
|
|
}
|
|
|
|
const maxUniqueFWMarkInc = 16380
|
|
increment := 0
|
|
serviceKey := fmt.Sprintf("%s-%s-%s", ip, protocol, port)
|
|
for {
|
|
potentialFWMark, err := generateFWMark(ip, protocol, port, increment)
|
|
if err != nil {
|
|
return potentialFWMark, err
|
|
}
|
|
if foundServiceKey, ok := nsc.fwMarkMap[potentialFWMark]; ok {
|
|
if foundServiceKey != serviceKey {
|
|
increment++
|
|
continue
|
|
}
|
|
}
|
|
if increment >= maxUniqueFWMarkInc {
|
|
return 0, fmt.Errorf("could not obtain a unique FWMark for %s:%s:%s after %d tries",
|
|
protocol, ip, port, maxUniqueFWMarkInc)
|
|
}
|
|
nsc.fwMarkMap[potentialFWMark] = serviceKey
|
|
return potentialFWMark, nil
|
|
}
|
|
}
|
|
|
|
// lookupFWMarkByService finds the related FW mark from the internal fwMarkMap kept by the NetworkServiceController
|
|
// given the related ip, protocol, and port. If it isn't able to find a matching FW mark, then it returns an error.
|
|
func (nsc *NetworkServicesController) lookupFWMarkByService(ip, protocol, port string) uint32 {
|
|
needle := fmt.Sprintf("%s-%s-%s", ip, protocol, port)
|
|
for fwMark, serviceKey := range nsc.fwMarkMap {
|
|
if needle == serviceKey {
|
|
return fwMark
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// lookupServiceByFWMark Lookup service ip, protocol, port by given FW Mark value (reverse of lookupFWMarkByService)
|
|
func (nsc *NetworkServicesController) lookupServiceByFWMark(fwMark uint32) (string, string, int, error) {
|
|
serviceKey, ok := nsc.fwMarkMap[fwMark]
|
|
if !ok {
|
|
return "", "", 0, fmt.Errorf("could not find service matching the given FW mark")
|
|
}
|
|
serviceKeySplit := strings.Split(serviceKey, "-")
|
|
if len(serviceKeySplit) != 3 {
|
|
return "", "", 0, fmt.Errorf("service key for found FW mark did not have 3 parts, this shouldn't be possible")
|
|
}
|
|
port, err := strconv.ParseInt(serviceKeySplit[2], 10, 32)
|
|
if err != nil {
|
|
return "", "", 0, fmt.Errorf("port number for service key for found FW mark was not a 32-bit int: %v", err)
|
|
}
|
|
return serviceKeySplit[0], serviceKeySplit[1], int(port), nil
|
|
}
|
|
|
|
// unsortedListsEquivalent compares two lists of endpointsInfo and considers them the same if they contains the same
|
|
// contents regardless of order. Returns true if both lists contain the same contents.
|
|
func unsortedListsEquivalent(a, b []endpointSliceInfo) bool {
|
|
if len(a) != len(b) {
|
|
return false
|
|
}
|
|
|
|
values := make(map[interface{}]int)
|
|
for _, val := range a {
|
|
values[val] = 1
|
|
}
|
|
for _, val := range b {
|
|
values[val]++
|
|
}
|
|
|
|
for _, val := range values {
|
|
if val == 1 {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// endpointsMapsEquivalent compares two maps of endpointsInfoMap to see if they have the same keys and values. Returns
|
|
// true if both maps contain the same keys and values.
|
|
func endpointsMapsEquivalent(a, b endpointSliceInfoMap) bool {
|
|
if len(a) != len(b) {
|
|
return false
|
|
}
|
|
|
|
for key, valA := range a {
|
|
valB, ok := b[key]
|
|
if !ok {
|
|
return false
|
|
}
|
|
|
|
if !unsortedListsEquivalent(valA, valB) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// convertSvcProtoToSysCallProto converts a string based protocol that we receive from Kubernetes via something like the
|
|
// serviceInfo object into the uint16 syscall version of the protocol that is capable of interfacing with aspects of the
|
|
// Linux sub-sysem like IPVS
|
|
func convertSvcProtoToSysCallProto(svcProtocol string) uint16 {
|
|
switch svcProtocol {
|
|
case tcpProtocol:
|
|
return syscall.IPPROTO_TCP
|
|
case udpProtocol:
|
|
return syscall.IPPROTO_UDP
|
|
default:
|
|
return syscall.IPPROTO_NONE
|
|
}
|
|
}
|
|
|
|
// convertSysCallProtoToSvcProto converts a syscall based protocol version to a string representation that Kubernetes
|
|
// and other parts of kube-router understand
|
|
func convertSysCallProtoToSvcProto(sysProtocol uint16) string {
|
|
switch sysProtocol {
|
|
case syscall.IPPROTO_TCP:
|
|
return tcpProtocol
|
|
case syscall.IPPROTO_UDP:
|
|
return udpProtocol
|
|
default:
|
|
return noneProtocol
|
|
}
|
|
}
|
|
|
|
// findContainerRuntimeReferences find the container runtime and container ID for a given endpoint IP do this by:
|
|
// - Resolving the endpoint IP to a pod
|
|
// - Ensure that the pod actually exists on the node in question
|
|
// - Get the container ID of the primary container (since this function primarily allows us to enter the pod's
|
|
// namespace, it doesn't really matter which container we choose here, if this function gets used for something
|
|
// else in the future, this might have to be re-evaluated)
|
|
func (nsc *NetworkServicesController) findContainerRuntimeReferences(endpointIP string) (string, string, error) {
|
|
podObj, err := nsc.getPodObjectForEndpoint(endpointIP)
|
|
if err != nil {
|
|
return "", "", fmt.Errorf("failed to find endpoint with ip: %s. so skipping preparing endpoint for DSR",
|
|
endpointIP)
|
|
}
|
|
|
|
// we are only concerned with endpoint pod running on current node
|
|
if strings.Compare(podObj.Status.HostIP, nsc.primaryIP.String()) != 0 {
|
|
return "", "", nil
|
|
}
|
|
|
|
containerURL := podObj.Status.ContainerStatuses[0].ContainerID
|
|
runtime, containerID, err := cri.EndpointParser(containerURL)
|
|
if err != nil {
|
|
return "", "", fmt.Errorf("couldn't get containerID (container=%s, pod=%s). Skipping DSR endpoint set up",
|
|
podObj.Spec.Containers[0].Name, podObj.Name)
|
|
}
|
|
|
|
if containerID == "" {
|
|
return "", "", fmt.Errorf("failed to find container id for the endpoint with ip: %s so skipping preparing "+
|
|
"endpoint for DSR", endpointIP)
|
|
}
|
|
|
|
return runtime, containerID, nil
|
|
}
|
|
|
|
// addDSRIPInsidePodNetNamespace takes a given external IP and endpoint IP for a DSR service and then uses the container
|
|
// runtime to add the external IP to a virtual interface inside the pod so that it can receive DSR traffic inside its
|
|
// network namespace.
|
|
func (nsc *NetworkServicesController) addDSRIPInsidePodNetNamespace(externalIP, endpointIP string) error {
|
|
crRuntime, containerID, err := nsc.findContainerRuntimeReferences(endpointIP)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
runtime.LockOSThread()
|
|
defer runtime.UnlockOSThread()
|
|
|
|
hostNetworkNamespaceHandle, err := netns.Get()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get namespace due to %v", err)
|
|
}
|
|
defer utils.CloseCloserDisregardError(&hostNetworkNamespaceHandle)
|
|
|
|
var pid int
|
|
if crRuntime == "docker" {
|
|
// WARN: This method is deprecated and will be removed once docker-shim is removed from kubelet.
|
|
pid, err = nsc.ln.getContainerPidWithDocker(containerID)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to prepare endpoint %s to do direct server return due to %v",
|
|
endpointIP, err)
|
|
}
|
|
} else {
|
|
// We expect CRI compliant runtimes here
|
|
// ugly workaround, refactoring of pkg/Proxy is required
|
|
pid, err = nsc.ln.getContainerPidWithCRI(nsc.dsr.runtimeEndpoint, containerID)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to prepare endpoint %s to do DSR due to: %v", endpointIP, err)
|
|
}
|
|
}
|
|
|
|
return nsc.ln.configureContainerForDSR(externalIP, endpointIP, containerID, pid, hostNetworkNamespaceHandle)
|
|
}
|
|
|
|
// getPrimaryAndCIDRsByFamily returns the best primary nodeIP and a slice of all of the relevant podCIDRs based upon a
|
|
// given IP family
|
|
func (nsc *NetworkServicesController) getPrimaryAndCIDRsByFamily(ipFamily v1.IPFamily) (string, []string) {
|
|
var primaryIP string
|
|
cidrMap := make(map[string]bool)
|
|
switch ipFamily {
|
|
case v1.IPv4Protocol:
|
|
// If we're not detected to be IPv4 capable break early
|
|
if !nsc.isIPv4Capable {
|
|
return "", nil
|
|
}
|
|
|
|
primaryIP = utils.FindBestIPv4NodeAddress(nsc.primaryIP, nsc.nodeIPv4Addrs).String()
|
|
if len(nsc.podCidr) > 0 && netutils.IsIPv4CIDRString(nsc.podCidr) {
|
|
cidrMap[nsc.podCidr] = true
|
|
}
|
|
if len(nsc.podIPv4CIDRs) > 0 {
|
|
for _, cidr := range nsc.podIPv4CIDRs {
|
|
if _, ok := cidrMap[cidr]; !ok {
|
|
cidrMap[cidr] = true
|
|
}
|
|
}
|
|
}
|
|
case v1.IPv6Protocol:
|
|
// If we're not detected to be IPv6 capable break early
|
|
if !nsc.isIPv6Capable {
|
|
return "", nil
|
|
}
|
|
|
|
primaryIP = utils.FindBestIPv6NodeAddress(nsc.primaryIP, nsc.nodeIPv6Addrs).String()
|
|
if len(nsc.podCidr) > 0 && netutils.IsIPv6CIDRString(nsc.podCidr) {
|
|
cidrMap[nsc.podCidr] = true
|
|
}
|
|
if len(nsc.podIPv6CIDRs) > 0 {
|
|
for _, cidr := range nsc.podIPv6CIDRs {
|
|
if _, ok := cidrMap[cidr]; !ok {
|
|
cidrMap[cidr] = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
cidrs := make([]string, len(cidrMap))
|
|
idx := 0
|
|
for cidr := range cidrMap {
|
|
cidrs[idx] = cidr
|
|
idx++
|
|
}
|
|
|
|
return primaryIP, cidrs
|
|
}
|
|
|
|
// GetAllClusterIPs returns all of the cluster IPs on a service separated into IPv4 and IPv6 lists
|
|
func getAllClusterIPs(svc *serviceInfo) map[v1.IPFamily][]net.IP {
|
|
// We use maps here so that we can de-duplicate repeat IP addresses
|
|
v4Map := make(map[string]bool)
|
|
v6Map := make(map[string]bool)
|
|
|
|
if svc.clusterIP != nil {
|
|
if svc.clusterIP.To4() != nil {
|
|
v4Map[svc.clusterIP.String()] = true
|
|
} else {
|
|
v6Map[svc.clusterIP.String()] = true
|
|
}
|
|
}
|
|
|
|
for _, clIP := range svc.clusterIPs {
|
|
ip := net.ParseIP(clIP)
|
|
if ip == nil {
|
|
continue
|
|
}
|
|
|
|
if ip.To4() != nil {
|
|
v4Map[ip.String()] = true
|
|
} else {
|
|
v6Map[ip.String()] = true
|
|
}
|
|
}
|
|
|
|
return convertIPMapsToFamilyMap(v4Map, v6Map)
|
|
}
|
|
|
|
// getAllClusterIPs returns all of the cluster IPs on a service separated into IPv4 and IPv6 lists
|
|
func getAllExternalIPs(svc *serviceInfo, includeLBIPs bool) map[v1.IPFamily][]net.IP {
|
|
// We use maps here so that we can de-duplicate repeat IP addresses
|
|
v4Map := make(map[string]bool)
|
|
v6Map := make(map[string]bool)
|
|
|
|
for _, exIP := range svc.externalIPs {
|
|
ip := net.ParseIP(exIP)
|
|
if ip == nil {
|
|
continue
|
|
}
|
|
|
|
if ip.To4() != nil {
|
|
v4Map[ip.String()] = true
|
|
} else {
|
|
v6Map[ip.String()] = true
|
|
}
|
|
}
|
|
|
|
if !includeLBIPs {
|
|
return convertIPMapsToFamilyMap(v4Map, v6Map)
|
|
}
|
|
|
|
for _, lbIP := range svc.loadBalancerIPs {
|
|
ip := net.ParseIP(lbIP)
|
|
if ip == nil {
|
|
continue
|
|
}
|
|
|
|
if ip.To4() != nil {
|
|
v4Map[ip.String()] = true
|
|
} else {
|
|
v6Map[ip.String()] = true
|
|
}
|
|
}
|
|
|
|
return convertIPMapsToFamilyMap(v4Map, v6Map)
|
|
}
|
|
|
|
// convertIPMapsToFamilyMap converts family specific maps of string IPs to a single map that is keyed by IPFamily and
|
|
// has parsed net.IPs
|
|
func convertIPMapsToFamilyMap(v4Map map[string]bool, v6Map map[string]bool) map[v1.IPFamily][]net.IP {
|
|
allIPs := make(map[v1.IPFamily][]net.IP)
|
|
|
|
allIPs[v1.IPv4Protocol] = make([]net.IP, len(v4Map))
|
|
allIPs[v1.IPv6Protocol] = make([]net.IP, len(v6Map))
|
|
|
|
idx := 0
|
|
for ip := range v4Map {
|
|
allIPs[v1.IPv4Protocol][idx] = net.ParseIP(ip)
|
|
idx++
|
|
}
|
|
|
|
idx = 0
|
|
for ip := range v6Map {
|
|
allIPs[v1.IPv6Protocol][idx] = net.ParseIP(ip)
|
|
idx++
|
|
}
|
|
|
|
return allIPs
|
|
}
|
|
|
|
// hairpinRuleFrom create hairpin rules for a given set of parameters
|
|
func hairpinRuleFrom(serviceIPs []net.IP, endpointIP string, endpointFamily v1.IPFamily, servicePort int,
|
|
ruleMap map[string][]string) {
|
|
var vipSubnet string
|
|
|
|
switch endpointFamily {
|
|
case v1.IPv4Protocol:
|
|
vipSubnet = "/32"
|
|
case v1.IPv6Protocol:
|
|
vipSubnet = "/128"
|
|
}
|
|
|
|
for _, svcIP := range serviceIPs {
|
|
ruleArgs := []string{"-s", endpointIP + "/32", "-d", endpointIP + vipSubnet,
|
|
"-m", "ipvs", "--vaddr", svcIP.String(), "--vport", strconv.Itoa(servicePort),
|
|
"-j", "SNAT", "--to-source", svcIP.String()}
|
|
|
|
// Trying to ensure this matches iptables.List()
|
|
ruleString := "-A " + ipvsHairpinChainName + " -s " + endpointIP + vipSubnet + " -d " +
|
|
endpointIP + vipSubnet + " -m ipvs" + " --vaddr " + svcIP.String() + " --vport " +
|
|
strconv.Itoa(servicePort) + " -j SNAT" + " --to-source " + svcIP.String()
|
|
|
|
ruleMap[ruleString] = ruleArgs
|
|
}
|
|
}
|
|
|
|
// ensureHairpinChain make sure that the hairpin chain in the nat table exists and that it has a jump rule from the
|
|
// POSTROUTING chain
|
|
func ensureHairpinChain(iptablesCmdHandler utils.IPTablesHandler) error {
|
|
hasHairpinChain := false
|
|
chains, err := iptablesCmdHandler.ListChains("nat")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list iptables chains: %v", err)
|
|
}
|
|
for _, chain := range chains {
|
|
if chain == ipvsHairpinChainName {
|
|
hasHairpinChain = true
|
|
}
|
|
}
|
|
|
|
// Create a chain for hairpin rules, if needed
|
|
if !hasHairpinChain {
|
|
err = iptablesCmdHandler.NewChain("nat", ipvsHairpinChainName)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create iptables chain \"%s\": %v", ipvsHairpinChainName, err)
|
|
}
|
|
}
|
|
|
|
// Create a jump that points to our hairpin chain
|
|
jumpArgs := []string{"-m", "ipvs", "--vdir", "ORIGINAL", "-j", ipvsHairpinChainName}
|
|
err = iptablesCmdHandler.AppendUnique("nat", "POSTROUTING", jumpArgs...)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to add hairpin iptables jump rule: %v", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// getAllLocalIPs returns all IP addresses found on any network address in the system, excluding dummy and docker
|
|
// interfaces in a map that distinguishes between IPv4 and IPv6 addresses by v1.IPFamily
|
|
func getAllLocalIPs() (map[v1.IPFamily][]net.IP, error) {
|
|
// We use maps here so that we can de-duplicate repeat IP addresses
|
|
v4Map := make(map[string]bool)
|
|
v6Map := make(map[string]bool)
|
|
links, err := netlink.LinkList()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not load list of net interfaces: %v", err)
|
|
}
|
|
|
|
for _, link := range links {
|
|
// do not include IPs for any interface that calls itself "dummy", "kube", or "docker"
|
|
if strings.Contains(link.Attrs().Name, "dummy") ||
|
|
strings.Contains(link.Attrs().Name, "kube") ||
|
|
strings.Contains(link.Attrs().Name, "docker") {
|
|
|
|
continue
|
|
}
|
|
|
|
linkAddrs, err := netlink.AddrList(link, netlink.FAMILY_V4)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get IPs for interface: %v", err)
|
|
}
|
|
|
|
for _, addr := range linkAddrs {
|
|
if addr.IP.To4() != nil {
|
|
v4Map[addr.IP.String()] = true
|
|
} else {
|
|
v6Map[addr.IP.String()] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return convertIPMapsToFamilyMap(v4Map, v6Map), nil
|
|
}
|
|
|
|
// getIPSetName formulates an IP Family specific ipset name based upon the prefix and IPFamily passed
|
|
func getIPSetName(nameBase string, family v1.IPFamily) string {
|
|
var sb strings.Builder
|
|
|
|
if family == v1.IPv6Protocol {
|
|
sb.WriteString("inet6:")
|
|
}
|
|
sb.WriteString(nameBase)
|
|
|
|
return sb.String()
|
|
}
|
|
|
|
// getIPVSFirewallInputChainRule creates IPVS firwall input chain rule based upon the family that is passed. This is
|
|
// used by the NSC to ensure that traffic destined for IPVS services on the INPUT table will be directed to the IPVS
|
|
// firewall chain
|
|
func getIPVSFirewallInputChainRule(family v1.IPFamily) []string {
|
|
// The iptables rule for use in {setup,cleanup}IpvsFirewall.
|
|
return []string{
|
|
"-m", "comment", "--comment", "handle traffic to IPVS service IPs in custom chain",
|
|
"-m", "set", "--match-set", getIPSetName(serviceIPsIPSetName, family), "dst",
|
|
"-j", ipvsFirewallChainName}
|
|
}
|
|
|
|
// runIPCommandsWithArgs builds (but does not start) an exec.Cmd for the "ip" binary whose arguments are ipArgs
// followed by additionalArgs.
func runIPCommandsWithArgs(ipArgs []string, additionalArgs ...string) *exec.Cmd {
	// Copy into a fresh slice so neither input's backing array is touched.
	combined := make([]string, 0, len(ipArgs)+len(additionalArgs))
	combined = append(combined, ipArgs...)
	combined = append(combined, additionalArgs...)
	return exec.Command("ip", combined...)
}
|
|
|
|
// getLabelFromMap looks up the given label key in the passed labels and, if it exists, returns its value (which may
// be empty). If the key is not present, including when labels is nil, an error is returned.
func getLabelFromMap(label string, labels map[string]string) (string, error) {
	// Direct lookup instead of scanning every entry of the map for the key.
	if val, ok := labels[label]; ok {
		return val, nil
	}

	return "", fmt.Errorf("label doesn't exist in map")
}
|