package proxy

import (
	"errors"
	"fmt"
	"hash/fnv"
	"io/ioutil"
	"math/rand"
	"net"
	"os"
	"os/exec"
	"reflect"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"golang.org/x/net/context"

	"github.com/cloudnativelabs/kube-router/pkg/cri"
	"github.com/cloudnativelabs/kube-router/pkg/healthcheck"
	"github.com/cloudnativelabs/kube-router/pkg/metrics"
	"github.com/cloudnativelabs/kube-router/pkg/options"
	"github.com/cloudnativelabs/kube-router/pkg/utils"
	"github.com/coreos/go-iptables/iptables"
	"github.com/docker/docker/client"
	"github.com/moby/ipvs"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netns"
	api "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

const (
	KubeDummyIf       = "kube-dummy-if"
	KubeTunnelIf      = "kube-tunnel-if"
	IfaceNotFound     = "Link not found"
	IfaceHasAddr      = "file exists"
	IfaceHasNoAddr    = "cannot assign requested address"
	IpvsServerExists  = "file exists"
	IpvsMaglevHashing = "mh"
	IpvsSvcFSched1    = "flag-1"
	IpvsSvcFSched2    = "flag-2"
	IpvsSvcFSched3    = "flag-3"

	// Taken from https://github.com/torvalds/linux/blob/master/include/uapi/linux/ip_vs.h#L21
	ipvsPersistentFlagHex = 0x0001
	ipvsHashedFlagHex     = 0x0002
	ipvsOnePacketFlagHex  = 0x0004
	ipvsSched1FlagHex     = 0x0008
	ipvsSched2FlagHex     = 0x0010
	ipvsSched3FlagHex     = 0x0020

	// Taken from https://www.kernel.org/doc/Documentation/networking/ipvs-sysctl.txt
	ipvsConnReuseModeDisableSpecialHandling = 0
	ipvsExpireQuiescentTemplateEnable       = 1
	ipvsExpireNodestConnEnable              = 1
	ipvsConntrackEnable                     = 1

	// Taken from https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
	arpAnnounceUseBestLocalAddress      = 2
	arpIgnoreReplyOnlyIfTargetIPIsLocal = 1

	svcDSRAnnotation                = "kube-router.io/service.dsr"
	svcSchedulerAnnotation          = "kube-router.io/service.scheduler"
	svcHairpinAnnotation            = "kube-router.io/service.hairpin"
	svcHairpinExternalIPsAnnotation = "kube-router.io/service.hairpin.externalips"
	svcLocalAnnotation              = "kube-router.io/service.local"
	svcSkipLbIpsAnnotation          = "kube-router.io/service.skiplbips"
	svcSchedFlagsAnnotation         = "kube-router.io/service.schedflags"

	LeaderElectionRecordAnnotationKey = "control-plane.alpha.kubernetes.io/leader"
	localIPsIPSetName                 = "kube-router-local-ips"
	ipvsServicesIPSetName             = "kube-router-ipvs-services"
	serviceIPsIPSetName               = "kube-router-service-ips"
	ipvsFirewallChainName             = "KUBE-ROUTER-SERVICES"
	ipvsHairpinChainName              = "KUBE-ROUTER-HAIRPIN"
	synctypeAll                       = iota
	synctypeIpvs

	tcpProtocol         = "tcp"
	udpProtocol         = "udp"
	tunnelInterfaceType = "tunnel"

	gracefulTermServiceTickTime = 5 * time.Second
)

var (
	NodeIP net.IP
)

type ipvsCalls interface {
	ipvsNewService(ipvsSvc *ipvs.Service) error
	ipvsAddService(svcs []*ipvs.Service, vip net.IP, protocol, port uint16, persistent bool,
		persistentTimeout int32, scheduler string, flags schedFlags) (*ipvs.Service, error)
	ipvsDelService(ipvsSvc *ipvs.Service) error
	ipvsUpdateService(ipvsSvc *ipvs.Service) error
	ipvsGetServices() ([]*ipvs.Service, error)
	ipvsAddServer(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error
	ipvsNewDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error
	ipvsUpdateDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error
	ipvsGetDestinations(ipvsSvc *ipvs.Service) ([]*ipvs.Destination, error)
	ipvsDelDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error
	ipvsAddFWMarkService(vip net.IP, protocol, port uint16, persistent bool, persistentTimeout int32,
		scheduler string, flags schedFlags) (*ipvs.Service, error)
}

type netlinkCalls interface {
	ipAddrAdd(iface netlink.Link, ip string, addRoute bool) error
	ipAddrDel(iface netlink.Link, ip string) error
	prepareEndpointForDsrWithDocker(containerID string, endpointIP string, vip string) error
	getKubeDummyInterface() (netlink.Link, error)
	setupRoutesForExternalIPForDSR(serviceInfoMap) error
	prepareEndpointForDsrWithCRI(runtimeEndpoint, containerID, endpointIP, vip string) error
	configureContainerForDSR(vip, endpointIP, containerID string, pid int,
		hostNetworkNamespaceHandle netns.NsHandle) error
	setupPolicyRoutingForDSR() error
	cleanupMangleTableRule(ip string, protocol string, port string, fwmark string, tcpMSS int) error
}

// LinuxNetworking interface contains all linux networking subsystem calls
//go:generate moq -out network_services_controller_moq.go . LinuxNetworking
type LinuxNetworking interface {
	ipvsCalls
	netlinkCalls
}

type linuxNetworking struct {
	ipvsHandle *ipvs.Handle
}

func (ln *linuxNetworking) ipAddrDel(iface netlink.Link, ip string) error {
	naddr := &netlink.Addr{IPNet: &net.IPNet{
		IP: net.ParseIP(ip), Mask: net.IPv4Mask(255, 255, 255, 255),
	}, Scope: syscall.RT_SCOPE_LINK}
	err := netlink.AddrDel(iface, naddr)
	if err != nil && err.Error() != IfaceHasNoAddr {
		klog.Errorf("Failed to verify if external IP %s is associated with dummy interface %s due to %s",
			naddr.IPNet.IP.String(), KubeDummyIf, err.Error())
	}
	// Delete the VIP's entry from the "local" routing table as well; fail silently if not found (DSR special case)
	if err == nil {
		out, err := exec.Command("ip", "route", "delete", "local", ip, "dev", KubeDummyIf,
			"table", "local", "proto", "kernel", "scope", "host", "src",
			NodeIP.String(), "table", "local").CombinedOutput()
		if err != nil && !strings.Contains(string(out), "No such process") {
			klog.Errorf("Failed to delete route to service VIP %s configured on %s. Error: %v, Output: %s",
				ip, KubeDummyIf, err, out)
		}
	}
	return err
}

// utility method to assign an IP to an interface. Mainly used to assign service VIPs
// to kube-dummy-if. When DSR is used, it is also used to assign the VIP to the dummy
// interface inside the container.
func (ln *linuxNetworking) ipAddrAdd(iface netlink.Link, ip string, addRoute bool) error {
	naddr := &netlink.Addr{IPNet: &net.IPNet{
		IP: net.ParseIP(ip), Mask: net.IPv4Mask(255, 255, 255, 255),
	}, Scope: syscall.RT_SCOPE_LINK}
	err := netlink.AddrAdd(iface, naddr)
	if err != nil && err.Error() != IfaceHasAddr {
		klog.Errorf("Failed to assign cluster ip %s to dummy interface: %s",
			naddr.IPNet.IP.String(), err.Error())
		return err
	}

	// When a service VIP is assigned to a dummy interface and accessed from the host, in some
	// cases the Linux source IP selection logic selects the VIP itself as the source, leading
	// to problems. To avoid this, an explicit route entry is added so that the node IP is used
	// as the source IP when accessing the VIP from the host.
	// Please see https://github.com/cloudnativelabs/kube-router/issues/376

	if !addRoute {
		return nil
	}

	// TODO: netlink.RouteReplace, which is the replacement for the command below, is not working
	// as expected: the call succeeds but the route is not replaced. For now do it with a command.
	out, err := exec.Command("ip", "route", "replace", "local", ip, "dev", KubeDummyIf,
		"table", "local", "proto", "kernel", "scope", "host", "src",
		NodeIP.String(), "table", "local").CombinedOutput()
	if err != nil {
		klog.Errorf("Failed to replace route to service VIP %s configured on %s. Error: %v, Output: %s",
			ip, KubeDummyIf, err, out)
	}
	return nil
}

func (ln *linuxNetworking) ipvsGetServices() ([]*ipvs.Service, error) {
	return ln.ipvsHandle.GetServices()
}

func (ln *linuxNetworking) ipvsGetDestinations(ipvsSvc *ipvs.Service) ([]*ipvs.Destination, error) {
	return ln.ipvsHandle.GetDestinations(ipvsSvc)
}

func (ln *linuxNetworking) ipvsDelDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error {
	return ln.ipvsHandle.DelDestination(ipvsSvc, ipvsDst)
}

func (ln *linuxNetworking) ipvsNewDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error {
	return ln.ipvsHandle.NewDestination(ipvsSvc, ipvsDst)
}

func (ln *linuxNetworking) ipvsUpdateDestination(ipvsSvc *ipvs.Service, ipvsDst *ipvs.Destination) error {
	return ln.ipvsHandle.UpdateDestination(ipvsSvc, ipvsDst)
}

func (ln *linuxNetworking) ipvsDelService(ipvsSvc *ipvs.Service) error {
	return ln.ipvsHandle.DelService(ipvsSvc)
}

func (ln *linuxNetworking) ipvsUpdateService(ipvsSvc *ipvs.Service) error {
	return ln.ipvsHandle.UpdateService(ipvsSvc)
}

func (ln *linuxNetworking) ipvsNewService(ipvsSvc *ipvs.Service) error {
	return ln.ipvsHandle.NewService(ipvsSvc)
}

func newLinuxNetworking() (*linuxNetworking, error) {
	ln := &linuxNetworking{}
	ipvsHandle, err := ipvs.New("")
	if err != nil {
		return nil, err
	}
	ln.ipvsHandle = ipvsHandle
	return ln, nil
}

// NetworkServicesController enables the local node as a network service proxy through IPVS/LVS.
// Only Kubernetes services of type NodePort, ClusterIP, and LoadBalancer are supported. For each
// service an IPVS service is created, and for each service endpoint a real server is added to the
// IPVS service. As services and endpoints are updated, the network services controller gets the
// updates from the Kubernetes API server and syncs the IPVS configuration to reflect the state of
// services and endpoints.

// NetworkServicesController struct stores information needed by the controller
type NetworkServicesController struct {
	nodeIP              net.IP
	nodeHostName        string
	syncPeriod          time.Duration
	mu                  sync.Mutex
	serviceMap          serviceInfoMap
	endpointsMap        endpointsInfoMap
	podCidr             string
	excludedCidrs       []net.IPNet
	masqueradeAll       bool
	globalHairpin       bool
	ipvsPermitAll       bool
	client              kubernetes.Interface
	nodeportBindOnAllIP bool
	MetricsEnabled      bool
	metricsMap          map[string][]string
	ln                  LinuxNetworking
	readyForUpdates     bool
	ProxyFirewallSetup  *sync.Cond
	ipsetMutex          *sync.Mutex

	// Map of ipsets that we use.
	ipsetMap map[string]*utils.Set

	svcLister cache.Indexer
	epLister  cache.Indexer
	podLister cache.Indexer

	ServiceEventHandler   cache.ResourceEventHandler
	EndpointsEventHandler cache.ResourceEventHandler

	gracefulPeriod      time.Duration
	gracefulQueue       gracefulQueue
	gracefulTermination bool
	syncChan            chan int
	dsr                 *dsrOpt
	dsrTCPMSS           int
}

// DSR related options
type dsrOpt struct {
	runtimeEndpoint string
}

// internal representation of kubernetes service
type serviceInfo struct {
	name                          string
	namespace                     string
	clusterIP                     net.IP
	port                          int
	targetPort                    string
	protocol                      string
	nodePort                      int
	sessionAffinity               bool
	sessionAffinityTimeoutSeconds int32
	directServerReturn            bool
	scheduler                     string
	directServerReturnMethod      string
	hairpin                       bool
	hairpinExternalIPs            bool
	skipLbIps                     bool
	externalIPs                   []string
	loadBalancerIPs               []string
	local                         bool
	flags                         schedFlags
}

// IPVS scheduler flags
type schedFlags struct {
	flag1 bool /* ipvs scheduler flag-1 */
	flag2 bool /* ipvs scheduler flag-2 */
	flag3 bool /* ipvs scheduler flag-3 */
}

// map of all services, with a unique service id (namespace, service name, port) as the key
type serviceInfoMap map[string]*serviceInfo

// internal representation of endpoints
type endpointsInfo struct {
	ip      string
	port    int
	isLocal bool
}

// map of all endpoints, with a unique service id (namespace, service name, port) as the key
type endpointsInfoMap map[string][]endpointsInfo
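
// Keys in both maps above come from generateServiceID (defined elsewhere in
// this package), producing one entry per (namespace, service name, port name)
// tuple; e.g. the "http" port of Service default/web gets its own key,
// distinct from the same Service's "https" port.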

// Run periodically syncs the ipvs configuration to reflect the desired state of services and endpoints
func (nsc *NetworkServicesController) Run(healthChan chan<- *healthcheck.ControllerHeartbeat,
	stopCh <-chan struct{}, wg *sync.WaitGroup) {
	t := time.NewTicker(nsc.syncPeriod)
	defer t.Stop()
	defer wg.Done()
	defer close(nsc.syncChan)

	klog.Infof("Starting network services controller")

	klog.V(1).Info("Performing cleanup of deprecated masquerade iptables rules (if needed).")
	err := nsc.deleteBadMasqueradeIptablesRules()
	if err != nil {
		klog.Errorf("Error cleaning up old/bad masquerade rules: %s", err.Error())
	}

	// enable masquerade rule
	err = nsc.ensureMasqueradeIptablesRule()
	if err != nil {
		klog.Errorf("Failed to add masquerade rule in POSTROUTING chain of nat table due to: %s", err.Error())
	}
	// https://www.kernel.org/doc/Documentation/networking/ipvs-sysctl.txt
	// enable ipvs connection tracking
	sysctlErr := utils.SetSysctl("net/ipv4/vs/conntrack", ipvsConntrackEnable)
	if sysctlErr != nil {
		klog.Error(sysctlErr.Error())
	}

	// LVS failover not working with UDP packets https://access.redhat.com/solutions/58653
	sysctlErr = utils.SetSysctl("net/ipv4/vs/expire_nodest_conn", ipvsExpireNodestConnEnable)
	if sysctlErr != nil {
		klog.Error(sysctlErr.Error())
	}

	// LVS failover not working with UDP packets https://access.redhat.com/solutions/58653
	sysctlErr = utils.SetSysctl("net/ipv4/vs/expire_quiescent_template", ipvsExpireQuiescentTemplateEnable)
	if sysctlErr != nil {
		klog.Error(sysctlErr.Error())
	}

	// https://github.com/kubernetes/kubernetes/pull/71114
	sysctlErr = utils.SetSysctl("net/ipv4/vs/conn_reuse_mode", ipvsConnReuseModeDisableSpecialHandling)
	if sysctlErr != nil {
		// Check if the error is fatal; on older kernels this option does not exist and the same
		// behaviour is the default. If the option is not found, just log it.
		if sysctlErr.IsFatal() {
			klog.Fatal(sysctlErr.Error())
		} else {
			klog.Info(sysctlErr.Error())
		}
	}

	// https://github.com/kubernetes/kubernetes/pull/70530/files
	sysctlErr = utils.SetSysctl("net/ipv4/conf/all/arp_ignore", arpIgnoreReplyOnlyIfTargetIPIsLocal)
	if sysctlErr != nil {
		klog.Error(sysctlErr.Error())
	}

	// https://github.com/kubernetes/kubernetes/pull/70530/files
	sysctlErr = utils.SetSysctl("net/ipv4/conf/all/arp_announce", arpAnnounceUseBestLocalAddress)
	if sysctlErr != nil {
		klog.Error(sysctlErr.Error())
	}

	// https://github.com/cloudnativelabs/kube-router/issues/282
	err = nsc.setupIpvsFirewall()
	if err != nil {
		klog.Error("Error setting up ipvs firewall: " + err.Error())
	}
	nsc.ProxyFirewallSetup.Broadcast()

	gracefulTicker := time.NewTicker(gracefulTermServiceTickTime)
	defer gracefulTicker.Stop()

	select {
	case <-stopCh:
		klog.Info("Shutting down network services controller")
		return
	default:
		err := nsc.doSync()
		if err != nil {
			klog.Fatalf("Failed to perform initial full sync %s", err.Error())
		}
		nsc.readyForUpdates = true
	}

	// loop forever until notified to stop on stopCh
	for {
		select {
		case <-stopCh:
			nsc.mu.Lock()
			nsc.readyForUpdates = false
			nsc.mu.Unlock()
			klog.Info("Shutting down network services controller")
			return

		case <-gracefulTicker.C:
			if nsc.readyForUpdates && nsc.gracefulTermination {
				klog.V(3).Info("Performing periodic graceful destination cleanup")
				nsc.gracefulSync()
			}

		case perform := <-nsc.syncChan:
			healthcheck.SendHeartBeat(healthChan, "NSC")
			switch perform {
			case synctypeAll:
				klog.V(1).Info("Performing requested full sync of services")
				// assign (don't redeclare) err so the heartbeat check after the switch sees it
				err = nsc.doSync()
				if err != nil {
					klog.Errorf("Error during full sync in network service controller. Error: " + err.Error())
				}
			case synctypeIpvs:
				klog.V(1).Info("Performing requested sync of ipvs services")
				nsc.mu.Lock()
				err = nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
				nsc.mu.Unlock()
				if err != nil {
					klog.Errorf("Error during ipvs sync in network service controller. Error: " + err.Error())
				}
			}
			if err == nil {
				healthcheck.SendHeartBeat(healthChan, "NSC")
			}

		case <-t.C:
			klog.V(1).Info("Performing periodic sync of ipvs services")
			healthcheck.SendHeartBeat(healthChan, "NSC")
			err := nsc.doSync()
			if err != nil {
				klog.Errorf("Error during periodic ipvs sync in network service controller. Error: " + err.Error())
				klog.Errorf("Skipping sending heartbeat from network service controller as periodic sync failed.")
			} else {
				healthcheck.SendHeartBeat(healthChan, "NSC")
			}
		}
	}
}

func (nsc *NetworkServicesController) sync(syncType int) {
	select {
	case nsc.syncChan <- syncType:
	default:
		klog.V(2).Infof("Already pending sync, dropping request for type %d", syncType)
	}
}
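
// sync is non-blocking by design: when a request is already pending on
// syncChan, the select above falls through to default and the new request is
// dropped. For example, nsc.sync(synctypeIpvs) called from an event handler
// never stalls the caller; at worst the change is picked up by the already
// pending sync or by the periodic ticker in Run.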

func (nsc *NetworkServicesController) doSync() error {
	var err error
	nsc.mu.Lock()
	defer nsc.mu.Unlock()

	// enable masquerade rule
	err = nsc.ensureMasqueradeIptablesRule()
	if err != nil {
		klog.Errorf("Failed to add masquerade rule in POSTROUTING chain of nat table due to: %s", err.Error())
	}

	nsc.serviceMap = nsc.buildServicesInfo()
	nsc.endpointsMap = nsc.buildEndpointsInfo()
	err = nsc.syncHairpinIptablesRules()
	if err != nil {
		klog.Errorf("Error syncing hairpin iptables rules: %s", err.Error())
	}

	err = nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
	if err != nil {
		klog.Errorf("Error syncing IPVS services: %s", err.Error())
		return err
	}

	if nsc.MetricsEnabled {
		err = nsc.publishMetrics(nsc.serviceMap)
		if err != nil {
			klog.Errorf("Error publishing metrics: %v", err)
			return err
		}
	}
	return nil
}

// Lookup service ip, protocol, port by given fwmark value (reverse of generateFwmark)
func (nsc *NetworkServicesController) lookupServiceByFwMark(fwMark uint32) (string, string, int, error) {
	for _, svc := range nsc.serviceMap {
		for _, externalIP := range svc.externalIPs {
			gfwmark, err := generateFwmark(externalIP, svc.protocol, fmt.Sprint(svc.port))
			if err != nil {
				return "", "", 0, err
			}
			if fwMark == gfwmark {
				return externalIP, svc.protocol, svc.port, nil
			}
		}
	}
	return "", "", 0, nil
}
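
// generateFwmark (defined elsewhere in this package) derives the 32-bit mark
// that lookupServiceByFwMark reverses. A minimal sketch of its shape, assuming
// an FNV-1a hash over the ip/protocol/port tuple (which the hash/fnv import
// above suggests; the exact tuple encoding may differ):
//
//	func generateFwmark(ip, protocol, port string) (uint32, error) {
//		h := fnv.New32a()
//		if _, err := h.Write([]byte(ip + "-" + protocol + "-" + port)); err != nil {
//			return 0, err
//		}
//		return h.Sum32(), nil
//	}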

func getIpvsFirewallInputChainRule() []string {
	// The iptables rule for use in {setup,cleanup}IpvsFirewall.
	return []string{
		"-m", "comment", "--comment", "handle traffic to IPVS service IPs in custom chain",
		"-m", "set", "--match-set", serviceIPsIPSetName, "dst",
		"-j", ipvsFirewallChainName}
}
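
// Rendered by iptables, the rule above looks roughly like:
//
//	-A INPUT -m comment --comment "handle traffic to IPVS service IPs in custom chain" \
//	    -m set --match-set kube-router-service-ips dst -j KUBE-ROUTER-SERVICES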

func (nsc *NetworkServicesController) setupIpvsFirewall() error {
	/*
		- create ipsets
		- create firewall rules
	*/

	var err error
	var ipset *utils.Set

	ipSetHandler, err := utils.NewIPSet(false)
	if err != nil {
		return err
	}

	// Remember ipsets for use in syncIpvsFirewall
	nsc.ipsetMap = make(map[string]*utils.Set)

	// Create ipset for local addresses.
	ipset, err = ipSetHandler.Create(localIPsIPSetName, utils.TypeHashIP, utils.OptionTimeout, "0")
	if err != nil {
		return fmt.Errorf("failed to create ipset: %s", err.Error())
	}
	nsc.ipsetMap[localIPsIPSetName] = ipset

	// Create 2 ipsets for services. One for 'ip' and one for 'ip,port'
	ipset, err = ipSetHandler.Create(serviceIPsIPSetName, utils.TypeHashIP, utils.OptionTimeout, "0")
	if err != nil {
		return fmt.Errorf("failed to create ipset: %s", err.Error())
	}
	nsc.ipsetMap[serviceIPsIPSetName] = ipset

	ipset, err = ipSetHandler.Create(ipvsServicesIPSetName, utils.TypeHashIPPort, utils.OptionTimeout, "0")
	if err != nil {
		return fmt.Errorf("failed to create ipset: %s", err.Error())
	}
	nsc.ipsetMap[ipvsServicesIPSetName] = ipset

	// Setup a custom iptables chain to explicitly allow input traffic to
	// ipvs services only.
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("failed to initialize iptables executor" + err.Error())
	}

	// ClearChain either clears an existing chain or creates a new one.
	err = iptablesCmdHandler.ClearChain("filter", ipvsFirewallChainName)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}

	// Only populate the KUBE-ROUTER-SERVICES chain and hook it into INPUT when
	// config.IpvsPermitAll is set; otherwise we are done here.
	if !nsc.ipvsPermitAll {
		return nil
	}

	var comment string
	var args []string
	var exists bool

	comment = "allow input traffic to ipvs services"
	args = []string{"-m", "comment", "--comment", comment,
		"-m", "set", "--match-set", ipvsServicesIPSetName, "dst,dst",
		"-j", "ACCEPT"}
	exists, err = iptablesCmdHandler.Exists("filter", ipvsFirewallChainName, args...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}
	if !exists {
		err := iptablesCmdHandler.Insert("filter", ipvsFirewallChainName, 1, args...)
		if err != nil {
			return fmt.Errorf("failed to run iptables command: %s", err.Error())
		}
	}

	comment = "allow icmp echo requests to service IPs"
	args = []string{"-m", "comment", "--comment", comment,
		"-p", "icmp", "--icmp-type", "echo-request",
		"-j", "ACCEPT"}
	err = iptablesCmdHandler.AppendUnique("filter", ipvsFirewallChainName, args...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}

	comment = "allow icmp destination unreachable messages to service IPs"
	args = []string{"-m", "comment", "--comment", comment,
		"-p", "icmp", "--icmp-type", "destination-unreachable",
		"-j", "ACCEPT"}
	err = iptablesCmdHandler.AppendUnique("filter", ipvsFirewallChainName, args...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}

	comment = "allow icmp ttl exceeded messages to service IPs"
	args = []string{"-m", "comment", "--comment", comment,
		"-p", "icmp", "--icmp-type", "time-exceeded",
		"-j", "ACCEPT"}
	err = iptablesCmdHandler.AppendUnique("filter", ipvsFirewallChainName, args...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}

	// We exclude the local addresses here as that would otherwise block all
	// traffic to local addresses if any NodePort service exists.
	comment = "reject all unexpected traffic to service IPs"
	args = []string{"-m", "comment", "--comment", comment,
		"-m", "set", "!", "--match-set", localIPsIPSetName, "dst",
		"-j", "REJECT", "--reject-with", "icmp-port-unreachable"}
	err = iptablesCmdHandler.AppendUnique("filter", ipvsFirewallChainName, args...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}

	// Pass incoming traffic into our custom chain.
	ipvsFirewallInputChainRule := getIpvsFirewallInputChainRule()
	exists, err = iptablesCmdHandler.Exists("filter", "INPUT", ipvsFirewallInputChainRule...)
	if err != nil {
		return fmt.Errorf("failed to run iptables command: %s", err.Error())
	}
	if !exists {
		err = iptablesCmdHandler.Insert("filter", "INPUT", 1, ipvsFirewallInputChainRule...)
		if err != nil {
			return fmt.Errorf("failed to run iptables command: %s", err.Error())
		}
	}

	return nil
}
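
// For reference, with ipvsPermitAll enabled the filter table ends up with
// roughly the following layout (a sketch, not verbatim iptables-save output):
//
//	-A INPUT -m set --match-set kube-router-service-ips dst -j KUBE-ROUTER-SERVICES
//	-A KUBE-ROUTER-SERVICES -m set --match-set kube-router-ipvs-services dst,dst -j ACCEPT
//	-A KUBE-ROUTER-SERVICES -p icmp --icmp-type echo-request -j ACCEPT
//	-A KUBE-ROUTER-SERVICES -p icmp --icmp-type destination-unreachable -j ACCEPT
//	-A KUBE-ROUTER-SERVICES -p icmp --icmp-type time-exceeded -j ACCEPT
//	-A KUBE-ROUTER-SERVICES -m set ! --match-set kube-router-local-ips dst -j REJECT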

func (nsc *NetworkServicesController) cleanupIpvsFirewall() {
	// Clear iptables rules
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		klog.Errorf("failed to initialize iptables executor: %v", err)
	} else {
		ipvsFirewallInputChainRule := getIpvsFirewallInputChainRule()
		exists, err := iptablesCmdHandler.Exists("filter", "INPUT", ipvsFirewallInputChainRule...)
		if err != nil {
			// Logged at level 1 because errors occur here when the ipsets have already been
			// cleaned up, and logging them as errors needlessly worries users
			klog.V(1).Infof("failed to check if iptables rules exists: %v", err)
		} else if exists {
			err = iptablesCmdHandler.Delete("filter", "INPUT", ipvsFirewallInputChainRule...)
			if err != nil {
				klog.Errorf("failed to run iptables command: %v", err)
			}
		}

		exists, err = iptablesCmdHandler.ChainExists("filter", ipvsFirewallChainName)
		if err != nil {
			klog.Errorf("failed to check if chain exists for deletion: %v", err)
		} else if exists {
			err = iptablesCmdHandler.ClearChain("filter", ipvsFirewallChainName)
			if err != nil {
				klog.Errorf("Failed to run iptables command: %s", err.Error())
			}

			err = iptablesCmdHandler.DeleteChain("filter", ipvsFirewallChainName)
			if err != nil {
				klog.Errorf("Failed to run iptables command: %s", err.Error())
			}
		}
	}

	// For some reason, if we go into the ipset logic below too quickly, the system still thinks
	// that the iptables rules above are referencing the ipsets below, and we get errors
	time.Sleep(1 * time.Second)

	// Clear ipsets
	// Certain code paths, like Cleanup(), run without a fully instantiated controller. In those
	// cases the mutex may not be present, and may not be needed, since they operate out of a
	// single goroutine where there is no contention to lock against.
	if nil != nsc.ipsetMutex {
		klog.V(1).Infof("Attempting to attain ipset mutex lock")
		nsc.ipsetMutex.Lock()
		klog.V(1).Infof("Attained ipset mutex lock, continuing...")
		defer func() {
			nsc.ipsetMutex.Unlock()
			klog.V(1).Infof("Returned ipset mutex lock")
		}()
	}
	ipSetHandler, err := utils.NewIPSet(false)
	if err != nil {
		klog.Errorf("Failed to initialize ipset handler: %s", err.Error())
		return
	}
	err = ipSetHandler.Save()
	if err != nil {
		klog.Fatalf("failed to initialize ipsets command executor due to %v", err)
		return
	}

	if _, ok := ipSetHandler.Sets[localIPsIPSetName]; ok {
		err = ipSetHandler.Destroy(localIPsIPSetName)
		if err != nil {
			klog.Errorf("failed to destroy ipset: %s", err.Error())
		}
	}

	if _, ok := ipSetHandler.Sets[serviceIPsIPSetName]; ok {
		err = ipSetHandler.Destroy(serviceIPsIPSetName)
		if err != nil {
			klog.Errorf("failed to destroy ipset: %s", err.Error())
		}
	}

	if _, ok := ipSetHandler.Sets[ipvsServicesIPSetName]; ok {
		err = ipSetHandler.Destroy(ipvsServicesIPSetName)
		if err != nil {
			klog.Errorf("failed to destroy ipset: %s", err.Error())
		}
	}
}

func (nsc *NetworkServicesController) syncIpvsFirewall() error {
	/*
		- update ipsets based on currently active IPVS services
	*/
	var err error
	klog.V(1).Infof("Attempting to attain ipset mutex lock")
	nsc.ipsetMutex.Lock()
	klog.V(1).Infof("Attained ipset mutex lock, continuing...")
	defer func() {
		nsc.ipsetMutex.Unlock()
		klog.V(1).Infof("Returned ipset mutex lock")
	}()

	localIPsIPSet := nsc.ipsetMap[localIPsIPSetName]

	// Populate local addresses ipset.
	addrs, err := getAllLocalIPs()
	if err != nil {
		return fmt.Errorf("failed to get local IPs: %s", err)
	}
	localIPsSets := make([]string, 0, len(addrs))
	for _, addr := range addrs {
		localIPsSets = append(localIPsSets, addr.IP.String())
	}
	err = localIPsIPSet.Refresh(localIPsSets)
	if err != nil {
		return fmt.Errorf("failed to sync ipset: %s", err.Error())
	}

	// Populate service ipsets.
	ipvsServices, err := nsc.ln.ipvsGetServices()
	if err != nil {
		return errors.New("Failed to list IPVS services: " + err.Error())
	}

	serviceIPsSets := make([]string, 0, len(ipvsServices))
	ipvsServicesSets := make([]string, 0, len(ipvsServices))

	for _, ipvsService := range ipvsServices {
		var address, protocol string
		var port int
		if ipvsService.Address != nil {
			address = ipvsService.Address.String()
			if ipvsService.Protocol == syscall.IPPROTO_TCP {
				protocol = tcpProtocol
			} else {
				protocol = udpProtocol
			}
			port = int(ipvsService.Port)
		} else if ipvsService.FWMark != 0 {
			address, protocol, port, err = nsc.lookupServiceByFwMark(ipvsService.FWMark)
			if err != nil {
				klog.Errorf("failed to lookup %d by FWMark: %s", ipvsService.FWMark, err)
			}
			if address == "" {
				continue
			}
		}

		serviceIPsSet := address
		serviceIPsSets = append(serviceIPsSets, serviceIPsSet)

		ipvsServicesSet := fmt.Sprintf("%s,%s:%d", address, protocol, port)
		ipvsServicesSets = append(ipvsServicesSets, ipvsServicesSet)
	}

	serviceIPsIPSet := nsc.ipsetMap[serviceIPsIPSetName]
	err = serviceIPsIPSet.Refresh(serviceIPsSets)
	if err != nil {
		return fmt.Errorf("failed to sync ipset: %s", err.Error())
	}

	ipvsServicesIPSet := nsc.ipsetMap[ipvsServicesIPSetName]
	err = ipvsServicesIPSet.Refresh(ipvsServicesSets)
	if err != nil {
		return fmt.Errorf("failed to sync ipset: %s", err.Error())
	}

	return nil
}
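
// After a sync the two service ipsets hold entries of the following shape
// (illustrative addresses): kube-router-service-ips, a hash:ip set, holds
// plain IPs such as "10.96.0.10", while kube-router-ipvs-services, a
// hash:ip,port set, holds members such as "10.96.0.10,tcp:80" as built by
// the fmt.Sprintf("%s,%s:%d", ...) call above.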

func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error {
	start := time.Now()
	defer func() {
		endTime := time.Since(start)
		klog.V(2).Infof("Publishing IPVS metrics took %v", endTime)
		if nsc.MetricsEnabled {
			metrics.ControllerIpvsMetricsExportTime.Observe(endTime.Seconds())
		}
	}()

	ipvsSvcs, err := nsc.ln.ipvsGetServices()
	if err != nil {
		return errors.New("Failed to list IPVS services: " + err.Error())
	}

	klog.V(1).Info("Publishing IPVS metrics")
	for _, svc := range serviceInfoMap {
		var protocol uint16
		var pushMetric bool
		var svcVip string

		switch aProtocol := svc.protocol; aProtocol {
		case tcpProtocol:
			protocol = syscall.IPPROTO_TCP
		case udpProtocol:
			protocol = syscall.IPPROTO_UDP
		default:
			protocol = syscall.IPPROTO_NONE
		}
		for _, ipvsSvc := range ipvsSvcs {
			switch svcAddress := ipvsSvc.Address.String(); svcAddress {
			case svc.clusterIP.String():
				if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port {
					pushMetric = true
					svcVip = svc.clusterIP.String()
				} else {
					pushMetric = false
				}
			case nsc.nodeIP.String():
				if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port {
					pushMetric = true
					svcVip = nsc.nodeIP.String()
				} else {
					pushMetric = false
				}
			default:
				svcVip = ""
				pushMetric = false
			}

			if pushMetric {
				klog.V(3).Infof("Publishing metrics for %s/%s (%s:%d/%s)",
					svc.namespace, svc.name, svcVip, svc.port, svc.protocol)

				labelValues := []string{
					svc.namespace,
					svc.name,
					svcVip,
					svc.protocol,
					strconv.Itoa(svc.port),
				}

				key := generateIPPortID(svcVip, svc.protocol, strconv.Itoa(svc.port))
				nsc.metricsMap[key] = labelValues
				// these same metrics should be deleted when the service is deleted.
				metrics.ServiceBpsIn.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.BPSIn))
				metrics.ServiceBpsOut.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.BPSOut))
				metrics.ServiceBytesIn.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.BytesIn))
				metrics.ServiceBytesOut.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.BytesOut))
				metrics.ServiceCPS.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.CPS))
				metrics.ServicePacketsIn.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.PacketsIn))
				metrics.ServicePacketsOut.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.PacketsOut))
				metrics.ServicePpsIn.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.PPSIn))
				metrics.ServicePpsOut.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.PPSOut))
				metrics.ServiceTotalConn.WithLabelValues(labelValues...).Set(float64(ipvsSvc.Stats.Connections))
				metrics.ControllerIpvsServices.Set(float64(len(ipvsSvcs)))
			}
		}
	}
	return nil
}

// TODO: Move to utils
// unsortedListsEquivalent reports whether a and b contain the same elements
// regardless of order: every element of a is marked with a count of 1,
// elements of b increment the count, so any element that appears in only one
// of the two lists is left with a count of 1.
func unsortedListsEquivalent(a, b []endpointsInfo) bool {
	if len(a) != len(b) {
		return false
	}

	values := make(map[interface{}]int)
	for _, val := range a {
		values[val] = 1
	}
	for _, val := range b {
		values[val]++
	}

	for _, val := range values {
		if val == 1 {
			return false
		}
	}

	return true
}

func endpointsMapsEquivalent(a, b endpointsInfoMap) bool {
	if len(a) != len(b) {
		return false
	}

	for key, valA := range a {
		valB, ok := b[key]
		if !ok {
			return false
		}

		if !unsortedListsEquivalent(valA, valB) {
			return false
		}
	}

	return true
}

// OnEndpointsUpdate handles a change in endpoints from the API server
func (nsc *NetworkServicesController) OnEndpointsUpdate(ep *api.Endpoints) {

	if isEndpointsForLeaderElection(ep) {
		return
	}

	nsc.mu.Lock()
	defer nsc.mu.Unlock()
	klog.V(1).Infof("Received update to endpoint: %s/%s from watch API", ep.Namespace, ep.Name)
	if !nsc.readyForUpdates {
		klog.V(3).Infof(
			"Skipping update to endpoint: %s/%s as controller is not ready to process service and endpoints updates",
			ep.Namespace, ep.Name)
		return
	}

	// If the service is headless and the previous version of the service is either non-existent or also headless,
	// skip processing as we only work with VIPs in the next section. Since the ClusterIP field is immutable we
	// don't need to consider previous versions of the service here: we are guaranteed that if it is a ClusterIP
	// now, it was a ClusterIP before.
	svc, exists, err := utils.ServiceForEndpoints(&nsc.svcLister, ep)
	if err != nil {
		klog.Errorf("failed to convert endpoints resource to service: %s", err)
		return
	}
	// ignore updates to Endpoints objects with no corresponding Service object
	if !exists {
		return
	}
	if utils.ServiceIsHeadless(svc) {
		klog.V(1).Infof("The service associated with endpoint: %s/%s is headless, skipping...",
			ep.Namespace, ep.Name)
		return
	}

	// build new service and endpoints maps to reflect the change
	newServiceMap := nsc.buildServicesInfo()
	newEndpointsMap := nsc.buildEndpointsInfo()

	if !endpointsMapsEquivalent(newEndpointsMap, nsc.endpointsMap) {
		nsc.endpointsMap = newEndpointsMap
		nsc.serviceMap = newServiceMap
		klog.V(1).Infof("Syncing IPVS services for update to endpoint: %s/%s", ep.Namespace, ep.Name)
		nsc.sync(synctypeIpvs)
	} else {
		klog.V(1).Infof("Skipping IPVS services sync on endpoint: %s/%s update as nothing changed",
			ep.Namespace, ep.Name)
	}
}

// OnServiceUpdate handles a change in service from the API server
func (nsc *NetworkServicesController) OnServiceUpdate(svc *api.Service) {

	nsc.mu.Lock()
	defer nsc.mu.Unlock()

	klog.V(1).Infof("Received update to service: %s/%s from watch API", svc.Namespace, svc.Name)
	if !nsc.readyForUpdates {
		klog.V(3).Infof(
			"Skipping update to service: %s/%s as controller is not ready to process service and endpoints updates",
			svc.Namespace, svc.Name)
		return
	}

	// If the service is headless and the previous version of the service is either non-existent or also headless,
	// skip processing as we only work with VIPs in the next section. Since the ClusterIP field is immutable we
	// don't need to consider previous versions of the service here: we are guaranteed that if it is a ClusterIP
	// now, it was a ClusterIP before.
	if utils.ServiceIsHeadless(svc) {
		klog.V(1).Infof("%s/%s is headless, skipping...", svc.Namespace, svc.Name)
		return
	}

	// build new service and endpoints maps to reflect the change
	newServiceMap := nsc.buildServicesInfo()
	newEndpointsMap := nsc.buildEndpointsInfo()

	if len(newServiceMap) != len(nsc.serviceMap) || !reflect.DeepEqual(newServiceMap, nsc.serviceMap) {
		nsc.endpointsMap = newEndpointsMap
		nsc.serviceMap = newServiceMap
		klog.V(1).Infof("Syncing IPVS services on update to service: %s/%s", svc.Namespace, svc.Name)
		nsc.sync(synctypeIpvs)
	} else {
		klog.V(1).Infof("Skipping syncing IPVS services for update to service: %s/%s as nothing changed",
			svc.Namespace, svc.Name)
	}
}

type externalIPService struct {
	ipvsSvc    *ipvs.Service
	externalIP string
}

func hasActiveEndpoints(endpoints []endpointsInfo) bool {
	for _, endpoint := range endpoints {
		if endpoint.isLocal {
			return true
		}
	}
	return false
}
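
// Note that hasActiveEndpoints reports whether at least one endpoint is local
// to this node; that distinction matters for services marked local via the
// kube-router.io/service.local annotation or externalTrafficPolicy=Local
// (see buildServicesInfo below).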

func (nsc *NetworkServicesController) getPodObjectForEndpoint(endpointIP string) (*api.Pod, error) {
	for _, obj := range nsc.podLister.List() {
		pod := obj.(*api.Pod)
		if strings.Compare(pod.Status.PodIP, endpointIP) == 0 {
			return pod, nil
		}
	}
	return nil, errors.New("Failed to find pod with ip " + endpointIP)
}

// This function does the following
// - get the pod corresponding to the endpoint ip
// - get the container id from the pod spec
// - from the container id, use the docker client to get the pid
// - enter the process network namespace and create an ipip tunnel
// - add the VIP to the tunnel interface
// - disable rp_filter
// WARN: This method is deprecated and will be removed once docker-shim is removed from kubelet.
func (ln *linuxNetworking) prepareEndpointForDsrWithDocker(containerID string, endpointIP string, vip string) error {

	// It's possible that switching namespaces may never work safely in Go without hacks.
	// https://groups.google.com/forum/#!topic/golang-nuts/ss1gEOcehjk/discussion
	// https://www.weave.works/blog/linux-namespaces-and-go-don-t-mix
	// It is unclear whether this is the same issue, but namespace problems have been seen here,
	// so boilerplate code and verbose logs are added for diagnosis.

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	var activeNetworkNamespaceHandle netns.NsHandle

	hostNetworkNamespaceHandle, err := netns.Get()
	if err != nil {
		return fmt.Errorf("failed to get namespace due to %v", err)
	}
	defer utils.CloseCloserDisregardError(&hostNetworkNamespaceHandle)

	activeNetworkNamespaceHandle, err = netns.Get()
	if err != nil {
		return fmt.Errorf("failed to get namespace due to %v", err)
	}
	klog.V(1).Infof("Current network namespace before netns.Set: %s", activeNetworkNamespaceHandle.String())
	defer utils.CloseCloserDisregardError(&activeNetworkNamespaceHandle)

	dockerClient, err := client.NewClientWithOpts(client.FromEnv)
	if err != nil {
		return fmt.Errorf("failed to get docker client due to %v", err)
	}
	defer utils.CloseCloserDisregardError(dockerClient)

	containerSpec, err := dockerClient.ContainerInspect(context.Background(), containerID)
	if err != nil {
		return fmt.Errorf("failed to get docker container spec due to %v", err)
	}

	pid := containerSpec.State.Pid
	return ln.configureContainerForDSR(vip, endpointIP, containerID, pid, hostNetworkNamespaceHandle)
}

// The same as prepareEndpointForDsrWithDocker but using CRI instead of docker.
func (ln *linuxNetworking) prepareEndpointForDsrWithCRI(runtimeEndpoint, containerID, endpointIP, vip string) error {

	// It's possible that switching namespaces may never work safely in Go without hacks.
	// https://groups.google.com/forum/#!topic/golang-nuts/ss1gEOcehjk/discussion
	// https://www.weave.works/blog/linux-namespaces-and-go-don-t-mix
	// It is unclear whether this is the same issue, but namespace problems have been seen here,
	// so boilerplate code and verbose logs are added for diagnosis.

	if runtimeEndpoint == "" {
		return fmt.Errorf("runtimeEndpoint is not specified")
	}

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	hostNetworkNamespaceHandle, err := netns.Get()
	if err != nil {
		return fmt.Errorf("failed to get host namespace due to %v", err)
	}
	klog.V(1).Infof("current network namespace before netns.Set: %s", hostNetworkNamespaceHandle.String())
	defer utils.CloseCloserDisregardError(&hostNetworkNamespaceHandle)

	rs, err := cri.NewRemoteRuntimeService(runtimeEndpoint, cri.DefaultConnectionTimeout)
	if err != nil {
		return err
	}
	defer utils.CloseCloserDisregardError(rs)

	info, err := rs.ContainerInfo(containerID)
	if err != nil {
		return err
	}

	pid := info.Pid
	return ln.configureContainerForDSR(vip, endpointIP, containerID, pid, hostNetworkNamespaceHandle)
}

func (nsc *NetworkServicesController) buildServicesInfo() serviceInfoMap {
	serviceMap := make(serviceInfoMap)
	for _, obj := range nsc.svcLister.List() {
		svc := obj.(*api.Service)

		if utils.ClusterIPIsNoneOrBlank(svc.Spec.ClusterIP) {
			klog.V(2).Infof("Skipping service name:%s namespace:%s as there is no cluster IP",
				svc.Name, svc.Namespace)
			continue
		}

		if svc.Spec.Type == "ExternalName" {
			klog.V(2).Infof("Skipping service name:%s namespace:%s due to service Type=%s",
				svc.Name, svc.Namespace, svc.Spec.Type)
			continue
		}

		for _, port := range svc.Spec.Ports {
			svcInfo := serviceInfo{
				clusterIP:   net.ParseIP(svc.Spec.ClusterIP),
				port:        int(port.Port),
				targetPort:  port.TargetPort.String(),
				protocol:    strings.ToLower(string(port.Protocol)),
				nodePort:    int(port.NodePort),
				name:        svc.ObjectMeta.Name,
				namespace:   svc.ObjectMeta.Namespace,
				externalIPs: make([]string, len(svc.Spec.ExternalIPs)),
				local:       false,
			}
			dsrMethod, ok := svc.ObjectMeta.Annotations[svcDSRAnnotation]
			if ok {
				svcInfo.directServerReturn = true
				svcInfo.directServerReturnMethod = dsrMethod
			}
			svcInfo.scheduler = ipvs.RoundRobin
			schedulingMethod, ok := svc.ObjectMeta.Annotations[svcSchedulerAnnotation]
			if ok {
				switch {
				case schedulingMethod == ipvs.RoundRobin:
					svcInfo.scheduler = ipvs.RoundRobin
				case schedulingMethod == ipvs.LeastConnection:
					svcInfo.scheduler = ipvs.LeastConnection
				case schedulingMethod == ipvs.DestinationHashing:
					svcInfo.scheduler = ipvs.DestinationHashing
				case schedulingMethod == ipvs.SourceHashing:
					svcInfo.scheduler = ipvs.SourceHashing
				case schedulingMethod == IpvsMaglevHashing:
					svcInfo.scheduler = IpvsMaglevHashing
				}
			}

			flags, ok := svc.ObjectMeta.Annotations[svcSchedFlagsAnnotation]
			if ok && svcInfo.scheduler == IpvsMaglevHashing {
				svcInfo.flags = parseSchedFlags(flags)
			}

			copy(svcInfo.externalIPs, svc.Spec.ExternalIPs)
			for _, lbIngress := range svc.Status.LoadBalancer.Ingress {
				if len(lbIngress.IP) > 0 {
					svcInfo.loadBalancerIPs = append(svcInfo.loadBalancerIPs, lbIngress.IP)
				}
			}
			svcInfo.sessionAffinity = svc.Spec.SessionAffinity == api.ServiceAffinityClientIP

			if svcInfo.sessionAffinity {
				// Kube-apiserver side guarantees SessionAffinityConfig won't be nil when session affinity
				// type is ClientIP
				// https://github.com/kubernetes/kubernetes/blob/master/pkg/apis/core/v1/defaults.go#L106
				svcInfo.sessionAffinityTimeoutSeconds = *svc.Spec.SessionAffinityConfig.ClientIP.TimeoutSeconds
			}
			_, svcInfo.hairpin = svc.ObjectMeta.Annotations[svcHairpinAnnotation]
			_, svcInfo.hairpinExternalIPs = svc.ObjectMeta.Annotations[svcHairpinExternalIPsAnnotation]
			_, svcInfo.local = svc.ObjectMeta.Annotations[svcLocalAnnotation]
			_, svcInfo.skipLbIps = svc.ObjectMeta.Annotations[svcSkipLbIpsAnnotation]
			if svc.Spec.ExternalTrafficPolicy == api.ServiceExternalTrafficPolicyTypeLocal {
				svcInfo.local = true
			}

			svcID := generateServiceID(svc.Namespace, svc.Name, port.Name)
			serviceMap[svcID] = &svcInfo
		}
	}
	return serviceMap
}

func parseSchedFlags(value string) schedFlags {
	var flag1, flag2, flag3 bool

	if len(value) < 1 {
		return schedFlags{}
	}

	flags := strings.Split(value, ",")
	for _, flag := range flags {
		switch strings.Trim(flag, " ") {
		case IpvsSvcFSched1:
			flag1 = true
		case IpvsSvcFSched2:
			flag2 = true
		case IpvsSvcFSched3:
			flag3 = true
		default:
		}
	}

	return schedFlags{flag1, flag2, flag3}
}
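
// As an example of the parsing above, the annotation value "flag-1, flag-3"
// yields schedFlags{flag1: true, flag2: false, flag3: true}, which
// ipvsSetSchedFlags below maps onto the kernel's scheduler flag bits.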

func shuffle(endPoints []endpointsInfo) []endpointsInfo {
	for index1 := range endPoints {
		// nolint:gosec // we don't need cryptographic randomness here
		index2 := rand.Intn(index1 + 1)
		endPoints[index1], endPoints[index2] = endPoints[index2], endPoints[index1]
	}
	return endPoints
}
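
// shuffle above is an in-place Fisher-Yates shuffle; randomizing endpoint
// order in buildEndpointsInfo below helps avoid systematically favoring
// whichever endpoint the API server happens to list first.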

func (nsc *NetworkServicesController) buildEndpointsInfo() endpointsInfoMap {
	endpointsMap := make(endpointsInfoMap)
	for _, obj := range nsc.epLister.List() {
		ep := obj.(*api.Endpoints)

		for _, epSubset := range ep.Subsets {
			for _, port := range epSubset.Ports {
				svcID := generateServiceID(ep.Namespace, ep.Name, port.Name)
				endpoints := make([]endpointsInfo, 0)
				for _, addr := range epSubset.Addresses {
					isLocal := addr.NodeName != nil && *addr.NodeName == nsc.nodeHostName
					endpoints = append(endpoints, endpointsInfo{ip: addr.IP, port: int(port.Port), isLocal: isLocal})
				}
				endpointsMap[svcID] = shuffle(endpoints)
			}
		}
	}
	return endpointsMap
}

// Add an iptables rule to masquerade outbound IPVS traffic. IPVS NAT requires reverse path
// traffic to go through the director for it to function, so the masquerade rule rewrites the
// source IP to the node IP so that return traffic from the real server (endpoint pods) hits
// the node/LVS director.
func (nsc *NetworkServicesController) ensureMasqueradeIptablesRule() error {
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to initialize iptables executor" + err.Error())
	}
	var args = []string{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ",
		"-m", "comment", "--comment", "", "-j", "SNAT", "--to-source", nsc.nodeIP.String()}
	if iptablesCmdHandler.HasRandomFully() {
		args = append(args, "--random-fully")
	}
	if nsc.masqueradeAll {
		err = iptablesCmdHandler.AppendUnique("nat", "POSTROUTING", args...)
		if err != nil {
			return errors.New("Failed to create iptables rule to masquerade all outbound IPVS traffic" + err.Error())
		}
	} else {
		exists, err := iptablesCmdHandler.Exists("nat", "POSTROUTING", args...)
		if err != nil {
			return errors.New("Failed to lookup iptables rule to masquerade all outbound IPVS traffic: " + err.Error())
		}
		if exists {
			err = iptablesCmdHandler.Delete("nat", "POSTROUTING", args...)
			if err != nil {
				return errors.New("Failed to delete iptables rule to masquerade all outbound IPVS traffic: " +
					err.Error() + ". Masquerade might still work...")
			}
			klog.Infof("Deleted iptables rule to masquerade all outbound IPVS traffic.")
		}
	}
	if len(nsc.podCidr) > 0 {
		// TODO: ipset should be used for destination podCidr(s) match after multiple podCidr(s) per node get supported
		args = []string{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ",
			"-m", "comment", "--comment", "", "!", "-s", nsc.podCidr, "!", "-d", nsc.podCidr,
			"-j", "SNAT", "--to-source", nsc.nodeIP.String()}
		if iptablesCmdHandler.HasRandomFully() {
			args = append(args, "--random-fully")
		}

		err = iptablesCmdHandler.AppendUnique("nat", "POSTROUTING", args...)
		if err != nil {
			return errors.New("Failed to run iptables command" + err.Error())
		}
	}
	klog.V(2).Info("Successfully synced iptables masquerade rule")
	return nil
}
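
// Rendered by iptables, the masquerade-all rule above looks roughly like the
// following, with <nodeIP> standing for this node's IP and --random-fully
// present only when the iptables version supports it:
//
//	-A POSTROUTING -m ipvs --ipvs --vdir ORIGINAL --vmethod MASQ -m comment --comment "" \
//	    -j SNAT --to-source <nodeIP> --random-fully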

// Delete old/bad iptables rules to masquerade outbound IPVS traffic.
func (nsc *NetworkServicesController) deleteBadMasqueradeIptablesRules() error {
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to create iptables handler:" + err.Error())
	}

	var argsBad = [][]string{
		{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ", "-m", "comment", "--comment", "",
			"-j", "MASQUERADE"},
		{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ", "-m", "comment", "--comment", "",
			"!", "-s", nsc.podCidr, "!", "-d", nsc.podCidr, "-j", "MASQUERADE"},
	}

	// If random-fully is supported, remove the original SNAT rules as well
	if iptablesCmdHandler.HasRandomFully() {
		argsBad = append(argsBad, []string{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ",
			"-m", "comment", "--comment", "", "-j", "SNAT", "--to-source", nsc.nodeIP.String()})

		if len(nsc.podCidr) > 0 {
			argsBad = append(argsBad, []string{"-m", "ipvs", "--ipvs", "--vdir", "ORIGINAL", "--vmethod", "MASQ",
				"-m", "comment", "--comment", "",
				"!", "-s", nsc.podCidr, "!", "-d", nsc.podCidr, "-j", "SNAT", "--to-source", nsc.nodeIP.String()})
		}
	}

	for _, args := range argsBad {
		exists, err := iptablesCmdHandler.Exists("nat", "POSTROUTING", args...)
		if err != nil {
			return fmt.Errorf("failed to lookup iptables rule: %s", err.Error())
		}

		if exists {
			err = iptablesCmdHandler.Delete("nat", "POSTROUTING", args...)
			if err != nil {
				return fmt.Errorf("failed to delete old/bad iptables rule to "+
					"masquerade outbound IPVS traffic: %s. "+
					"Masquerade all might still work, or bugs may persist after upgrade...",
					err)
			}
			klog.Infof("Deleted old/bad iptables rule to masquerade outbound traffic.")
		}
	}

	return nil
}

// syncHairpinIptablesRules adds/removes iptables rules pertaining to traffic
// from an Endpoint (Pod) to its own service VIP. Rules are only applied if
// enabled globally via CLI argument or a service has an annotation requesting
// it.
func (nsc *NetworkServicesController) syncHairpinIptablesRules() error {
	// TODO: Use ipset?
	// TODO: Log a warning that this will not work without hairpin sysctl set on veth

	// Key is a string that will match iptables.List() rules
	// Value is a string[] with arguments that iptables transaction functions expect
	rulesNeeded := make(map[string][]string)

	// Generate the rules that we need
	for svcName, svcInfo := range nsc.serviceMap {
		if nsc.globalHairpin || svcInfo.hairpin {
			for _, ep := range nsc.endpointsMap[svcName] {
				// Handle ClusterIP Service
				rule, ruleArgs := hairpinRuleFrom(svcInfo.clusterIP.String(), ep.ip, svcInfo.port)
				rulesNeeded[rule] = ruleArgs

				// Handle ExternalIPs if requested
				if svcInfo.hairpinExternalIPs {
					for _, extIP := range svcInfo.externalIPs {
						rule, ruleArgs := hairpinRuleFrom(extIP, ep.ip, svcInfo.port)
						rulesNeeded[rule] = ruleArgs
					}
				}

				// Handle NodePort Service
				if svcInfo.nodePort != 0 {
					rule, ruleArgs := hairpinRuleFrom(nsc.nodeIP.String(), ep.ip, svcInfo.nodePort)
					rulesNeeded[rule] = ruleArgs
				}
			}
		}
	}

	// Cleanup (if needed) and return if there's no hairpin-mode Services
	if len(rulesNeeded) == 0 {
		klog.V(1).Info("No hairpin-mode enabled services found -- no hairpin rules created")
		err := deleteHairpinIptablesRules()
		if err != nil {
			return errors.New("Error deleting hairpin rules: " + err.Error())
		}
		return nil
	}

	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to initialize iptables executor" + err.Error())
	}

	// TODO: Factor these variables out
	hasHairpinChain := false

	// TODO: Factor out this code
	chains, err := iptablesCmdHandler.ListChains("nat")
	if err != nil {
		return errors.New("Failed to list iptables chains: " + err.Error())
	}

	// TODO: Factor out this code
	for _, chain := range chains {
		if chain == ipvsHairpinChainName {
			hasHairpinChain = true
		}
	}

	// Create a chain for hairpin rules, if needed
	if !hasHairpinChain {
		err = iptablesCmdHandler.NewChain("nat", ipvsHairpinChainName)
		if err != nil {
			return fmt.Errorf("failed to create iptables chain \"%s\": %v", ipvsHairpinChainName, err)
		}
	}

	// Create a rule that targets our hairpin chain, if needed
	// TODO: Factor this static rule out
	jumpArgs := []string{"-m", "ipvs", "--vdir", "ORIGINAL", "-j", ipvsHairpinChainName}
	err = iptablesCmdHandler.AppendUnique("nat", "POSTROUTING", jumpArgs...)
	if err != nil {
		return fmt.Errorf("failed to add hairpin iptables jump rule: %v", err)
	}

	rulesFromNode, err := iptablesCmdHandler.List("nat", ipvsHairpinChainName)
	if err != nil {
		return fmt.Errorf("failed to get rules from iptables chain \"%s\": %v", ipvsHairpinChainName, err)
	}

	// Apply the rules we need
	for _, ruleArgs := range rulesNeeded {
		ruleExists := false
		for _, ruleFromNode := range rulesFromNode {
			_, ruleExists = rulesNeeded[ruleFromNode]
			if ruleExists {
				break
			}
		}
		if !ruleExists {
			err = iptablesCmdHandler.AppendUnique("nat", ipvsHairpinChainName, ruleArgs...)
			if err != nil {
				return fmt.Errorf("failed to apply hairpin iptables rule: %v", err)
			}
		}
	}

	// Delete invalid/outdated rules
	for _, ruleFromNode := range rulesFromNode {
		_, ruleIsNeeded := rulesNeeded[ruleFromNode]
		if !ruleIsNeeded {
			args := strings.Fields(ruleFromNode)
			if len(args) > 2 {
				args = args[2:] // Strip "-A CHAIN_NAME"

				err = iptablesCmdHandler.Delete("nat", ipvsHairpinChainName, args...)
				if err != nil {
					klog.Errorf("Unable to delete hairpin rule \"%s\" from chain %s: %v", ruleFromNode,
						ipvsHairpinChainName, err)
				} else {
					klog.V(1).Infof("Deleted invalid/outdated hairpin rule \"%s\" from chain %s",
						ruleFromNode, ipvsHairpinChainName)
				}
			} else {
				// Ignore the chain creation rule
				if ruleFromNode == "-N "+ipvsHairpinChainName {
					continue
				}
				klog.V(1).Infof("Not removing invalid hairpin rule \"%s\" from chain %s", ruleFromNode,
					ipvsHairpinChainName)
			}
		}
	}

	return nil
}

func hairpinRuleFrom(serviceIP string, endpointIP string, servicePort int) (string, []string) {
	// TODO: Factor hairpinChain out
	hairpinChain := "KUBE-ROUTER-HAIRPIN"

	ruleArgs := []string{"-s", endpointIP + "/32", "-d", endpointIP + "/32",
		"-m", "ipvs", "--vaddr", serviceIP, "--vport", strconv.Itoa(servicePort),
		"-j", "SNAT", "--to-source", serviceIP}

	// Trying to ensure this matches iptables.List()
	ruleString := "-A " + hairpinChain + " -s " + endpointIP + "/32" + " -d " +
		endpointIP + "/32" + " -m ipvs" + " --vaddr " + serviceIP + " --vport " +
		strconv.Itoa(servicePort) + " -j SNAT" + " --to-source " + serviceIP

	return ruleString, ruleArgs
}
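
// As an example, for a service VIP 10.96.0.10:80 and endpoint 10.244.1.5
// (illustrative addresses), hairpinRuleFrom produces the rule string:
//
//	-A KUBE-ROUTER-HAIRPIN -s 10.244.1.5/32 -d 10.244.1.5/32 -m ipvs --vaddr 10.96.0.10 --vport 80 -j SNAT --to-source 10.96.0.10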

func deleteHairpinIptablesRules() error {
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to initialize iptables executor" + err.Error())
	}

	// TODO: Factor out this code
	chains, err := iptablesCmdHandler.ListChains("nat")
	if err != nil {
		return errors.New("Failed to list iptables chains: " + err.Error())
	}

	// TODO: Factor these variables out
	hasHairpinChain := false

	// TODO: Factor out this code
	for _, chain := range chains {
		if chain == ipvsHairpinChainName {
			hasHairpinChain = true
			break
		}
	}

	// Nothing left to do if the hairpin chain doesn't exist
	if !hasHairpinChain {
		return nil
	}

	// TODO: Factor this static jump rule out
	jumpArgs := []string{"-m", "ipvs", "--vdir", "ORIGINAL", "-j", ipvsHairpinChainName}
	hasHairpinJumpRule, err := iptablesCmdHandler.Exists("nat", "POSTROUTING", jumpArgs...)
	if err != nil {
		return fmt.Errorf("failed to search POSTROUTING iptables rules: %v", err)
	}

	// Delete the jump rule to the hairpin chain
	if hasHairpinJumpRule {
		err = iptablesCmdHandler.Delete("nat", "POSTROUTING", jumpArgs...)
		if err != nil {
			klog.Errorf("unable to delete hairpin jump rule from chain \"POSTROUTING\": %v", err)
		} else {
			klog.V(1).Info("Deleted hairpin jump rule from chain \"POSTROUTING\"")
		}
	}

	// Flush and delete the chain for hairpin rules
	err = iptablesCmdHandler.ClearChain("nat", ipvsHairpinChainName)
	if err != nil {
		return fmt.Errorf("failed to flush iptables chain \"%s\": %v", ipvsHairpinChainName, err)
	}
	err = iptablesCmdHandler.DeleteChain("nat", ipvsHairpinChainName)
	if err != nil {
		return fmt.Errorf("failed to delete iptables chain \"%s\": %v", ipvsHairpinChainName, err)
	}
	return nil
}

func deleteMasqueradeIptablesRule() error {
|
|
iptablesCmdHandler, err := iptables.New()
|
|
if err != nil {
|
|
return errors.New("Failed to initialize iptables executor" + err.Error())
|
|
}
|
|
postRoutingChainRules, err := iptablesCmdHandler.List("nat", "POSTROUTING")
|
|
if err != nil {
|
|
return errors.New("Failed to list iptables rules in POSTROUTING chain in nat table" + err.Error())
|
|
}
|
|
for i, rule := range postRoutingChainRules {
|
|
if strings.Contains(rule, "ipvs") && strings.Contains(rule, "SNAT") {
|
|
err = iptablesCmdHandler.Delete("nat", "POSTROUTING", strconv.Itoa(i))
|
|
if err != nil {
|
|
return errors.New("Failed to run iptables command" + err.Error())
|
|
}
|
|
klog.V(2).Infof("Deleted iptables masquerade rule: %s", rule)
|
|
break
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|

func ipvsServiceString(s *ipvs.Service) string {
	var flags, protocol string

	switch s.Protocol {
	case syscall.IPPROTO_TCP:
		protocol = "TCP"
	case syscall.IPPROTO_UDP:
		protocol = "UDP"
	default:
		protocol = "UNKNOWN"
	}

	if s.Flags&ipvsPersistentFlagHex != 0 {
		flags += "[persistent port]"
	}

	if s.Flags&ipvsHashedFlagHex != 0 {
		flags += "[hashed entry]"
	}

	if s.Flags&ipvsOnePacketFlagHex != 0 {
		flags += "[one-packet scheduling]"
	}

	if s.Flags&ipvsSched1FlagHex != 0 {
		flags += "[flag-1(fallback)]"
	}

	if s.Flags&ipvsSched2FlagHex != 0 {
		flags += "[flag-2(port)]"
	}

	if s.Flags&ipvsSched3FlagHex != 0 {
		flags += "[flag-3]"
	}

	return fmt.Sprintf("%s:%s:%v (Flags: %s)", protocol, s.Address, s.Port, flags)
}
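
// As a hypothetical example, a persistent TCP service on 10.96.0.10:443 with
// scheduler flag-1 set would be rendered by ipvsServiceString as:
//
//	TCP:10.96.0.10:443 (Flags: [persistent port][flag-1(fallback)])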

func ipvsDestinationString(d *ipvs.Destination) string {
	return fmt.Sprintf("%s:%v (Weight: %v)", d.Address, d.Port, d.Weight)
}

func ipvsSetPersistence(svc *ipvs.Service, p bool, timeout int32) {
	if p {
		svc.Flags |= ipvsPersistentFlagHex
		svc.Netmask |= 0xFFFFFFFF
		svc.Timeout = uint32(timeout)
	} else {
		svc.Flags &^= ipvsPersistentFlagHex
		svc.Netmask &^= 0xFFFFFFFF
		svc.Timeout = 0
	}
}

func ipvsSetSchedFlags(svc *ipvs.Service, s schedFlags) {
	if s.flag1 {
		svc.Flags |= ipvsSched1FlagHex
	} else {
		svc.Flags &^= ipvsSched1FlagHex
	}

	if s.flag2 {
		svc.Flags |= ipvsSched2FlagHex
	} else {
		svc.Flags &^= ipvsSched2FlagHex
	}

	if s.flag3 {
		svc.Flags |= ipvsSched3FlagHex
	} else {
		svc.Flags &^= ipvsSched3FlagHex
	}

	// Keep the netmask that may already have been set by ipvsSetPersistence()
	if (svc.Netmask&0xFFFFFFFF != 0) || (s.flag1 || s.flag2 || s.flag3) {
		svc.Netmask |= 0xFFFFFFFF
	} else {
		svc.Netmask &^= 0xFFFFFFFF
	}
}

// changedIpvsSchedFlags compares the requested scheduler flags with the flags set on the ipvs service;
// each check is effectively an XOR between the desired flag and the bit currently set
func changedIpvsSchedFlags(svc *ipvs.Service, s schedFlags) bool {
	if (s.flag1 && (svc.Flags&ipvsSched1FlagHex) == 0) || (!s.flag1 && (svc.Flags&ipvsSched1FlagHex) != 0) {
		return true
	}

	if (s.flag2 && (svc.Flags&ipvsSched2FlagHex) == 0) || (!s.flag2 && (svc.Flags&ipvsSched2FlagHex) != 0) {
		return true
	}

	if (s.flag3 && (svc.Flags&ipvsSched3FlagHex) == 0) || (!s.flag3 && (svc.Flags&ipvsSched3FlagHex) != 0) {
		return true
	}

	return false
}
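
// For example (hypothetical value), a service whose Flags word is 0x0009 has
// ipvsPersistentFlagHex (0x0001) and ipvsSched1FlagHex (0x0008) set, so
// changedIpvsSchedFlags(svc, schedFlags{flag1: true}) returns false, while
// changedIpvsSchedFlags(svc, schedFlags{flag2: true}) returns true.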

func (ln *linuxNetworking) ipvsAddService(svcs []*ipvs.Service, vip net.IP, protocol, port uint16,
	persistent bool, persistentTimeout int32, scheduler string, flags schedFlags) (*ipvs.Service, error) {

	var err error
	for _, svc := range svcs {
		if vip.Equal(svc.Address) && protocol == svc.Protocol && port == svc.Port {
			if (persistent && (svc.Flags&ipvsPersistentFlagHex) == 0) ||
				(!persistent && (svc.Flags&ipvsPersistentFlagHex) != 0) ||
				svc.Timeout != uint32(persistentTimeout) {
				ipvsSetPersistence(svc, persistent, persistentTimeout)

				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, err
				}
				klog.V(2).Infof("Updated persistence/session-affinity for service: %s",
					ipvsServiceString(svc))
			}

			if changedIpvsSchedFlags(svc, flags) {
				ipvsSetSchedFlags(svc, flags)

				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, err
				}
				klog.V(2).Infof("Updated scheduler flags for service: %s", ipvsServiceString(svc))
			}

			if scheduler != svc.SchedName {
				svc.SchedName = scheduler
				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, errors.New("Failed to update the scheduler for the service due to: " + err.Error())
				}
				klog.V(2).Infof("Updated scheduler for the service: %s", ipvsServiceString(svc))
			}
			// TODO: Make this debug output when we get log levels
			// klog.Fatal("ipvs service %s:%s:%s already exists so returning", vip.String(),
			// 	protocol, strconv.Itoa(int(port)))

			return svc, nil
		}
	}

	svc := ipvs.Service{
		Address:       vip,
		AddressFamily: syscall.AF_INET,
		Protocol:      protocol,
		Port:          port,
		SchedName:     scheduler,
	}

	ipvsSetPersistence(&svc, persistent, persistentTimeout)
	ipvsSetSchedFlags(&svc, flags)

	err = ln.ipvsNewService(&svc)
	if err != nil {
		return nil, err
	}
	klog.V(1).Infof("Successfully added service: %s", ipvsServiceString(&svc))
	return &svc, nil
}

// generateFwmark: generate a uint32 hash value using the IP address, protocol and port information
// TODO: collisions can rarely happen but still need to be ruled out
// TODO: FWMARK values above 2^15 caused issues; either policy routing or the IPVS FWMARK service
// did not function with values above 2^15, so the hash is masked down to 14 bits
func generateFwmark(ip, protocol, port string) (uint32, error) {
	const maxFwMarkBitSize = 0x3FFF
	h := fnv.New32a()
	_, err := h.Write([]byte(ip + "-" + protocol + "-" + port))
	if err != nil {
		return 0, err
	}
	return h.Sum32() & maxFwMarkBitSize, err
}
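
// The sketch below is an illustrative addition (not called anywhere in the
// controller; the values are hypothetical) showing how a VIP/protocol/port
// tuple maps to a fwmark: the FNV-1a hash of "ip-protocol-port" is masked to
// 14 bits, so the result always fits in 0x0000-0x3FFF and can be matched by
// the mangle table MARK rules and the DSR policy-routing rule set up below.
func exampleGenerateFwmark() {
	fwmark, err := generateFwmark("203.0.113.10", "tcp", "80")
	if err != nil {
		klog.Errorf("failed to generate example fwmark: %v", err)
		return
	}
	klog.Infof("fwmark for 203.0.113.10-tcp-80: %#x", fwmark)
}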

// ipvsAddFWMarkService: creates an IPVS service using FWMARK
func (ln *linuxNetworking) ipvsAddFWMarkService(vip net.IP, protocol, port uint16, persistent bool,
	persistentTimeout int32, scheduler string, flags schedFlags) (*ipvs.Service, error) {

	var protocolStr string
	switch {
	case protocol == syscall.IPPROTO_TCP:
		protocolStr = tcpProtocol
	case protocol == syscall.IPPROTO_UDP:
		protocolStr = udpProtocol
	default:
		protocolStr = "unknown"
	}

	// generate a FWMARK value unique to the external IP + protocol + port combination
	fwmark, err := generateFwmark(vip.String(), protocolStr, fmt.Sprint(port))
	if err != nil {
		return nil, err
	}
	svcs, err := ln.ipvsGetServices()
	if err != nil {
		return nil, err
	}

	for _, svc := range svcs {
		if fwmark == svc.FWMark {
			if (persistent && (svc.Flags&ipvsPersistentFlagHex) == 0) ||
				(!persistent && (svc.Flags&ipvsPersistentFlagHex) != 0) {
				ipvsSetPersistence(svc, persistent, persistentTimeout)

				if changedIpvsSchedFlags(svc, flags) {
					ipvsSetSchedFlags(svc, flags)
				}

				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, err
				}
				klog.V(2).Infof("Updated persistence/session-affinity for service: %s",
					ipvsServiceString(svc))
			}

			if changedIpvsSchedFlags(svc, flags) {
				ipvsSetSchedFlags(svc, flags)

				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, err
				}
				klog.V(2).Infof("Updated scheduler flags for service: %s", ipvsServiceString(svc))
			}

			if scheduler != svc.SchedName {
				svc.SchedName = scheduler
				err = ln.ipvsUpdateService(svc)
				if err != nil {
					return nil, errors.New("Failed to update the scheduler for the service due to: " + err.Error())
				}
				klog.V(2).Infof("Updated scheduler for the service: %s", ipvsServiceString(svc))
			}
			// TODO: Make this debug output when we get log levels
			// klog.Fatal("ipvs service %s:%s:%s already exists so returning", vip.String(),
			// 	protocol, strconv.Itoa(int(port)))

			return svc, nil
		}
	}

	svc := ipvs.Service{
		FWMark:        fwmark,
		AddressFamily: syscall.AF_INET,
		Protocol:      protocol,
		Port:          port,
		SchedName:     scheduler,
	}

	ipvsSetPersistence(&svc, persistent, persistentTimeout)
	ipvsSetSchedFlags(&svc, flags)

	err = ln.ipvsNewService(&svc)
	if err != nil {
		return nil, err
	}
	klog.Infof("Successfully added service: %s", ipvsServiceString(&svc))
	return &svc, nil
}

func (ln *linuxNetworking) ipvsAddServer(service *ipvs.Service, dest *ipvs.Destination) error {
	err := ln.ipvsNewDestination(service, dest)
	if err == nil {
		klog.V(2).Infof("Successfully added destination %s to the service %s",
			ipvsDestinationString(dest), ipvsServiceString(service))
		return nil
	}

	if strings.Contains(err.Error(), IpvsServerExists) {
		err = ln.ipvsUpdateDestination(service, dest)
		if err != nil {
			return fmt.Errorf("failed to update ipvs destination %s to the ipvs service %s due to: %s",
				ipvsDestinationString(dest), ipvsServiceString(service), err.Error())
		}
		// TODO: Make this debug output when we get log levels
		// klog.Infof("ipvs destination %s already exists in the ipvs service %s so not adding destination",
		// 	ipvsDestinationString(dest), ipvsServiceString(service))
	} else {
		return fmt.Errorf("failed to add ipvs destination %s to the ipvs service %s due to: %s",
			ipvsDestinationString(dest), ipvsServiceString(service), err.Error())
	}
	return nil
}

const (
	customDSRRouteTableID    = "78"
	customDSRRouteTableName  = "kube-router-dsr"
	externalIPRouteTableID   = "79"
	externalIPRouteTableName = "external_ip"
)

// setupMangleTableRule: sets up iptables rules to FWMARK traffic destined to the external IP (VIP)
func setupMangleTableRule(ip string, protocol string, port string, fwmark string, tcpMSS int) error {
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to initialize iptables executor: " + err.Error())
	}
	args := []string{"-d", ip, "-m", protocol, "-p", protocol, "--dport", port, "-j", "MARK", "--set-mark", fwmark}
	err = iptablesCmdHandler.AppendUnique("mangle", "PREROUTING", args...)
	if err != nil {
		return errors.New("Failed to run iptables command to set up FWMARK due to " + err.Error())
	}
	err = iptablesCmdHandler.AppendUnique("mangle", "OUTPUT", args...)
	if err != nil {
		return errors.New("Failed to run iptables command to set up FWMARK due to " + err.Error())
	}

	// set up TCPMSS iptables rules for DSR mode to work around the tunnel MTU overhead
	mtuArgs := []string{"-d", ip, "-m", tcpProtocol, "-p", tcpProtocol, "--tcp-flags", "SYN,RST", "SYN", "-j", "TCPMSS",
		"--set-mss", strconv.Itoa(tcpMSS)}
	err = iptablesCmdHandler.AppendUnique("mangle", "PREROUTING", mtuArgs...)
	if err != nil {
		return errors.New("Failed to run iptables command to set up TCPMSS due to " + err.Error())
	}
	mtuArgs[0] = "-s"
	err = iptablesCmdHandler.AppendUnique("mangle", "POSTROUTING", mtuArgs...)
	if err != nil {
		return errors.New("Failed to run iptables command to set up TCPMSS due to " + err.Error())
	}
	return nil
}
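
// For illustration, with hypothetical arguments ip="203.0.113.10",
// protocol="tcp", port="80", fwmark="300" and tcpMSS=1440, the function above
// appends rules equivalent to:
//
//	iptables -t mangle -A PREROUTING  -d 203.0.113.10 -m tcp -p tcp --dport 80 -j MARK --set-mark 300
//	iptables -t mangle -A OUTPUT      -d 203.0.113.10 -m tcp -p tcp --dport 80 -j MARK --set-mark 300
//	iptables -t mangle -A PREROUTING  -d 203.0.113.10 -m tcp -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1440
//	iptables -t mangle -A POSTROUTING -s 203.0.113.10 -m tcp -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1440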

func (ln *linuxNetworking) cleanupMangleTableRule(ip string, protocol string, port string,
	fwmark string, tcpMSS int) error {
	iptablesCmdHandler, err := iptables.New()
	if err != nil {
		return errors.New("Failed to initialize iptables executor: " + err.Error())
	}
	args := []string{"-d", ip, "-m", protocol, "-p", protocol, "--dport", port, "-j", "MARK", "--set-mark", fwmark}
	exists, err := iptablesCmdHandler.Exists("mangle", "PREROUTING", args...)
	if err != nil {
		return errors.New("Failed to clean up FWMARK iptables rule due to " + err.Error())
	}
	if exists {
		err = iptablesCmdHandler.Delete("mangle", "PREROUTING", args...)
		if err != nil {
			return errors.New("Failed to clean up FWMARK iptables rule due to " + err.Error())
		}
	}
	exists, err = iptablesCmdHandler.Exists("mangle", "OUTPUT", args...)
	if err != nil {
		return errors.New("Failed to clean up FWMARK iptables rule due to " + err.Error())
	}
	if exists {
		err = iptablesCmdHandler.Delete("mangle", "OUTPUT", args...)
		if err != nil {
			return errors.New("Failed to clean up FWMARK iptables rule due to " + err.Error())
		}
	}

	// clean up the TCPMSS iptables rules
	mtuArgs := []string{"-d", ip, "-m", tcpProtocol, "-p", tcpProtocol, "--tcp-flags", "SYN,RST", "SYN", "-j", "TCPMSS",
		"--set-mss", strconv.Itoa(tcpMSS)}
	exists, err = iptablesCmdHandler.Exists("mangle", "PREROUTING", mtuArgs...)
	if err != nil {
		return errors.New("Failed to clean up TCPMSS iptables rule due to " + err.Error())
	}
	if exists {
		err = iptablesCmdHandler.Delete("mangle", "PREROUTING", mtuArgs...)
		if err != nil {
			return errors.New("Failed to clean up TCPMSS iptables rule due to " + err.Error())
		}
	}
	mtuArgs[0] = "-s"
	exists, err = iptablesCmdHandler.Exists("mangle", "POSTROUTING", mtuArgs...)
	if err != nil {
		return errors.New("Failed to clean up TCPMSS iptables rule due to " + err.Error())
	}
	if exists {
		err = iptablesCmdHandler.Delete("mangle", "POSTROUTING", mtuArgs...)
		if err != nil {
			return errors.New("Failed to clean up TCPMSS iptables rule due to " + err.Error())
		}
	}

	return nil
}

// For DSR it is required that we don't assign the VIP to any interface, to avoid martian packets
// http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.routing_to_VIP-less_director.html
// routeVIPTrafficToDirector: sets up policy routing so that FWMARKed packets are delivered locally
func routeVIPTrafficToDirector(fwmark string) error {
	out, err := exec.Command("ip", "rule", "list").Output()
	if err != nil {
		return errors.New("Failed to verify if `ip rule` exists due to: " + err.Error())
	}
	if !strings.Contains(string(out), fwmark) {
		err = exec.Command("ip", "rule", "add", "prio", "32764", "fwmark", fwmark, "table",
			customDSRRouteTableID).Run()
		if err != nil {
			return errors.New("Failed to add policy rule to lookup traffic to VIP through the custom " +
				"routing table due to " + err.Error())
		}
	}
	return nil
}

// For DSR it is required that we don't assign the VIP to any interface, to avoid martian packets
// http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.routing_to_VIP-less_director.html
// setupPolicyRoutingForDSR: sets up policy routing so that FWMARKed packets are delivered locally
func (ln *linuxNetworking) setupPolicyRoutingForDSR() error {
	b, err := ioutil.ReadFile("/etc/iproute2/rt_tables")
	if err != nil {
		return errors.New("Failed to set up policy routing required for DSR due to " + err.Error())
	}

	if !strings.Contains(string(b), customDSRRouteTableName) {
		f, err := os.OpenFile("/etc/iproute2/rt_tables", os.O_APPEND|os.O_WRONLY, 0600)
		if err != nil {
			return errors.New("Failed to set up policy routing required for DSR due to " + err.Error())
		}
		defer utils.CloseCloserDisregardError(f)
		if _, err = f.WriteString(customDSRRouteTableID + " " + customDSRRouteTableName + "\n"); err != nil {
			return errors.New("Failed to set up policy routing required for DSR due to " + err.Error())
		}
	}
	out, err := exec.Command("ip", "route", "list", "table", customDSRRouteTableID).Output()
	if err != nil || !strings.Contains(string(out), " lo ") {
		if err = exec.Command("ip", "route", "add", "local", "default", "dev", "lo", "table",
			customDSRRouteTableID).Run(); err != nil {
			return errors.New("Failed to add route in custom route table due to: " + err.Error())
		}
	}
	return nil
}
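
// Taken together with routeVIPTrafficToDirector above, the net effect of the
// DSR policy-routing setup is equivalent to running:
//
//	echo "78 kube-router-dsr" >> /etc/iproute2/rt_tables
//	ip route add local default dev lo table 78
//	ip rule add prio 32764 fwmark <fwmark> table 78   # one rule per marked service
//
// so packets marked in the mangle table are delivered to the local IPVS
// director even though the VIP is not assigned to any interface.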

// For DSR it is required that the node knows how to route the external IP. Otherwise, when an endpoint
// responds directly with the external IP as the source IP, the kernel will treat the reply as a martian packet.
// To prevent martian packets, add a route to the external IP through the `kube-bridge` interface.
// setupRoutesForExternalIPForDSR: sets up routing so that the kernel does not treat return packets as martians
func (ln *linuxNetworking) setupRoutesForExternalIPForDSR(serviceInfoMap serviceInfoMap) error {
	b, err := ioutil.ReadFile("/etc/iproute2/rt_tables")
	if err != nil {
		return errors.New("Failed to set up external IP routing table required for DSR due to " + err.Error())
	}

	if !strings.Contains(string(b), externalIPRouteTableName) {
		f, err := os.OpenFile("/etc/iproute2/rt_tables", os.O_APPEND|os.O_WRONLY, 0600)
		if err != nil {
			return errors.New("Failed to set up external IP routing table required for DSR due to " + err.Error())
		}
		defer utils.CloseCloserDisregardError(f)
		if _, err = f.WriteString(externalIPRouteTableID + " " + externalIPRouteTableName + "\n"); err != nil {
			return errors.New("Failed to set up external IP routing table required for DSR due to " + err.Error())
		}
	}

	out, err := exec.Command("ip", "rule", "list").Output()
	if err != nil {
		return fmt.Errorf("failed to verify if `ip rule add prio 32765 from all lookup external_ip` exists due to: %v",
			err)
	}

	if !(strings.Contains(string(out), externalIPRouteTableName) ||
		strings.Contains(string(out), externalIPRouteTableID)) {
		err = exec.Command("ip", "rule", "add", "prio", "32765", "from", "all", "lookup",
			externalIPRouteTableID).Run()
		if err != nil {
			return fmt.Errorf("failed to add policy rule `ip rule add prio 32765 from all lookup external_ip` "+
				"due to %v", err)
		}
	}

	out, _ = exec.Command("ip", "route", "list", "table", externalIPRouteTableID).Output()
	outStr := string(out)
	activeExternalIPs := make(map[string]bool)
	for _, svc := range serviceInfoMap {
		for _, externalIP := range svc.externalIPs {
			// Verify the DSR annotation exists
			if !svc.directServerReturn {
				klog.V(1).Infof("Skipping service %s/%s as it does not have DSR annotation",
					svc.namespace, svc.name)
				continue
			}

			activeExternalIPs[externalIP] = true

			if !strings.Contains(outStr, externalIP) {
				if err = exec.Command("ip", "route", "add", externalIP, "dev", "kube-bridge", "table",
					externalIPRouteTableID).Run(); err != nil {
					klog.Errorf("Failed to add route for %s in custom route table for external IPs due to: %v",
						externalIP, err)
					continue
				}
			}
		}
	}

	// check if there are any policy-based routing entries for external IPs in externalIPRouteTableID
	if len(outStr) > 0 {
		// clean up stale external IPs
		for _, line := range strings.Split(strings.Trim(outStr, "\n"), "\n") {
			route := strings.Split(strings.Trim(line, " "), " ")
			ip := route[0]
			if !activeExternalIPs[ip] {
				args := []string{"route", "del", "table", externalIPRouteTableID}
				args = append(args, route...)
				if err = exec.Command("ip", args...).Run(); err != nil {
					klog.Errorf("Failed to delete route for %v in custom route table for external IPs due to: %s",
						ip, err)
					continue
				}
			}
		}
	}

	return nil
}

func isEndpointsForLeaderElection(ep *api.Endpoints) bool {
	_, isLeaderElection := ep.Annotations[LeaderElectionRecordAnnotationKey]
	return isLeaderElection
}

// unique identifier for a load-balanced service (namespace + name + portname)
func generateServiceID(namespace, svcName, port string) string {
	return namespace + "-" + svcName + "-" + port
}

// unique identifier for a load-balanced service address (ip + protocol + port)
func generateIPPortID(ip, protocol, port string) string {
	return ip + "-" + protocol + "-" + port
}

func generateEndpointID(ip, port string) string {
	return ip + ":" + port
}
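
// As a hypothetical example, for a service "web" in namespace "default"
// exposing port name "http" on ClusterIP 10.96.0.10:80/TCP with an endpoint
// at 10.244.1.5:8080, these helpers produce:
//
//	generateServiceID("default", "web", "http") == "default-web-http"
//	generateIPPortID("10.96.0.10", "tcp", "80") == "10.96.0.10-tcp-80"
//	generateEndpointID("10.244.1.5", "8080")    == "10.244.1.5:8080"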

// returns all IP addresses found on any network interface in the system, excluding dummy, kube and docker interfaces
func getAllLocalIPs() ([]netlink.Addr, error) {
	links, err := netlink.LinkList()
	if err != nil {
		return nil, errors.New("Could not load list of net interfaces: " + err.Error())
	}

	addrs := make([]netlink.Addr, 0)
	for _, link := range links {
		// do not include IPs for any interface that calls itself "dummy",
		// "kube" or "docker"
		if strings.Contains(link.Attrs().Name, "dummy") ||
			strings.Contains(link.Attrs().Name, "kube") ||
			strings.Contains(link.Attrs().Name, "docker") {
			continue
		}

		linkAddrs, err := netlink.AddrList(link, netlink.FAMILY_V4)
		if err != nil {
			return nil, errors.New("Failed to get IPs for interface: " + err.Error())
		}

		addrs = append(addrs, linkAddrs...)
	}

	return addrs, nil
}

func (ln *linuxNetworking) getKubeDummyInterface() (netlink.Link, error) {
	var dummyVipInterface netlink.Link
	dummyVipInterface, err := netlink.LinkByName(KubeDummyIf)
	if err != nil && err.Error() == IfaceNotFound {
		klog.V(1).Infof("Could not find dummy interface: %s to assign cluster IPs, creating one",
			KubeDummyIf)
		err = netlink.LinkAdd(&netlink.Dummy{LinkAttrs: netlink.LinkAttrs{Name: KubeDummyIf}})
		if err != nil {
			return nil, errors.New("Failed to add dummy interface: " + err.Error())
		}
		dummyVipInterface, err = netlink.LinkByName(KubeDummyIf)
		if err != nil {
			return nil, errors.New("Failed to get dummy interface: " + err.Error())
		}
		err = netlink.LinkSetUp(dummyVipInterface)
		if err != nil {
			return nil, errors.New("Failed to bring dummy interface up: " + err.Error())
		}
	}
	return dummyVipInterface, nil
}

// Cleanup cleans up all the configuration (IPVS, iptables, links) done by the controller
func (nsc *NetworkServicesController) Cleanup() {
	klog.Infof("Cleaning up NetworkServiceController configurations...")

	// cleanup ipvs rules by flush
	handle, err := ipvs.New("")
	if err != nil {
		klog.Errorf("failed to get ipvs handle for cleaning ipvs definitions: %v", err)
	} else {
		klog.Infof("ipvs definitions don't have names associated with them for checking, during cleanup " +
			"we assume that we own all of them and delete all ipvs definitions")
		err = handle.Flush()
		if err != nil {
			klog.Errorf("unable to flush ipvs tables: %v", err)
		}
		handle.Close()
	}

	// cleanup iptables masquerade rule
	err = deleteMasqueradeIptablesRule()
	if err != nil {
		klog.Errorf("Failed to cleanup iptables masquerade rule due to: %s", err.Error())
		return
	}

	// cleanup iptables hairpin rules
	err = deleteHairpinIptablesRules()
	if err != nil {
		klog.Errorf("Failed to cleanup iptables hairpin rules: %s", err.Error())
		return
	}

	nsc.cleanupIpvsFirewall()

	// delete dummy interface used to assign cluster IPs
	dummyVipInterface, err := netlink.LinkByName(KubeDummyIf)
	if err != nil {
		if err.Error() == IfaceNotFound {
			klog.Infof("Dummy interface: " + KubeDummyIf + " does not exist")
		}
	} else {
		err = netlink.LinkDel(dummyVipInterface)
		if err != nil {
			klog.Errorf("Could not delete dummy interface " + KubeDummyIf + " due to " + err.Error())
			return
		}
	}

	klog.Infof("Successfully cleaned the NetworkServiceController configuration done by kube-router")
}

func (nsc *NetworkServicesController) newEndpointsEventHandler() cache.ResourceEventHandler {
	return cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			nsc.handleEndpointsAdd(obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			nsc.handleEndpointsUpdate(oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			nsc.handleEndpointsDelete(obj)
		},
	}
}

func (nsc *NetworkServicesController) handleEndpointsAdd(obj interface{}) {
	endpoints, ok := obj.(*api.Endpoints)
	if !ok {
		klog.Errorf("unexpected object type: %v", obj)
		return
	}
	nsc.OnEndpointsUpdate(endpoints)
}

func (nsc *NetworkServicesController) handleEndpointsUpdate(oldObj, newObj interface{}) {
	_, ok := oldObj.(*api.Endpoints)
	if !ok {
		klog.Errorf("unexpected object type: %v", oldObj)
		return
	}
	newEndpoints, ok := newObj.(*api.Endpoints)
	if !ok {
		klog.Errorf("unexpected object type: %v", newObj)
		return
	}
	nsc.OnEndpointsUpdate(newEndpoints)
}

func (nsc *NetworkServicesController) handleEndpointsDelete(obj interface{}) {
	endpoints, ok := obj.(*api.Endpoints)
	if !ok {
		// the object may be a tombstone wrapping the deleted endpoints
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			klog.Errorf("unexpected object type: %v", obj)
			return
		}
		if endpoints, ok = tombstone.Obj.(*api.Endpoints); !ok {
			klog.Errorf("unexpected object type: %v", obj)
			return
		}
	}
	nsc.OnEndpointsUpdate(endpoints)
}

func (nsc *NetworkServicesController) newSvcEventHandler() cache.ResourceEventHandler {
	return cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			nsc.handleServiceAdd(obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			nsc.handleServiceUpdate(oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			nsc.handleServiceDelete(obj)
		},
	}
}

func (nsc *NetworkServicesController) handleServiceAdd(obj interface{}) {
	service, ok := obj.(*api.Service)
	if !ok {
		klog.Errorf("unexpected object type: %v", obj)
		return
	}
	nsc.OnServiceUpdate(service)
}

func (nsc *NetworkServicesController) handleServiceUpdate(oldObj, newObj interface{}) {
	_, ok := oldObj.(*api.Service)
	if !ok {
		klog.Errorf("unexpected object type: %v", oldObj)
		return
	}
	newService, ok := newObj.(*api.Service)
	if !ok {
		klog.Errorf("unexpected object type: %v", newObj)
		return
	}
	nsc.OnServiceUpdate(newService)
}

func (nsc *NetworkServicesController) handleServiceDelete(obj interface{}) {
	service, ok := obj.(*api.Service)
	if !ok {
		// the object may be a tombstone wrapping the deleted service
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			klog.Errorf("unexpected object type: %v", obj)
			return
		}
		if service, ok = tombstone.Obj.(*api.Service); !ok {
			klog.Errorf("unexpected object type: %v", obj)
			return
		}
	}
	nsc.OnServiceUpdate(service)
}

// NewNetworkServicesController returns a NetworkServicesController object
func NewNetworkServicesController(clientset kubernetes.Interface,
	config *options.KubeRouterConfig, svcInformer cache.SharedIndexInformer,
	epInformer cache.SharedIndexInformer, podInformer cache.SharedIndexInformer,
	ipsetMutex *sync.Mutex) (*NetworkServicesController, error) {

	var err error
	ln, err := newLinuxNetworking()
	if err != nil {
		return nil, err
	}

	nsc := NetworkServicesController{ln: ln, ipsetMutex: ipsetMutex, metricsMap: make(map[string][]string)}

	if config.MetricsEnabled {
		// Register the metrics for this controller
		prometheus.MustRegister(metrics.ControllerIpvsServices)
		prometheus.MustRegister(metrics.ControllerIpvsServicesSyncTime)
		prometheus.MustRegister(metrics.ServiceBpsIn)
		prometheus.MustRegister(metrics.ServiceBpsOut)
		prometheus.MustRegister(metrics.ServiceBytesIn)
		prometheus.MustRegister(metrics.ServiceBytesOut)
		prometheus.MustRegister(metrics.ServiceCPS)
		prometheus.MustRegister(metrics.ServicePacketsIn)
		prometheus.MustRegister(metrics.ServicePacketsOut)
		prometheus.MustRegister(metrics.ServicePpsIn)
		prometheus.MustRegister(metrics.ServicePpsOut)
		prometheus.MustRegister(metrics.ServiceTotalConn)
		nsc.MetricsEnabled = true
	}

	nsc.syncPeriod = config.IpvsSyncPeriod
	nsc.syncChan = make(chan int, 2)
	nsc.gracefulPeriod = config.IpvsGracefulPeriod
	nsc.gracefulTermination = config.IpvsGracefulTermination
	nsc.globalHairpin = config.GlobalHairpinMode

	nsc.serviceMap = make(serviceInfoMap)
	nsc.endpointsMap = make(endpointsInfoMap)
	nsc.client = clientset

	nsc.ProxyFirewallSetup = sync.NewCond(&sync.Mutex{})
	nsc.dsr = &dsrOpt{runtimeEndpoint: config.RuntimeEndpoint}

	nsc.masqueradeAll = config.MasqueradeAll
	nsc.nodeportBindOnAllIP = config.NodePortBindOnAllIP

	if config.RunRouter {
		cidr, err := utils.GetPodCidrFromNodeSpec(nsc.client, config.HostnameOverride)
		if err != nil {
			return nil, fmt.Errorf("failed to get pod CIDR details from Node.spec: %s", err.Error())
		}
		nsc.podCidr = cidr
	}

	nsc.excludedCidrs = make([]net.IPNet, len(config.ExcludedCidrs))
	for i, excludedCidr := range config.ExcludedCidrs {
		_, ipnet, err := net.ParseCIDR(excludedCidr)
		if err != nil {
			return nil, fmt.Errorf("failed to get excluded CIDR details: %s", err.Error())
		}
		nsc.excludedCidrs[i] = *ipnet
	}

	node, err := utils.GetNodeObject(clientset, config.HostnameOverride)
	if err != nil {
		return nil, err
	}

	nsc.nodeHostName = node.Name
	NodeIP, err = utils.GetNodeIP(node)
	if err != nil {
		return nil, err
	}
	nsc.nodeIP = NodeIP
	automtu, err := utils.GetMTUFromNodeIP(nsc.nodeIP, config.EnableOverlay)
	if err != nil {
		return nil, err
	}
	// Set it to 20 bytes less than the auto-detected MTU to account for the additional ip-ip headers needed for DSR;
	// GetMTUFromNodeIP() above already accounts for the overhead of ip-ip overlay networking, so we only need to
	// remove 20 bytes
	nsc.dsrTCPMSS = automtu - utils.IPInIPHeaderLength

	nsc.podLister = podInformer.GetIndexer()

	nsc.svcLister = svcInformer.GetIndexer()
	nsc.ServiceEventHandler = nsc.newSvcEventHandler()

	nsc.ipvsPermitAll = config.IpvsPermitAll

	nsc.epLister = epInformer.GetIndexer()
	nsc.EndpointsEventHandler = nsc.newEndpointsEventHandler()

	rand.Seed(time.Now().UnixNano())

	return &nsc, nil
}