369 lines
11 KiB
Go

package routing
import (
"context"
"errors"
"fmt"
"net"
"strconv"
"strings"
"time"
"github.com/cloudnativelabs/kube-router/pkg/metrics"
"github.com/cloudnativelabs/kube-router/pkg/options"
"github.com/cloudnativelabs/kube-router/pkg/utils"
gobgpapi "github.com/osrg/gobgp/api"
gobgp "github.com/osrg/gobgp/pkg/server"
v1core "k8s.io/api/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"
)
// syncInternalPeers refreshes the peer relationship with the rest of the nodes in the cluster (iBGP
// peers). Node add/remove events should ensure peer relationship with only currently active nodes. In
// case we miss any events from the API server, this method, which is also called periodically, ensures
// that the peer relationship with removed nodes is deleted.
//
// The whole sync runs with nrc.mu held. For every node returned by the node lister that passes the
// filters below, the node is recorded in nrc.activeNodes and added as a BGP neighbor:
//   - the node itself is skipped;
//   - when running as a route reflector client, only nodes carrying rrServerAnnotation are peered with;
//   - unless full mesh mode is enabled, only nodes whose nodeASNAnnotation matches nrc.nodeAsnNumber
//     are peered with.
//
// Afterwards every entry of nrc.activeNodes that was not seen in this round is removed as a neighbor
// and dropped from nrc.activeNodes. Failures are logged, never returned.
func (nrc *NetworkRoutingController) syncInternalPeers() {
	nrc.mu.Lock()
	defer nrc.mu.Unlock()

	start := time.Now()
	defer func() {
		endTime := time.Since(start)
		if nrc.MetricsEnabled {
			metrics.ControllerBGPInternalPeersSyncTime.Observe(endTime.Seconds())
		}
		klog.V(2).Infof("Syncing BGP peers for the node took %v", endTime)
	}()

	// get the current list of the nodes from API server
	nodes := nrc.nodeLister.List()

	if nrc.MetricsEnabled {
		metrics.ControllerBPGpeers.Set(float64(len(nodes)))
	}

	// Our own address does not change while iterating, so render it once.
	selfAddr := nrc.nodeIP.String()

	// establish peer and add Pod CIDRs with current set of nodes; the set is used further down to
	// recognise previously active nodes that have disappeared
	currentNodes := make(map[string]struct{}, len(nodes))
	for _, obj := range nodes {
		node := obj.(*v1core.Node)
		nodeIP, err := utils.GetNodeIP(node)
		if err != nil {
			klog.Errorf("Failed to find a node IP and therefore cannot sync internal BGP Peer: %v", err)
			continue
		}
		peerAddr := nodeIP.String()

		// skip self
		if peerAddr == selfAddr {
			continue
		}

		// we are rr-client peer only with rr-server
		if nrc.bgpRRClient {
			if _, ok := node.ObjectMeta.Annotations[rrServerAnnotation]; !ok {
				continue
			}
		}

		// if node full mesh is not requested then just peer with nodes with same ASN
		// (run iBGP among same ASN peers)
		if !nrc.bgpFullMeshMode {
			nodeasn, ok := node.ObjectMeta.Annotations[nodeASNAnnotation]
			if !ok {
				klog.Infof("Not peering with the Node %s as ASN number of the node is unknown.",
					peerAddr)
				continue
			}

			asnNo, err := strconv.ParseUint(nodeasn, 0, asnMaxBitSize)
			if err != nil {
				klog.Infof("Not peering with the Node %s as ASN number of the node is invalid.",
					peerAddr)
				continue
			}

			// if the nodes ASN number is different from ASN number of current node skip peering
			if nrc.nodeAsnNumber != uint32(asnNo) {
				klog.Infof("Not peering with the Node %s as ASN number of the node is different.",
					peerAddr)
				continue
			}
		}

		// The node is marked active before AddPeer is attempted, so a failed add is still tracked and
		// cleaned up once the node disappears from the lister.
		currentNodes[peerAddr] = struct{}{}
		nrc.activeNodes[peerAddr] = true

		n := &gobgpapi.Peer{
			Conf: &gobgpapi.PeerConf{
				NeighborAddress: peerAddr,
				PeerAs:          nrc.nodeAsnNumber,
			},
			Transport: &gobgpapi.Transport{
				RemotePort: nrc.bgpPort,
			},
		}

		if nrc.bgpGracefulRestart {
			n.GracefulRestart = &gobgpapi.GracefulRestart{
				Enabled:         true,
				RestartTime:     uint32(nrc.bgpGracefulRestartTime.Seconds()),
				DeferralTime:    uint32(nrc.bgpGracefulRestartDeferralTime.Seconds()),
				LocalRestarting: true,
			}

			// enable multi-protocol graceful restart for both IPv4 and IPv6 unicast
			n.AfiSafis = []*gobgpapi.AfiSafi{
				{
					Config: &gobgpapi.AfiSafiConfig{
						Family:  &gobgpapi.Family{Afi: gobgpapi.Family_AFI_IP, Safi: gobgpapi.Family_SAFI_UNICAST},
						Enabled: true,
					},
					MpGracefulRestart: &gobgpapi.MpGracefulRestart{
						Config: &gobgpapi.MpGracefulRestartConfig{
							Enabled: true,
						},
						State: &gobgpapi.MpGracefulRestartState{},
					},
				},
				{
					Config: &gobgpapi.AfiSafiConfig{
						Family:  &gobgpapi.Family{Afi: gobgpapi.Family_AFI_IP6, Safi: gobgpapi.Family_SAFI_UNICAST},
						Enabled: true,
					},
					MpGracefulRestart: &gobgpapi.MpGracefulRestart{
						Config: &gobgpapi.MpGracefulRestartConfig{
							Enabled: true,
						},
						State: &gobgpapi.MpGracefulRestartState{},
					},
				},
			}
		}

		// we are rr-server peer with other rr-client with reflection enabled
		if nrc.bgpRRServer {
			if _, ok := node.ObjectMeta.Annotations[rrClientAnnotation]; ok {
				// add rr options with clusterId
				n.RouteReflector = &gobgpapi.RouteReflector{
					RouteReflectorClient:    true,
					RouteReflectorClusterId: fmt.Sprint(nrc.bgpClusterID),
				}
			}
		}

		// TODO: check if a node is already added as neighbor in a better way than add and catch error
		if err := nrc.bgpServer.AddPeer(context.Background(), &gobgpapi.AddPeerRequest{
			Peer: n,
		}); err != nil {
			// an already existing peer is the expected outcome on every re-sync, so it is not reported
			if !strings.Contains(err.Error(), "can't overwrite the existing peer") {
				klog.Errorf("Failed to add node %s as peer due to %s", peerAddr, err)
			}
		}
	}

	// delete the neighbor for every previously active node that was not seen in this round; removing
	// the entry currently being visited while ranging over a map is permitted by the Go specification
	for ip := range nrc.activeNodes {
		if _, stillActive := currentNodes[ip]; stillActive {
			continue
		}
		err := nrc.bgpServer.DeletePeer(context.Background(), &gobgpapi.DeletePeerRequest{Address: ip})
		if err != nil {
			klog.Errorf("Failed to remove node %s as peer due to %s", ip, err)
		}
		// the node is forgotten even when DeletePeer failed, so the removal is not retried
		delete(nrc.activeNodes, ip)
	}
}
// connectToExternalBGPPeers adds all the configured eBGP peers (global or node specific) as neighbours
// of the given BGP server.
//
// The entries of peerNeighbors are modified in place before being added:
//   - when bgpGracefulRestart is true, each peer's GracefulRestart and AfiSafis fields are overwritten
//     with a graceful restart configuration for IPv4 and IPv6 unicast that uses
//     bgpGracefulRestartTime and bgpGracefulRestartDeferralTime (truncated to whole seconds);
//   - when peerMultihopTTL is greater than 1, eBGP multihop is enabled with that TTL.
//
// Peers are added in slice order. The first AddPeer failure stops the loop and is returned wrapped, so
// any peers after the failing one are left unconfigured. nil is returned when every peer was added.
func connectToExternalBGPPeers(server *gobgp.BgpServer, peerNeighbors []*gobgpapi.Peer, bgpGracefulRestart bool,
	bgpGracefulRestartDeferralTime time.Duration, bgpGracefulRestartTime time.Duration, peerMultihopTTL uint8) error {
	for _, n := range peerNeighbors {
		if bgpGracefulRestart {
			n.GracefulRestart = &gobgpapi.GracefulRestart{
				Enabled:         true,
				RestartTime:     uint32(bgpGracefulRestartTime.Seconds()),
				DeferralTime:    uint32(bgpGracefulRestartDeferralTime.Seconds()),
				LocalRestarting: true,
			}

			// The address families are built afresh for every peer so that no configuration object is
			// shared between two peers.
			families := []*gobgpapi.Family{
				{Afi: gobgpapi.Family_AFI_IP, Safi: gobgpapi.Family_SAFI_UNICAST},
				{Afi: gobgpapi.Family_AFI_IP6, Safi: gobgpapi.Family_SAFI_UNICAST},
			}
			afiSafis := make([]*gobgpapi.AfiSafi, 0, len(families))
			for _, family := range families {
				afiSafis = append(afiSafis, &gobgpapi.AfiSafi{
					Config: &gobgpapi.AfiSafiConfig{
						Family:  family,
						Enabled: true,
					},
					MpGracefulRestart: &gobgpapi.MpGracefulRestart{
						Config: &gobgpapi.MpGracefulRestartConfig{
							Enabled: true,
						},
					},
				})
			}
			n.AfiSafis = afiSafis
		}

		if peerMultihopTTL > 1 {
			n.EbgpMultihop = &gobgpapi.EbgpMultihop{
				Enabled:     true,
				MultihopTtl: uint32(peerMultihopTTL),
			}
		}

		if err := server.AddPeer(context.Background(), &gobgpapi.AddPeerRequest{Peer: n}); err != nil {
			// %w keeps the underlying error reachable through errors.Is / errors.As; the rendered
			// message is the same as with %s
			return fmt.Errorf("error peering with peer router "+
				"%q due to: %w", n.Conf.NeighborAddress, err)
		}
		klog.V(2).Infof("Successfully configured %s in ASN %v as BGP peer to the node",
			n.Conf.NeighborAddress, n.Conf.PeerAs)
	}
	return nil
}
// newGlobalPeers validates the global peer router configuration and returns one neighbor config per
// entry of ips.
//
// Validation rules:
//   - asns must contain exactly one entry per IP;
//   - passwords and ports must each be either empty or contain exactly one entry per IP;
//   - every ASN must fall into one of the ranges accepted below, otherwise it is rejected as reserved.
//
// Each returned peer uses the IP at the same index as neighbor address, the matching ASN, holdtime
// converted to uint64 as hold time, and options.DefaultBgpPort as remote port unless ports was supplied.
// The password at the same index is set as AuthPassword when passwords was supplied. On any validation
// failure a nil slice and an error are returned.
func newGlobalPeers(ips []net.IP, ports []uint32, asns []uint32, passwords []string, holdtime float64) (
	[]*gobgpapi.Peer, error) {
	peers := make([]*gobgpapi.Peer, 0)

	havePasswords := len(passwords) != 0
	havePorts := len(ports) != 0

	// Validations, evaluated in order: ASNs first, then passwords, then ports
	switch {
	case len(ips) != len(asns):
		return nil, errors.New("invalid peer router config, the number of IPs and ASN numbers must be equal")
	case havePasswords && len(ips) != len(passwords):
		return nil, errors.New("Invalid peer router config. " +
			"The number of passwords should either be zero, or one per peer router." +
			" Use blank items if a router doesn't expect a password.\n" +
			"Example: \"pass,,pass\" OR [\"pass\",\"\",\"pass\"].")
	case havePorts && len(ips) != len(ports):
		return nil, errors.New("Invalid peer router config. " +
			"The number of ports should either be zero, or one per peer router." +
			" If blank items are used, it will default to standard BGP port, " +
			strconv.Itoa(options.DefaultBgpPort) + "\n" +
			"Example: \"port,,port\" OR [\"port\",\"\",\"port\"].")
	}

	for idx, ip := range ips {
		asn := asns[idx]

		// an ASN is accepted only when it falls into one of these ranges
		switch {
		case asn >= 1 && asn <= 23455:
		case asn >= 23457 && asn <= 63999:
		case asn >= 64512 && asn <= 65534:
		case asn >= 131072 && asn <= 4199999999:
		case asn >= 4200000000 && asn <= 4294967294:
		default:
			return nil, fmt.Errorf("reserved ASN number \"%d\" for global BGP peer",
				asn)
		}

		conf := &gobgpapi.PeerConf{
			NeighborAddress: ip.String(),
			PeerAs:          asn,
		}
		if havePasswords {
			conf.AuthPassword = passwords[idx]
		}

		transport := &gobgpapi.Transport{
			RemotePort: options.DefaultBgpPort,
		}
		if havePorts {
			transport.RemotePort = ports[idx]
		}

		peers = append(peers, &gobgpapi.Peer{
			Conf:      conf,
			Timers:    &gobgpapi.Timers{Config: &gobgpapi.TimersConfig{HoldTime: uint64(holdtime)}},
			Transport: transport,
		})
	}

	return peers, nil
}
// newNodeEventHandler returns the event handler for node objects.
//
// Node additions and deletions are logged and forwarded to OnNodeUpdate; node updates are ignored. An
// added node whose IP cannot be determined is logged as an error and not forwarded. A deleted node is
// forwarded even when its IP cannot be determined.
func (nrc *NetworkRoutingController) newNodeEventHandler() cache.ResourceEventHandler {
	onAdd := func(obj interface{}) {
		nodeIP, err := utils.GetNodeIP(obj.(*v1core.Node))
		if err != nil {
			klog.Errorf(
				"New node received, but we were unable to add it as we were couldn't find it's node IP: %v", err)
			return
		}

		klog.V(2).Infof("Received node %s added update from watch API so peer with new node", nodeIP)
		nrc.OnNodeUpdate(obj)
	}

	onUpdate := func(_, _ interface{}) {
		// we are only interested in node add/delete, so skip update
	}

	onDelete := func(obj interface{}) {
		// The deleted object arrives either as the node itself or wrapped in a
		// cache.DeletedFinalStateUnknown tombstone; anything else is rejected.
		var node *v1core.Node
		switch deleted := obj.(type) {
		case *v1core.Node:
			node = deleted
		case cache.DeletedFinalStateUnknown:
			wrapped, isNode := deleted.Obj.(*v1core.Node)
			if !isNode {
				klog.Errorf("unexpected object type: %v", obj)
				return
			}
			node = wrapped
		default:
			klog.Errorf("unexpected object type: %v", obj)
			return
		}

		// In this case even if we can't get the NodeIP that's alright as the node is being removed anyway and
		// future node lister operations that happen in OnNodeUpdate won't be affected as the node won't be returned
		if nodeIP, err := utils.GetNodeIP(node); err != nil {
			klog.Infof("Received node (IP unavailable) removed update from watch API, so remove node " +
				"from peer")
		} else {
			klog.Infof("Received node %s removed update from watch API, so remove node from peer", nodeIP)
		}
		nrc.OnNodeUpdate(obj)
	}

	return cache.ResourceEventHandlerFuncs{
		AddFunc:    onAdd,
		UpdateFunc: onUpdate,
		DeleteFunc: onDelete,
	}
}
// OnNodeUpdate handles updates from the node watcher, which calls this method whenever a new node is
// added or an old node is deleted, so that we peer up with the new node and drop peering from the old
// one. The passed object is not inspected.
//
// Nothing is done until the BGP server has been started. After that the BGP policies are re-applied (a
// failure is logged and does not stop the remaining steps), the internal peers are re-synced when
// internal BGP is enabled, and the source/destination check is disabled again when the conditions
// below hold.
func (nrc *NetworkRoutingController) OnNodeUpdate(_ interface{}) {
	if !nrc.bgpServerStarted {
		return
	}

	// update export policies so that NeighborSet gets updated with new set of nodes
	if err := nrc.AddPolicies(); err != nil {
		klog.Errorf("Error adding BGP policies: %s", err.Error())
	}

	if nrc.bgpEnableInternal {
		nrc.syncInternalPeers()
	}

	// skip if first round of disableSourceDestinationCheck() is not done yet, this is to prevent
	// all the nodes for all the node add update trying to perform disableSourceDestinationCheck
	srcDstCheckReady := nrc.disableSrcDstCheck && nrc.initSrcDstCheckDone && nrc.ec2IamAuthorized
	if !srcDstCheckReady {
		return
	}
	nrc.disableSourceDestinationCheck()
}