diff --git a/.dockerignore b/.dockerignore index 3bce1ad6..4144a7b3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,12 @@ +.git **/_cache +app +build-image +cni +contrib +daemonset +dashboard +Documentation +hack +utils +vendor diff --git a/.gitignore b/.gitignore index b492661f..b8e43c45 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.vscode /kube-router /gobgp _output diff --git a/Documentation/generic.md b/Documentation/generic.md index 744a001d..6f8819eb 100644 --- a/Documentation/generic.md +++ b/Documentation/generic.md @@ -51,7 +51,6 @@ Any iptables rules kube-proxy left around will also need to be cleaned up. This docker run --privileged --net=host gcr.io/google_containers/kube-proxy-amd64:v1.7.3 kube-proxy --cleanup-iptables - ## Running kube-router without the service proxy This runs kube-router with pod/service networking and the network policy firewall. The Services proxy is disabled. @@ -60,4 +59,8 @@ This runs kube-router with pod/service networking and the network policy firewal In this mode kube-router relies on for example [kube-proxy](https://kubernetes.io/docs/reference/generated/kube-proxy/) to provide service networking. -When service proxy is disabled kube-router will use [in-cluster configuration](https://github.com/kubernetes/client-go/tree/master/examples/in-cluster-client-configuration) to access APIserver through cluster-ip. Service networking must therefore be setup before deploying kube-router. \ No newline at end of file +When service proxy is disabled kube-router will use [in-cluster configuration](https://github.com/kubernetes/client-go/tree/master/examples/in-cluster-client-configuration) to access APIserver through cluster-ip. Service networking must therefore be setup before deploying kube-router. 
+ +## Debugging + +kube-router supports setting the log level via the command line flag -v or --v. To get maximal debug output from kube-router please start with `--v=3` \ No newline at end of file diff --git a/Documentation/metrics.md b/Documentation/metrics.md index 19e406e4..d9a97782 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -5,16 +5,24 @@ The scope of this document is to describe how to setup the [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) needed for [Prometheus](https://prometheus.io/) to use [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#) to discover & scape kube-router [pods](https://kubernetes.io/docs/concepts/workloads/pods/pod/). For help with installing Prometheus please see their [docs](https://prometheus.io/docs/introduction/overview/) -By default kube-router will export Prometheus metrics on port `8080` under the path `/metrics`. -If running kube-router as [daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) this port might collide with other applications running on the host network and must be changed. +Metrics options: -kube-router 0.1.0-rc2 and upwards supports the following runtime configuration for controlling where to expose the metrics. -If you are using a older version, metrics path & port is locked to `/metrics` & `8080`. 
+ --metrics-path string Path to serve Prometheus metrics on ( default: /metrics ) + --metrics-port uint16 <0-65535> Prometheus metrics port to use ( default: 0, disabled ) - --metrics-port int Prometheus metrics port to use ( default 8080 ) - --metrics-path string Path to serve Prometheus metrics on ( default /metrics ) +To enable kube-router metrics, start kube-router with `--metrics-port` and provide a port greater than 0 -By enabling [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#) in Prometheus configuration & adding required annotations it can automaticly discover & scrape kube-router metrics. +Metrics are generally exported at the same rate as the sync period for each service. + +The default values unless otherwise specified are + iptables-sync-period - 1 min + ipvs-sync-period - 1 min + routes-sync-period - 1 min + +By enabling [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#) in Prometheus configuration & adding required annotations Prometheus can automatically discover & scrape kube-router metrics + +## Version notes +kube-router 0.1.0-rc2 and upwards supports the runtime configuration for controlling where to expose the metrics. 
If you are using a older version, metrics path & port is locked to `/metrics` & `8080` ## Supported annotations @@ -39,8 +47,32 @@ For example: ## Avail metrics +If metrics is enabled only the running services metrics are exposed + The following metrics is exposed by kube-router prefixed by `kube_router_` +### run-router = true + +* controller_bgp_peers + Number of BGP peers of the instance +* controller_bgp_advertisements_received + Number of total BGP advertisements received since kube-router start +* controller_bgp_internal_peers_sync_time + Time it took for the BGP internal peer sync loop to complete + +### run-firewall=true + +* controller_iptables_sync_time + Time it took for the iptables sync loop to complete + +### run-service-proxy = true + +* controller_ipvs_services_sync_time + Time it took for the ipvs sync loop to complete +* controller_ipvs_services + The number of ipvs services in the instance +* controller_ipvs_metrics_export_time + The time it took to run the metrics export for IPVS services * service_total_connections Total connections made to the service since creation * service_packets_in @@ -68,4 +100,4 @@ To get a grouped list of CPS for each service a Prometheus query could look like ## Grafana Dashboard This repo contains a example [Grafana dashboard](https://raw.githubusercontent.com/cloudnativelabs/kube-router/master/dashboard/kube-router.json) utilizing all the above exposed metrics from kube-router. 
-![dashboard](https://raw.githubusercontent.com/cloudnativelabs/kube-router/master/dashboard/dashboard.png) +![dashboard](https://raw.githubusercontent.com/cloudnativelabs/kube-router/master/dashboard/dashboard.png) \ No newline at end of file diff --git a/app/controllers/metrics_controller.go b/app/controllers/metrics_controller.go new file mode 100644 index 00000000..ff154913 --- /dev/null +++ b/app/controllers/metrics_controller.go @@ -0,0 +1,154 @@ +package controllers + +import ( + "net" + "net/http" + "strconv" + "sync" + + "github.com/cloudnativelabs/kube-router/app/options" + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "golang.org/x/net/context" + "k8s.io/client-go/kubernetes" +) + +var ( + serviceTotalConn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_total_connections", + Help: "Total incoming conntections made", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + servicePacketsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_packets_in", + Help: "Total incoming packets", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + servicePacketsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_packets_out", + Help: "Total outoging packets", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + serviceBytesIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_bytes_in", + Help: "Total incoming bytes", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + serviceBytesOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_bytes_out", + Help: "Total outgoing bytes", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + servicePpsIn = 
prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_pps_in", + Help: "Incoming packets per second", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + servicePpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_pps_out", + Help: "Outoging packets per second", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + serviceCPS = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_cps", + Help: "Service connections per second", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + serviceBpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_bps_in", + Help: "Incoming bytes per second", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + serviceBpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "service_bps_out", + Help: "Outoging bytes per second", + }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + controllerIpvsServices = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_ipvs_services", + Help: "Number of ipvs services in the instance", + }, []string{}) + controllerIptablesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_iptables_sync_time", + Help: "Time it took for controller to sync iptables", + }, []string{}) + controllerPublishMetricsTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_publish_metrics_time", + Help: "Time it took to publish metrics", + }, []string{}) + controllerIpvsServicesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_ipvs_services_sync_time", + Help: "Time it took for controller to sync ipvs services", + }, []string{}) + controllerBPGpeers 
= prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_bgp_peers", + Help: "BGP peers in the runtime configuration", + }, []string{}) + controllerBGPInternalPeersSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_bgp_internal_peers_sync_time", + Help: "Time it took to sync internal bgp peers", + }, []string{}) + controllerBGPadvertisementsReceived = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_bgp_advertisements_received", + Help: "Time it took to sync internal bgp peers", + }, []string{}) + controllerIpvsMetricsExportTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Name: "controller_ipvs_metrics_export_time", + Help: "Time it took to export metrics", + }, []string{}) +) + +// MetricsController Holds settings for the metrics controller +type MetricsController struct { + endpointsMap endpointsInfoMap + MetricsPath string + MetricsPort uint16 + mu sync.Mutex + nodeIP net.IP + serviceMap serviceInfoMap +} + +// Run prometheus metrics controller +func (mc *MetricsController) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) error { + defer wg.Done() + glog.Info("Starting metrics controller") + + // register metrics for this controller + prometheus.MustRegister(controllerIpvsMetricsExportTime) + + srv := &http.Server{Addr: ":" + strconv.Itoa(int(mc.MetricsPort)), Handler: http.DefaultServeMux} + + // add prometheus handler on metrics path + http.Handle(mc.MetricsPath, promhttp.Handler()) + + go func() { + if err := srv.ListenAndServe(); err != nil { + // cannot panic, because this probably is an intentional close + glog.Errorf("Metrics controller error: %s", err) + } + }() + + <-stopCh + glog.Infof("Shutting down metrics controller") + if err := srv.Shutdown(context.Background()); err != nil { + glog.Errorf("could not shutdown: %v", err) + } + return nil +} + +// NewMetricsController returns new MetricController 
object +func NewMetricsController(clientset *kubernetes.Clientset, config *options.KubeRouterConfig) (*MetricsController, error) { + mc := MetricsController{} + mc.MetricsPath = config.MetricsPath + mc.MetricsPort = config.MetricsPort + return &mc, nil +} diff --git a/app/controllers/network_policy_controller.go b/app/controllers/network_policy_controller.go index 5329d607..ee24e675 100644 --- a/app/controllers/network_policy_controller.go +++ b/app/controllers/network_policy_controller.go @@ -17,6 +17,7 @@ import ( "github.com/cloudnativelabs/kube-router/utils" "github.com/coreos/go-iptables/iptables" "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" api "k8s.io/api/core/v1" apiextensions "k8s.io/api/extensions/v1beta1" networking "k8s.io/api/networking/v1" @@ -40,6 +41,7 @@ type NetworkPolicyController struct { nodeHostName string mu sync.Mutex syncPeriod time.Duration + MetricsEnabled bool v1NetworkPolicy bool // list of all active network policies expressed as networkPolicyInfo @@ -103,19 +105,19 @@ func (npc *NetworkPolicyController) Run(stopCh <-chan struct{}, wg *sync.WaitGro defer t.Stop() defer wg.Done() - glog.Infof("Starting network policy controller") + glog.Info("Starting network policy controller") // loop forever till notified to stop on stopCh for { select { case <-stopCh: - glog.Infof("Shutting down network policies controller") + glog.Info("Shutting down network policies controller") return default: } if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() { - glog.Infof("Performing periodic syn of the iptables to reflect network policies") + glog.V(1).Info("Performing periodic sync of iptables to reflect network policies") err := npc.Sync() if err != nil { glog.Errorf("Error during periodic sync: " + err.Error()) @@ -135,14 +137,14 @@ func (npc *NetworkPolicyController) Run(stopCh <-chan struct{}, wg *sync.WaitGro // OnPodUpdate handles updates to pods from the Kubernetes api server func (npc 
*NetworkPolicyController) OnPodUpdate(podUpdate *watchers.PodUpdate) { - glog.Infof("Received pod update namspace:%s pod name:%s", podUpdate.Pod.Namespace, podUpdate.Pod.Name) + glog.V(2).Infof("Received pod update namespace:%s pod name:%s", podUpdate.Pod.Namespace, podUpdate.Pod.Name) if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() { err := npc.Sync() if err != nil { glog.Errorf("Error syncing on pod update: %s", err) } } else { - glog.Infof("Received pod update, but controller not in sync") + glog.V(2).Infof("Received pod update, but controller not in sync") } } @@ -154,7 +156,7 @@ func (npc *NetworkPolicyController) OnNetworkPolicyUpdate(networkPolicyUpdate *w glog.Errorf("Error syncing on network policy update: %s", err) } } else { - glog.Infof("Received network policy update, but controller not in sync") + glog.V(2).Info("Received network policy update, but controller not in sync") } } @@ -166,14 +168,14 @@ func (npc *NetworkPolicyController) OnNamespaceUpdate(namespaceUpdate *watchers. 
return } - glog.Infof("Received namesapce update namspace:%s", namespaceUpdate.Namespace.Name) + glog.V(2).Infof("Received namespace update namespace:%s", namespaceUpdate.Namespace.Name) if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() { err := npc.Sync() if err != nil { glog.Errorf("Error syncing on namespace update: %s", err) } } else { - glog.Infof("Received namspace update, but controller not in sync") + glog.V(2).Info("Received namespace update, but controller not in sync") } } @@ -186,9 +188,15 @@ func (npc *NetworkPolicyController) Sync() error { start := time.Now() defer func() { - glog.Infof("sync iptables took %v", time.Since(start)) + endTime := time.Since(start) + if npc.MetricsEnabled { + controllerIptablesSyncTime.WithLabelValues().Set(float64(endTime)) + } + glog.V(2).Infof("sync iptables took %v", endTime) }() + glog.V(1).Info("Starting periodic sync of iptables") + if npc.v1NetworkPolicy { npc.networkPoliciesInfo, err = buildNetworkPoliciesInfo() if err != nil { @@ -289,7 +297,7 @@ func (npc *NetworkPolicyController) syncNetworkPolicyChains() (map[string]bool, } } - glog.Infof("Iptables chains in the filter table are synchronized with the network policies.") + glog.V(2).Infof("Iptables chains in the filter table are synchronized with the network policies.") return activePolicyChains, activePolicyIpSets, nil } @@ -886,7 +894,7 @@ func cleanupStaleRules(activePolicyChains, activePodFwChains, activePolicyIPSets if err != nil { return fmt.Errorf("Failed to delete the chain %s due to %s", chain, err.Error()) } - glog.Infof("Deleted pod specific firewall chain: %s from the filter table", chain) + glog.V(2).Infof("Deleted pod specific firewall chain: %s from the filter table", chain) } // cleanup network policy chains @@ -918,7 +926,7 @@ func cleanupStaleRules(activePolicyChains, activePodFwChains, activePolicyIPSets if err != nil { return fmt.Errorf("Failed to flush the rules in chain %s due to %s", policyChain, err) } - 
glog.Infof("Deleted network policy chain: %s from the filter table", policyChain) + glog.V(2).Infof("Deleted network policy chain: %s from the filter table", policyChain) } // cleanup network policy ipsets @@ -944,7 +952,7 @@ func (npc *NetworkPolicyController) getIngressNetworkPolicyEnabledPods(nodeIp st } _, ok := policy.targetPods[pod.Status.PodIP] if ok && (policy.policyType == "both" || policy.policyType == "ingress") { - glog.Infof("Found pod name: " + pod.ObjectMeta.Name + " namespace: " + pod.ObjectMeta.Namespace + " for which network policies need to be applied.") + glog.V(2).Infof("Found pod name: " + pod.ObjectMeta.Name + " namespace: " + pod.ObjectMeta.Namespace + " for which network policies need to be applied.") nodePods[pod.Status.PodIP] = podInfo{ip: pod.Status.PodIP, name: pod.ObjectMeta.Name, namespace: pod.ObjectMeta.Namespace, @@ -971,7 +979,7 @@ func (npc *NetworkPolicyController) getEgressNetworkPolicyEnabledPods(nodeIp str } _, ok := policy.targetPods[pod.Status.PodIP] if ok && (policy.policyType == "both" || policy.policyType == "egress") { - glog.Infof("Found pod name: " + pod.ObjectMeta.Name + " namespace: " + pod.ObjectMeta.Namespace + " for which network policies need to be applied.") + glog.V(2).Infof("Found pod name: " + pod.ObjectMeta.Name + " namespace: " + pod.ObjectMeta.Namespace + " for which network policies need to be applied.") nodePods[pod.Status.PodIP] = podInfo{ip: pod.Status.PodIP, name: pod.ObjectMeta.Name, namespace: pod.ObjectMeta.Namespace, @@ -1243,7 +1251,7 @@ func getNameSpaceDefaultPolicy(namespace string) (string, error) { return annot["ingress"]["isolation"], nil } glog.Errorf("Skipping invalid network-policy for namespace \"%s\": %s", namespace, err) - return "DefaultAllow", errors.New("Invalid NetworkPolicy.") + return "DefaultAllow", errors.New("Invalid NetworkPolicy") } return "DefaultAllow", nil } @@ -1290,7 +1298,7 @@ func policyIndexedDestinationPodIpSetName(namespace, policyName string, egressRu // Cleanup 
cleanup configurations done func (npc *NetworkPolicyController) Cleanup() { - glog.Infof("Cleaning up iptables configuration permanently done by kube-router") + glog.Info("Cleaning up iptables configuration permanently done by kube-router") iptablesCmdHandler, err := iptables.New() if err != nil { @@ -1373,9 +1381,14 @@ func (npc *NetworkPolicyController) Cleanup() { // NewNetworkPolicyController returns new NetworkPolicyController object func NewNetworkPolicyController(clientset *kubernetes.Clientset, config *options.KubeRouterConfig) (*NetworkPolicyController, error) { - npc := NetworkPolicyController{} + if config.MetricsEnabled { + //Register the metrics for this controller + prometheus.MustRegister(controllerIptablesSyncTime) + npc.MetricsEnabled = true + } + npc.syncPeriod = config.IPTablesSyncPeriod npc.v1NetworkPolicy = true diff --git a/app/controllers/network_routes_controller.go b/app/controllers/network_routes_controller.go index 3a51e560..994088b3 100644 --- a/app/controllers/network_routes_controller.go +++ b/app/controllers/network_routes_controller.go @@ -28,6 +28,7 @@ import ( "github.com/osrg/gobgp/packet/bgp" gobgp "github.com/osrg/gobgp/server" "github.com/osrg/gobgp/table" + "github.com/prometheus/client_golang/prometheus" "github.com/vishvananda/netlink" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -59,6 +60,7 @@ type NetworkRoutingController struct { ipSetHandler *utils.IPSet enableOverlays bool peerMultihopTtl uint8 + MetricsEnabled bool } var ( @@ -99,7 +101,7 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr } } - glog.Info("Populating ipsets.") + glog.V(1).Info("Populating ipsets.") err = nrc.syncNodeIPSets() if err != nil { glog.Errorf("Failed initial ipset setup: %s", err) @@ -116,22 +118,22 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr // Handle ipip tunnel overlay if nrc.enableOverlays { - glog.Info("IPIP Tunnel Overlay enabled in 
configuration.") - glog.Info("Setting up overlay networking.") + glog.V(1).Info("IPIP Tunnel Overlay enabled in configuration.") + glog.V(1).Info("Setting up overlay networking.") err = nrc.enablePolicyBasedRouting() if err != nil { glog.Errorf("Failed to enable required policy based routing: %s", err.Error()) } } else { - glog.Info("IPIP Tunnel Overlay disabled in configuration.") - glog.Info("Cleaning up old overlay networking if needed.") + glog.V(1).Info("IPIP Tunnel Overlay disabled in configuration.") + glog.V(1).Info("Cleaning up old overlay networking if needed.") err = nrc.disablePolicyBasedRouting() if err != nil { glog.Errorf("Failed to disable policy based routing: %s", err.Error()) } } - glog.Info("Performing cleanup of depreciated rules/ipsets (if needed).") + glog.V(1).Info("Performing cleanup of depreciated rules/ipsets (if needed).") err = deleteBadPodEgressRules() if err != nil { glog.Errorf("Error cleaning up old/bad Pod egress rules: %s", err.Error()) @@ -139,14 +141,14 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr // Handle Pod egress masquerading configuration if nrc.enablePodEgress { - glog.Infoln("Enabling Pod egress.") + glog.V(1).Infoln("Enabling Pod egress.") err = createPodEgressRule() if err != nil { glog.Errorf("Error enabling Pod egress: %s", err.Error()) } } else { - glog.Infoln("Disabling Pod egress.") + glog.V(1).Infoln("Disabling Pod egress.") err = deletePodEgressRule() if err != nil { @@ -218,7 +220,7 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr // Update ipset entries if nrc.enablePodEgress || nrc.enableOverlays { - glog.Info("Syncing ipsets.") + glog.V(1).Info("Syncing ipsets") err := nrc.syncNodeIPSets() if err != nil { glog.Errorf("Error synchronizing ipsets: %s", err.Error()) @@ -235,7 +237,7 @@ func (nrc *NetworkRoutingController) Run(stopCh <-chan struct{}, wg *sync.WaitGr nrc.advertiseExternalIPs() } - glog.Infof("Performing periodic sync of the 
routes") + glog.V(1).Info("Performing periodic sync of the routes") err = nrc.advertiseRoute() if err != nil { glog.Errorf("Error advertising route: %s", err.Error()) @@ -272,7 +274,7 @@ func createPodEgressRule() error { } - glog.Infof("Added iptables rule to masqurade outbound traffic from pods.") + glog.V(1).Infof("Added iptables rule to masqurade outbound traffic from pods.") return nil } @@ -333,7 +335,10 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() { case ev := <-watcher.Event(): switch msg := ev.(type) { case *gobgp.WatchEventBestPath: - glog.Infof("Processing bgp route advertisement from peer") + glog.V(3).Info("Processing bgp route advertisement from peer") + if nrc.MetricsEnabled { + controllerBGPadvertisementsReceived.WithLabelValues().Add(float64(1)) + } for _, path := range msg.PathList { if path.IsLocal() { continue @@ -349,7 +354,7 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() { } func (nrc *NetworkRoutingController) advertiseClusterIPs() { - glog.Infof("Advertising cluster ips of services to the external BGP peers") + glog.V(1).Info("Advertising cluster ips of services to the external BGP peers") for _, svc := range watchers.ServiceWatcher.List() { if svc.Spec.Type == "ClusterIP" || svc.Spec.Type == "NodePort" || svc.Spec.Type == "LoadBalancer" { @@ -358,7 +363,7 @@ func (nrc *NetworkRoutingController) advertiseClusterIPs() { continue } - glog.Infof("found a service of cluster ip type") + glog.V(2).Info("found a service of cluster ip type") err := nrc.AdvertiseClusterIp(svc.Spec.ClusterIP) if err != nil { glog.Errorf("error advertising cluster IP: %q error: %v", svc.Spec.ClusterIP, err) @@ -368,7 +373,7 @@ func (nrc *NetworkRoutingController) advertiseClusterIPs() { } func (nrc *NetworkRoutingController) advertiseExternalIPs() { - glog.Infof("Advertising external ips of the services to the external BGP peers") + glog.V(2).Info("Advertising external ips of the services to the external BGP peers") for _, svc := range 
watchers.ServiceWatcher.List() { if svc.Spec.Type == "ClusterIP" || svc.Spec.Type == "NodePort" { // skip headless services @@ -399,7 +404,7 @@ func (nrc *NetworkRoutingController) advertiseRoute() error { bgp.NewPathAttributeNextHop(nrc.nodeIP.String()), } - glog.Infof("Advertising route: '%s/%s via %s' to peers", subnet, strconv.Itoa(cidrLen), nrc.nodeIP.String()) + glog.V(2).Infof("Advertising route: '%s/%s via %s' to peers", subnet, strconv.Itoa(cidrLen), nrc.nodeIP.String()) if _, err := nrc.bgpServer.AddPath("", []*table.Path{table.NewPath(nil, bgp.NewIPAddrPrefix(uint8(cidrLen), subnet), false, attrs, time.Now(), false)}); err != nil { @@ -456,7 +461,7 @@ func stringSliceToIPs(s []string) ([]net.IP, error) { for _, ipString := range s { ip := net.ParseIP(ipString) if ip == nil { - return nil, fmt.Errorf("Could not parse \"%s\" as an IP.", ipString) + return nil, fmt.Errorf("Could not parse \"%s\" as an IP", ipString) } ips = append(ips, ip) } @@ -468,7 +473,7 @@ func stringSliceToUInt32(s []string) ([]uint32, error) { for _, intString := range s { newInt, err := strconv.ParseUint(intString, 0, 32) if err != nil { - return nil, fmt.Errorf("Could not parse \"%s\" as an integer.", intString) + return nil, fmt.Errorf("Could not parse \"%s\" as an integer", intString) } ints = append(ints, uint32(newInt)) } @@ -480,7 +485,7 @@ func stringSliceB64Decode(s []string) ([]string, error) { for _, b64String := range s { decoded, err := base64.StdEncoding.DecodeString(b64String) if err != nil { - return nil, fmt.Errorf("Could not parse \"%s\" as a base64 encoded string.", + return nil, fmt.Errorf("Could not parse \"%s\" as a base64 encoded string", b64String) } ss = append(ss, string(decoded)) @@ -509,7 +514,7 @@ func newGlobalPeers(ips []net.IP, asns []uint32, passwords []string) ( for i := 0; i < len(ips); i++ { if !((asns[i] >= 64512 && asns[i] <= 65535) || (asns[i] >= 4200000000 && asns[i] <= 4294967294)) { - return nil, fmt.Errorf("Invalid ASN number \"%d\" for 
global BGP peer.", + return nil, fmt.Errorf("Invalid ASN number \"%d\" for global BGP peer", asns[i]) } @@ -575,7 +580,7 @@ func connectToExternalBGPPeers(server *gobgp.BgpServer, peerConfigs []*config.Ne return fmt.Errorf("Error peering with peer router "+ "\"%s\" due to: %s", peerConfig.NeighborAddress, err) } - glog.Infof("Successfully configured %s in ASN %v as BGP peer to the node", + glog.V(2).Infof("Successfully configured %s in ASN %v as BGP peer to the node", peerConfig.NeighborAddress, peerConfig.PeerAs) } return nil @@ -588,7 +593,7 @@ func (nrc *NetworkRoutingController) AdvertiseClusterIp(clusterIp string) error bgp.NewPathAttributeOrigin(0), bgp.NewPathAttributeNextHop(nrc.nodeIP.String()), } - glog.Infof("Advertising route: '%s/%s via %s' to peers", clusterIp, strconv.Itoa(32), nrc.nodeIP.String()) + glog.V(2).Infof("Advertising route: '%s/%s via %s' to peers", clusterIp, strconv.Itoa(32), nrc.nodeIP.String()) if _, err := nrc.bgpServer.AddPath("", []*table.Path{table.NewPath(nil, bgp.NewIPAddrPrefix(uint8(32), clusterIp), false, attrs, time.Now(), false)}); err != nil { return fmt.Errorf(err.Error()) @@ -835,10 +840,10 @@ func (nrc *NetworkRoutingController) injectRoute(path *table.Path) error { } if path.IsWithdraw { - glog.Infof("Removing route: '%s via %s' from peer in the routing table", dst, nexthop) + glog.V(2).Infof("Removing route: '%s via %s' from peer in the routing table", dst, nexthop) return netlink.RouteDel(route) } - glog.Infof("Inject route: '%s via %s' from peer to routing table", dst, nexthop) + glog.V(2).Infof("Inject route: '%s via %s' from peer to routing table", dst, nexthop) return netlink.RouteReplace(route) } @@ -927,7 +932,7 @@ func (nrc *NetworkRoutingController) syncNodeIPSets() error { glog.Infof("Creating missing ipset \"%s\"", podSubnetsIPSetName) _, err = nrc.ipSetHandler.Create(podSubnetsIPSetName, utils.OptionTimeout, "0") if err != nil { - return fmt.Errorf("ipset \"%s\" not found in controller instance.", + return 
fmt.Errorf("ipset \"%s\" not found in controller instance", podSubnetsIPSetName) } } @@ -942,7 +947,7 @@ func (nrc *NetworkRoutingController) syncNodeIPSets() error { glog.Infof("Creating missing ipset \"%s\"", nodeAddrsIPSetName) _, err = nrc.ipSetHandler.Create(nodeAddrsIPSetName, utils.OptionTimeout, "0") if err != nil { - return fmt.Errorf("ipset \"%s\" not found in controller instance.", + return fmt.Errorf("ipset \"%s\" not found in controller instance", nodeAddrsIPSetName) } } @@ -959,8 +964,12 @@ func (nrc *NetworkRoutingController) syncNodeIPSets() error { // we miss any events from API server this method which is called periodically // ensure peer relationship with removed nodes is deleted. Also update Pod subnet ipset. func (nrc *NetworkRoutingController) syncInternalPeers() { - - glog.Infof("Syncing BGP peers for the node.") + start := time.Now() + defer func() { + endTime := time.Since(start) + controllerBGPInternalPeersSyncTime.WithLabelValues().Set(float64(endTime)) + glog.V(2).Infof("Syncing BGP peers for the node took %v", endTime) + }() // get the current list of the nodes from API server nodes, err := nrc.clientset.Core().Nodes().List(metav1.ListOptions{}) @@ -969,6 +978,7 @@ func (nrc *NetworkRoutingController) syncInternalPeers() { return } + controllerBPGpeers.WithLabelValues().Set(float64(len(nodes.Items))) // establish peer and add Pod CIDRs with current set of nodes currentNodes := make([]string, 0) for _, node := range nodes.Items { @@ -1215,7 +1225,7 @@ func (nrc *NetworkRoutingController) OnNodeUpdate(nodeUpdate *watchers.NodeUpdat node := nodeUpdate.Node nodeIP, _ := utils.GetNodeIP(node) if nodeUpdate.Op == watchers.ADD { - glog.Infof("Received node %s added update from watch API so peer with new node", nodeIP) + glog.V(2).Infof("Received node %s added update from watch API so peer with new node", nodeIP) n := &config.Neighbor{ Config: config.NeighborConfig{ NeighborAddress: nodeIP.String(), @@ -1427,6 +1437,14 @@ func 
NewNetworkRoutingController(clientset *kubernetes.Clientset, var err error nrc := NetworkRoutingController{} + if kubeRouterConfig.MetricsEnabled { + //Register the metrics for this controller + prometheus.MustRegister(controllerBGPadvertisementsReceived) + prometheus.MustRegister(controllerBGPInternalPeersSyncTime) + prometheus.MustRegister(controllerBPGpeers) + nrc.MetricsEnabled = true + } + nrc.bgpFullMeshMode = kubeRouterConfig.FullMeshMode nrc.bgpEnableInternal = kubeRouterConfig.EnableiBGP nrc.bgpGracefulRestart = kubeRouterConfig.BGPGracefulRestart diff --git a/app/controllers/network_services_controller.go b/app/controllers/network_services_controller.go index 16327d90..c4e43c61 100644 --- a/app/controllers/network_services_controller.go +++ b/app/controllers/network_services_controller.go @@ -7,7 +7,6 @@ import ( "io/ioutil" "math/rand" "net" - "net/http" "os" "os/exec" "reflect" @@ -26,7 +25,6 @@ import ( "github.com/docker/libnetwork/ipvs" "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" "golang.org/x/net/context" @@ -45,57 +43,7 @@ const ( ) var ( - h *ipvs.Handle - serviceTotalConn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_total_connections", - Help: "Total incoming conntections made", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - servicePacketsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_packets_in", - Help: "Total incoming packets", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - servicePacketsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_packets_out", - Help: "Total outoging packets", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - serviceBytesIn = 
prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_bytes_in", - Help: "Total incoming bytes", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - serviceBytesOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_bytes_out", - Help: "Total outgoing bytes", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - servicePpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_pps_in", - Help: "Incoming packets per second", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - servicePpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_pps_out", - Help: "Outoging packets per second", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - serviceCPS = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_cps", - Help: "Service connections per second", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - serviceBpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_bps_in", - Help: "Incoming bytes per second", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) - serviceBpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Name: "service_bps_out", - Help: "Outoging bytes per second", - }, []string{"namespace", "service_name", "service_vip", "protocol", "port"}) + h *ipvs.Handle ) // NetworkServicesController enables local node as network service proxy through IPVS/LVS. 
@@ -118,8 +66,7 @@ type NetworkServicesController struct { globalHairpin bool client *kubernetes.Clientset nodeportBindOnAllIp bool - MetricsPort int - MetricsPath string + MetricsEnabled bool } // internal representation of kubernetes service @@ -160,27 +107,12 @@ func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitG glog.Infof("Starting network services controller") - // enable masquerade rule + // enable masquerad rule err := ensureMasqueradeIptablesRule(nsc.masqueradeAll, nsc.podCidr) if err != nil { - return errors.New("Failed to do add masqurade rule in POSTROUTING chain of nat table due to: %s" + err.Error()) + return errors.New("Failed to do add masquerad rule in POSTROUTING chain of nat table due to: %s" + err.Error()) } - // register metrics - prometheus.MustRegister(serviceBpsIn) - prometheus.MustRegister(serviceBpsOut) - prometheus.MustRegister(serviceBytesIn) - prometheus.MustRegister(serviceBytesOut) - prometheus.MustRegister(serviceCPS) - prometheus.MustRegister(servicePacketsIn) - prometheus.MustRegister(servicePacketsOut) - prometheus.MustRegister(servicePpsIn) - prometheus.MustRegister(servicePpsOut) - prometheus.MustRegister(serviceTotalConn) - - http.Handle(nsc.MetricsPath, promhttp.Handler()) - go http.ListenAndServe(":"+strconv.Itoa(nsc.MetricsPort), nil) - // enable ipvs connection tracking err = ensureIpvsConntrack() if err != nil { @@ -191,13 +123,13 @@ func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitG for { select { case <-stopCh: - glog.Infof("Shutting down network services controller") + glog.Info("Shutting down network services controller") return nil default: } if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() { - glog.Infof("Performing periodic syn of the ipvs services and server to reflect desired state of kubernetes services and endpoints") + glog.V(1).Info("Performing periodic sync of ipvs services") nsc.sync() } else { continue @@ -205,7 +137,7 
@@ func (nsc *NetworkServicesController) Run(stopCh <-chan struct{}, wg *sync.WaitG select { case <-stopCh: - glog.Infof("Shutting down network services controller") + glog.Info("Shutting down network services controller") return nil case <-t.C: } @@ -223,7 +155,77 @@ func (nsc *NetworkServicesController) sync() { glog.Errorf("Error syncing hairpin iptable rules: %s", err.Error()) } nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap) - nsc.publishMetrics(nsc.serviceMap) + if nsc.MetricsEnabled { + nsc.publishMetrics(nsc.serviceMap) + } +} + +func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error { + start := time.Now() + defer func() { + endTime := time.Since(start) + glog.V(2).Infof("Publishing IPVS metrics took %v", endTime) + controllerIpvsMetricsExportTime.WithLabelValues().Set(float64(endTime)) + }() + + ipvsSvcs, err := h.GetServices() + if err != nil { + return errors.New("Failed to list IPVS services: " + err.Error()) + } + + glog.V(1).Info("Publishing IPVS metrics") + for _, svc := range serviceInfoMap { + var protocol uint16 + var pushMetric bool + var svcVip string + + switch aProtocol := svc.protocol; aProtocol { + case "tcp": + protocol = syscall.IPPROTO_TCP + case "udp": + protocol = syscall.IPPROTO_UDP + default: + protocol = syscall.IPPROTO_NONE + } + for _, ipvsSvc := range ipvsSvcs { + + switch svcAddress := ipvsSvc.Address.String(); svcAddress { + case svc.clusterIP.String(): + if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port { + pushMetric = true + svcVip = svc.clusterIP.String() + } else { + pushMetric = false + } + case nsc.nodeIP.String(): + if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port { + pushMetric = true + svcVip = nsc.nodeIP.String() + } else { + pushMetric = false + } + default: + svcVip = "" + pushMetric = false + } + + if pushMetric { + glog.V(3).Infof("Publishing metrics for %s/%s (%s:%d/%s)", svc.namespace, svc.name, svcVip, svc.port, svc.protocol) + 
serviceBpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BPSIn)) + serviceBpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BPSOut)) + serviceBytesIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BytesIn)) + serviceBytesOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BytesOut)) + serviceCPS.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.CPS)) + servicePacketsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PacketsIn)) + servicePacketsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PacketsOut)) + servicePpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSIn)) + servicePpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSOut)) + serviceTotalConn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.Connections)) + controllerIpvsServices.WithLabelValues().Set(float64(len(ipvsSvcs))) + } + } + } + return nil } // OnEndpointsUpdate handle change in endpoints update from the API server @@ -232,9 +234,9 @@ func (nsc *NetworkServicesController) OnEndpointsUpdate(endpointsUpdate *watcher nsc.mu.Lock() defer nsc.mu.Unlock() - glog.Infof("Received endpoints update from watch API") + glog.V(1).Info("Received endpoints update from watch API") if !(watchers.ServiceWatcher.HasSynced() && watchers.EndpointsWatcher.HasSynced()) { - glog.Infof("Skipping ipvs server sync as local cache is not synced yet") + 
glog.V(1).Info("Skipping ipvs server sync as local cache is not synced yet") } // build new endpoints map to reflect the change @@ -244,7 +246,7 @@ func (nsc *NetworkServicesController) OnEndpointsUpdate(endpointsUpdate *watcher nsc.endpointsMap = newEndpointsMap nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap) } else { - glog.Infof("Skipping ipvs server sync on endpoints update because nothing changed") + glog.V(1).Info("Skipping ipvs server sync on endpoints because nothing changed") } } @@ -254,9 +256,9 @@ func (nsc *NetworkServicesController) OnServiceUpdate(serviceUpdate *watchers.Se nsc.mu.Lock() defer nsc.mu.Unlock() - glog.Infof("Received service update from watch API") + glog.V(1).Info("Received service update from watch API") if !(watchers.ServiceWatcher.HasSynced() && watchers.EndpointsWatcher.HasSynced()) { - glog.Infof("Skipping ipvs server sync as local cache is not synced yet") + glog.V(1).Info("Skipping ipvs server sync as local cache is not synced yet") } // build new services map to reflect the change @@ -266,7 +268,7 @@ func (nsc *NetworkServicesController) OnServiceUpdate(serviceUpdate *watchers.Se nsc.serviceMap = newServiceMap nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap) } else { - glog.Infof("Skipping ipvs server sync on service update because nothing changed") + glog.V(1).Info("Skipping ipvs server sync on service update because nothing changed") } } @@ -281,8 +283,13 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf var ipvsSvcs []*ipvs.Service start := time.Now() + defer func() { - glog.Infof("sync ipvs servers took %v", time.Since(start)) + endTime := time.Since(start) + if nsc.MetricsEnabled { + controllerIpvsServicesSyncTime.WithLabelValues().Set(float64(endTime)) + } + glog.V(1).Infof("sync ipvs services took %v", endTime) }() dummyVipInterface, err := getKubeDummyInterface() @@ -290,20 +297,20 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf return 
errors.New("Failed creating dummy interface: " + err.Error()) } - glog.Infof("Setting up policy routing required for Direct Server Return functionality.") + glog.V(1).Infof("Setting up policy routing required for Direct Server Return functionality.") err = setupPolicyRoutingForDSR() if err != nil { return errors.New("Failed setup PBR for DSR due to: " + err.Error()) } - glog.Infof("Custom routing table " + customDSRRouteTableName + " required for Direct Server Return is setup as expected.") + glog.V(1).Infof("Custom routing table " + customDSRRouteTableName + " required for Direct Server Return is setup as expected.") - glog.Infof("Setting up custom route table required to add routes for external IP's.") + glog.V(1).Infof("Setting up custom route table required to add routes for external IP's.") err = setupRoutesForExternalIPForDSR(serviceInfoMap) if err != nil { glog.Errorf("Failed setup custom routing table required to add routes for external IP's due to: " + err.Error()) return errors.New("Failed setup custom routing table required to add routes for external IP's due to: " + err.Error()) } - glog.Infof("Custom routing table " + externalIPRouteTableName + " required for Direct Server Return is setup as expected.") + glog.V(1).Infof("Custom routing table " + externalIPRouteTableName + " required for Direct Server Return is setup as expected.") // map of active services and service endpoints activeServiceEndpointMap := make(map[string][]string) @@ -315,10 +322,14 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf for k, svc := range serviceInfoMap { var protocol uint16 - if svc.protocol == "tcp" { + + switch aProtocol := svc.protocol; aProtocol { + case "tcp": protocol = syscall.IPPROTO_TCP - } else { + case "udp": protocol = syscall.IPPROTO_UDP + default: + protocol = syscall.IPPROTO_NONE } // assign cluster IP of the service to the dummy interface so that its routable from the pod's on the node @@ -423,7 +434,7 @@ func (nsc 
*NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf err = routeVIPTrafficToDirector("0x" + fmt.Sprintf("%x", fwMark)) if err != nil { glog.Errorf("Failed to setup ip rule to lookup traffic to external IP: %s through custom "+ - "route table due to ", externalIP, err.Error()) + "route table due to %s", externalIP, err.Error()) continue } } else { @@ -530,7 +541,7 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf } // cleanup stale ipvs service and servers - glog.Infof("Cleaning up if any, old ipvs service and servers which are no longer needed") + glog.V(1).Info("Cleaning up if any, old ipvs service and servers which are no longer needed") ipvsSvcs, err = h.GetServices() if err != nil { @@ -554,11 +565,11 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf endpoints, ok := activeServiceEndpointMap[key] if !ok { - glog.Infof("Found a IPVS service %s which is no longer needed so cleaning up", + glog.V(1).Infof("Found a IPVS service %s which is no longer needed so cleaning up", ipvsServiceString(ipvsSvc)) err := h.DelService(ipvsSvc) if err != nil { - glog.Errorf("Failed to delete stale IPVS service %s due to:", + glog.Errorf("Failed to delete stale IPVS service %s due to: %s", ipvsServiceString(ipvsSvc), err.Error()) continue } @@ -576,7 +587,7 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf } } if !validEp { - glog.Infof("Found a destination %s in service %s which is no longer needed so cleaning up", + glog.V(1).Infof("Found a destination %s in service %s which is no longer needed so cleaning up", ipvsDestinationString(dst), ipvsServiceString(ipvsSvc)) err := h.DelDestination(ipvsSvc, dst) if err != nil { @@ -590,13 +601,13 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf if err != nil { glog.Error("Failed to delete conntrack entry for endpoint: " + dst.Address.String() + ":" + strconv.Itoa(int(dst.Port)) + " due 
to " + err.Error()) } - glog.Infof("Deleted conntrack entry for endpoint: " + dst.Address.String() + ":" + strconv.Itoa(int(dst.Port))) + glog.V(1).Infof("Deleted conntrack entry for endpoint: " + dst.Address.String() + ":" + strconv.Itoa(int(dst.Port))) } } } } } - glog.Infof("IPVS servers and services are synced to desired state!!") + glog.V(1).Info("IPVS servers and services are synced to desired state") return nil } @@ -636,7 +647,7 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er defer hostNetworkNamespaceHandle.Close() activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace before netns.Set: " + activeNetworkNamespaceHandle.String()) + glog.V(1).Infof("Current network namespace before netns.Set: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() client, err := client.NewEnvClient() @@ -662,7 +673,7 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er } activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after netns.Set to container network namespace: " + activeNetworkNamespaceHandle.String()) + glog.V(2).Infof("Current network namespace after netns. Set to container network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() // TODO: fix boilerplate `netns.Set(hostNetworkNamespaceHandle)` code. 
Need a robust @@ -674,12 +685,12 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er if err.Error() != IFACE_NOT_FOUND { netns.Set(hostNetworkNamespaceHandle) activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) + glog.V(2).Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() return errors.New("Failed to verify if ipip tunnel interface exists in endpoint " + endpointIP + " namespace due to " + err.Error()) } - glog.Infof("Could not find tunnel interface " + KUBE_TUNNEL_IF + " in endpoint " + endpointIP + " so creating one.") + glog.V(2).Infof("Could not find tunnel interface " + KUBE_TUNNEL_IF + " in endpoint " + endpointIP + " so creating one.") ipTunLink := netlink.Iptun{ LinkAttrs: netlink.LinkAttrs{Name: KUBE_TUNNEL_IF}, Local: net.ParseIP(endpointIP), @@ -688,7 +699,7 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er if err != nil { netns.Set(hostNetworkNamespaceHandle) activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) + glog.V(2).Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() return errors.New("Failed to add ipip tunnel interface in endpoint namespace due to " + err.Error()) } @@ -706,12 +717,12 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er if err != nil { netns.Set(hostNetworkNamespaceHandle) activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) + 
glog.V(2).Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() return errors.New("Failed to get " + KUBE_TUNNEL_IF + " tunnel interface handle due to " + err.Error()) } - glog.Infof("Successfully created tunnel interface " + KUBE_TUNNEL_IF + " in endpoint " + endpointIP + ".") + glog.V(2).Infof("Successfully created tunnel interface " + KUBE_TUNNEL_IF + " in endpoint " + endpointIP + ".") } // bring the tunnel interface up @@ -731,7 +742,7 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er if err != nil && err.Error() != IFACE_HAS_ADDR { netns.Set(hostNetworkNamespaceHandle) activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) + glog.V(2).Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() return errors.New("Failed to assign vip " + vip + " to kube-tunnel-if interface ") } @@ -760,7 +771,7 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er if err != nil { netns.Set(hostNetworkNamespaceHandle) activeNetworkNamespaceHandle, err = netns.Get() - glog.Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) + glog.V(2).Infof("Current network namespace after revert namespace to host network namespace: " + activeNetworkNamespaceHandle.String()) activeNetworkNamespaceHandle.Close() return errors.New("Failed to disable rp_filter on `all` in the endpoint container") } @@ -774,77 +785,17 @@ func prepareEndpointForDsr(containerId string, endpointIP string, vip string) er return nil } -func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error { - ipvsSvcs, err := 
h.GetServices() - if err != nil { - return errors.New("Failed to list IPVS services: " + err.Error()) - } - - for _, svc := range serviceInfoMap { - var protocol uint16 - var pushMetric bool - var svcVip string - - switch aProtocol := svc.protocol; aProtocol { - case "tcp": - protocol = syscall.IPPROTO_TCP - case "udp": - protocol = syscall.IPPROTO_UDP - default: - protocol = syscall.IPPROTO_NONE - } - glog.Info("Publishing Prometheus metrics") - for _, ipvsSvc := range ipvsSvcs { - - switch svcAddress := ipvsSvc.Address.String(); svcAddress { - case svc.clusterIP.String(): - if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port { - pushMetric = true - svcVip = svc.clusterIP.String() - } else { - pushMetric = false - } - case nsc.nodeIP.String(): - if protocol == ipvsSvc.Protocol && uint16(svc.port) == ipvsSvc.Port { - pushMetric = true - svcVip = nsc.nodeIP.String() - } else { - pushMetric = false - } - default: - svcVip = "" - pushMetric = false - } - - if pushMetric { - glog.V(3).Infof("Publishing metrics for %s/%s (%s:%d/%s)", svc.namespace, svc.name, svcVip, svc.port, svc.protocol) - serviceBpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BPSIn)) - serviceBpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BPSOut)) - serviceBytesIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BytesIn)) - serviceBytesOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.BytesOut)) - serviceCPS.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.CPS)) - servicePacketsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PacketsIn)) - 
servicePacketsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PacketsOut)) - servicePpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSIn)) - servicePpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSOut)) - serviceTotalConn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.Connections)) - } - } - } - return nil -} - func buildServicesInfo() serviceInfoMap { serviceMap := make(serviceInfoMap) for _, svc := range watchers.ServiceWatcher.List() { if svc.Spec.ClusterIP == "None" || svc.Spec.ClusterIP == "" { - glog.Infof("Skipping service name:%s namespace:%s as there is no cluster IP", svc.Name, svc.Namespace) + glog.V(2).Infof("Skipping service name:%s namespace:%s as there is no cluster IP", svc.Name, svc.Namespace) continue } if svc.Spec.Type == "ExternalName" { - glog.Infof("Skipping service name:%s namespace:%s due to service Type=%s", svc.Name, svc.Namespace, svc.Spec.Type) + glog.V(2).Infof("Skipping service name:%s namespace:%s due to service Type=%s", svc.Name, svc.Namespace, svc.Spec.Type) continue } @@ -914,7 +865,7 @@ func buildEndpointsInfo() endpointsInfoMap { return endpointsMap } -// Add an iptable rule to masqurade outbound IPVS traffic. IPVS nat requires that reverse path traffic +// Add an iptable rule to masquerad outbound IPVS traffic. IPVS nat requires that reverse path traffic // to go through the director for its functioning. 
So the masquerade rule ensures source IP is modifed // to node ip, so return traffic from real server (endpoint pods) hits the node/lvs director func ensureMasqueradeIptablesRule(masqueradeAll bool, podCidr string) error { @@ -939,7 +890,7 @@ func ensureMasqueradeIptablesRule(masqueradeAll bool, podCidr string) error { return errors.New("Failed to run iptables command" + err.Error()) } } - glog.Infof("Successfully added iptables masqurade rule") + glog.V(1).Info("Successfully added iptables masquerad rule") return nil } @@ -974,7 +925,7 @@ func (nsc *NetworkServicesController) syncHairpinIptablesRules() error { // Cleanup (if needed) and return if there's no hairpin-mode Services if len(rulesNeeded) == 0 { - glog.Infof("No hairpin-mode enabled services found -- no hairpin rules created") + glog.V(1).Info("No hairpin-mode enabled services found -- no hairpin rules created") err := deleteHairpinIptablesRules() if err != nil { return errors.New("Error deleting hairpin rules: " + err.Error()) @@ -1047,14 +998,14 @@ func (nsc *NetworkServicesController) syncHairpinIptablesRules() error { if err != nil { glog.Errorf("Unable to delete hairpin rule \"%s\" from chain %s: %e", ruleFromNode, hairpinChain, err) } else { - glog.Info("Deleted invalid/outdated hairpin rule \"%s\" from chain %s", ruleFromNode, hairpinChain) + glog.V(1).Info("Deleted invalid/outdated hairpin rule \"%s\" from chain %s", ruleFromNode, hairpinChain) } } else { // Ignore the chain creation rule if ruleFromNode == "-N "+hairpinChain { continue } - glog.Infof("Not removing invalid hairpin rule \"%s\" from chain %s", ruleFromNode, hairpinChain) + glog.V(1).Infof("Not removing invalid hairpin rule \"%s\" from chain %s", ruleFromNode, hairpinChain) } } } @@ -1120,7 +1071,7 @@ func deleteHairpinIptablesRules() error { if err != nil { glog.Errorf("Unable to delete hairpin jump rule from chain \"POSTROUTING\": %e", err) } else { - glog.Info("Deleted hairpin jump rule from chain \"POSTROUTING\"") + 
glog.V(1).Info("Deleted hairpin jump rule from chain \"POSTROUTING\"") } } @@ -1157,7 +1108,7 @@ func deleteMasqueradeIptablesRule() error { if err != nil { return errors.New("Failed to run iptables command" + err.Error()) } - glog.Infof("Deleted iptables masquerade rule: %s", rule) + glog.V(2).Infof("Deleted iptables masquerade rule: %s", rule) break } } @@ -1220,7 +1171,7 @@ func ipvsAddService(svcs []*ipvs.Service, vip net.IP, protocol, port uint16, per if err != nil { return nil, err } - glog.Infof("Updated persistence/session-affinity for service: %s", ipvsServiceString(svc)) + glog.V(2).Infof("Updated persistence/session-affinity for service: %s", ipvsServiceString(svc)) } if scheduler != svc.SchedName { @@ -1229,7 +1180,7 @@ func ipvsAddService(svcs []*ipvs.Service, vip net.IP, protocol, port uint16, per if err != nil { return nil, errors.New("Failed to update the scheduler for the service due to " + err.Error()) } - glog.Infof("Updated schedule for the service: %s", ipvsServiceString(svc)) + glog.V(2).Infof("Updated schedule for the service: %s", ipvsServiceString(svc)) } // TODO: Make this debug output when we get log levels // glog.Fatal("ipvs service %s:%s:%s already exists so returning", vip.String(), @@ -1253,7 +1204,7 @@ func ipvsAddService(svcs []*ipvs.Service, vip net.IP, protocol, port uint16, per if err != nil { return nil, err } - glog.Infof("Successfully added service: %s", ipvsServiceString(&svc)) + glog.V(1).Infof("Successfully added service: %s", ipvsServiceString(&svc)) return &svc, nil } @@ -1296,7 +1247,7 @@ func ipvsAddFWMarkService(vip net.IP, protocol, port uint16, persistent bool, sc if err != nil { return nil, err } - glog.Infof("Updated persistence/session-affinity for service: %s", ipvsServiceString(svc)) + glog.V(2).Infof("Updated persistence/session-affinity for service: %s", ipvsServiceString(svc)) } if scheduler != svc.SchedName { @@ -1305,7 +1256,7 @@ func ipvsAddFWMarkService(vip net.IP, protocol, port uint16, persistent bool, 
sc if err != nil { return nil, errors.New("Failed to update the scheduler for the service due to " + err.Error()) } - glog.Infof("Updated schedule for the service: %s", ipvsServiceString(svc)) + glog.V(2).Infof("Updated schedule for the service: %s", ipvsServiceString(svc)) } // TODO: Make this debug output when we get log levels // glog.Fatal("ipvs service %s:%s:%s already exists so returning", vip.String(), @@ -1349,7 +1300,7 @@ func ipvsAddServer(service *ipvs.Service, dest *ipvs.Destination, local bool, po err := h.NewDestination(service, dest) if err == nil { - glog.Infof("Successfully added destination %s to the service %s", + glog.V(2).Infof("Successfully added destination %s to the service %s", ipvsDestinationString(dest), ipvsServiceString(service)) return nil } @@ -1357,14 +1308,14 @@ func ipvsAddServer(service *ipvs.Service, dest *ipvs.Destination, local bool, po if strings.Contains(err.Error(), IPVS_SERVER_EXISTS) { err = h.UpdateDestination(service, dest) if err != nil { - return fmt.Errorf("Failed to update ipvs destination %s to the ipvs service %s due to : %s", dest.Address, + return fmt.Errorf("Failed to update ipvs destination %s to the ipvs service %s due to : %s", ipvsDestinationString(dest), ipvsServiceString(service), err.Error()) } // TODO: Make this debug output when we get log levels // glog.Infof("ipvs destination %s already exists in the ipvs service %s so not adding destination", // ipvsDestinationString(dest), ipvsServiceString(service)) } else { - return fmt.Errorf("Failed to add ipvs destination %s to the ipvs service %s due to : %s", dest.Address, + return fmt.Errorf("Failed to add ipvs destination %s to the ipvs service %s due to : %s", ipvsDestinationString(dest), ipvsServiceString(service), err.Error()) } return nil @@ -1567,7 +1518,7 @@ func getKubeDummyInterface() (netlink.Link, error) { var dummyVipInterface netlink.Link dummyVipInterface, err := netlink.LinkByName(KUBE_DUMMY_IF) if err != nil && err.Error() == IFACE_NOT_FOUND 
{ - glog.Infof("Could not find dummy interface: " + KUBE_DUMMY_IF + " to assign cluster ip's, so creating one") + glog.V(1).Infof("Could not find dummy interface: " + KUBE_DUMMY_IF + " to assign cluster ip's, creating one") err = netlink.LinkAdd(&netlink.Dummy{netlink.LinkAttrs{Name: KUBE_DUMMY_IF}}) if err != nil { return nil, errors.New("Failed to add dummy interface: " + err.Error()) @@ -1594,7 +1545,7 @@ func (nsc *NetworkServicesController) Cleanup() { handle.Close() - // cleanup iptable masqurade rule + // cleanup iptable masquerad rule err = deleteMasqueradeIptablesRule() if err != nil { glog.Errorf("Failed to cleanup iptable masquerade rule due to: %s", err.Error()) @@ -1635,10 +1586,26 @@ func NewNetworkServicesController(clientset *kubernetes.Clientset, config *optio // &h = handle nsc := NetworkServicesController{} + + if config.MetricsEnabled { + //Register the metrics for this controller + prometheus.MustRegister(controllerIpvsServices) + prometheus.MustRegister(controllerIpvsServicesSyncTime) + prometheus.MustRegister(serviceBpsIn) + prometheus.MustRegister(serviceBpsOut) + prometheus.MustRegister(serviceBytesIn) + prometheus.MustRegister(serviceBytesOut) + prometheus.MustRegister(serviceCPS) + prometheus.MustRegister(servicePacketsIn) + prometheus.MustRegister(servicePacketsOut) + prometheus.MustRegister(servicePpsIn) + prometheus.MustRegister(servicePpsOut) + prometheus.MustRegister(serviceTotalConn) + nsc.MetricsEnabled = true + } + nsc.syncPeriod = config.IpvsSyncPeriod nsc.globalHairpin = config.GlobalHairpinMode - nsc.MetricsPort = config.MetricsPort - nsc.MetricsPath = config.MetricsPath nsc.serviceMap = make(serviceInfoMap) nsc.endpointsMap = make(endpointsInfoMap) diff --git a/app/options/options.go b/app/options/options.go index d68aed01..ab6a33a1 100755 --- a/app/options/options.go +++ b/app/options/options.go @@ -37,8 +37,10 @@ type KubeRouterConfig struct { EnableOverlay bool PeerPasswords []string EnablePprof bool - MetricsPort int + 
MetricsEnabled bool + MetricsPort uint16 MetricsPath string + VLevel string // FullMeshPassword string } @@ -111,9 +113,10 @@ func (s *KubeRouterConfig) AddFlags(fs *pflag.FlagSet) { "Password for authenticating against the BGP peer defined with \"--peer-router-ips\".") fs.BoolVar(&s.EnablePprof, "enable-pprof", false, "Enables pprof for debugging performance and memory leak issues.") - fs.IntVar(&s.MetricsPort, "metrics-port", 8080, "Prometheus metrics port") + fs.Uint16Var(&s.MetricsPort, "metrics-port", 0, "Prometheus metrics port, 0 = Disabled") fs.StringVar(&s.MetricsPath, "metrics-path", "/metrics", "Prometheus metrics path") - // fs.StringVar(&s.FullMeshPassword, "nodes-full-mesh-password", s.FullMeshPassword, // "Password that cluster-node BGP servers will use to authenticate one another when \"--nodes-full-mesh\" is set.") + fs.StringVarP(&s.VLevel, "v", "v", "0", "log level for V logs") + } diff --git a/app/server.go b/app/server.go index 8fdab0f7..f04a7112 100644 --- a/app/server.go +++ b/app/server.go @@ -121,10 +121,25 @@ func (kr *KubeRouter) Run() error { } if !(kr.Config.RunFirewall || kr.Config.RunServiceProxy || kr.Config.RunRouter) { - glog.Infof("None of router, firewall, service proxy functionality was specified to be run. So exiting") + glog.Info("Router, Firewall or Service proxy functionality must be specified. 
Exiting!") os.Exit(0) } + if (kr.Config.MetricsPort > 0) && (kr.Config.MetricsPort <= 65535) { + kr.Config.MetricsEnabled = true + mc, err := controllers.NewMetricsController(kr.Client, kr.Config) + if err != nil { + return errors.New("Failed to create metrics controller: " + err.Error()) + } + wg.Add(1) + go mc.Run(stopCh, &wg) + } else if kr.Config.MetricsPort > 65535 { + glog.Errorf("Metrics port must be over 0 and under 65535, given port: %d", kr.Config.MetricsPort) + kr.Config.MetricsEnabled = false + } else { + kr.Config.MetricsEnabled = false + } + if kr.Config.RunFirewall { npc, err := controllers.NewNetworkPolicyController(kr.Client, kr.Config) if err != nil { diff --git a/dashboard/dashboard.png b/dashboard/dashboard.png index 9f450635..3cae5fd7 100644 Binary files a/dashboard/dashboard.png and b/dashboard/dashboard.png differ diff --git a/dashboard/kube-router.json b/dashboard/kube-router.json index 623d968c..85fafa33 100644 --- a/dashboard/kube-router.json +++ b/dashboard/kube-router.json @@ -44,7 +44,7 @@ }, "editable": true, "gnetId": null, - "graphTooltip": 2, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [], @@ -52,7 +52,7 @@ "rows": [ { "collapse": false, - "height": 300, + "height": 139, "panels": [ { "aliasColors": {}, @@ -93,7 +93,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(kube_router_service_cps) by (namespace, service_name, protocol, port)", + "expr": "sum(kube_router_service_cps{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"}) by (namespace, service_name, protocol, port)", "format": "time_series", "instant": false, "intervalFactor": 2, @@ -104,7 +104,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "CPS", + "title": "Connections", "tooltip": { "shared": true, "sort": 0, @@ -121,7 +121,7 @@ "yaxes": [ { "format": "short", - "label": null, + "label": "Per second", "logBase": 1, "max": null, "min": null, @@ -133,7 +133,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + 
"show": false } ] }, @@ -171,7 +171,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(kube_router_service_bps_in) by (instance)", + "expr": "sum(kube_router_service_bps_in{instance=~\"[[selnode]]\"}) by (instance)", "format": "time_series", "instant": false, "intervalFactor": 2, @@ -179,7 +179,7 @@ "refId": "A" }, { - "expr": "- sum(kube_router_service_bps_out) by (instance)", + "expr": "- sum(kube_router_service_bps_out{instance=~\"[[selnode]]\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "out: {{instance}}", @@ -206,7 +206,7 @@ "yaxes": [ { "format": "Bps", - "label": null, + "label": "Rate", "logBase": 1, "max": null, "min": null, @@ -218,7 +218,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] } @@ -232,7 +232,7 @@ }, { "collapse": false, - "height": 337, + "height": 158, "panels": [ { "aliasColors": {}, @@ -272,14 +272,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(kube_router_service_pps_in) by (namespace, service_name, protocol, port)", + "expr": "sum(kube_router_service_pps_in{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"}) by (namespace, service_name, protocol, port)", "format": "time_series", "intervalFactor": 2, "legendFormat": "in {{namespace}}/{{service_name}} {{port}}/{{protocol}}", "refId": "A" }, { - "expr": "- sum(kube_router_service_pps_out) by (namespace, service_name, protocol, port)", + "expr": "- sum(kube_router_service_pps_out{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"}) by (namespace, service_name, protocol, port)", "format": "time_series", "intervalFactor": 2, "legendFormat": "out {{namespace}}/{{service_name}} {{port}}/{{protocol}}", @@ -289,7 +289,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "PPS", + "title": "Packets", "tooltip": { "shared": true, "sort": 0, @@ -306,7 +306,7 @@ "yaxes": [ { "format": "pps", - "label": null, + "label": "Per second", "logBase": 1, "max": null, "min": null, @@ -358,7 
+358,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(kube_router_service_bps_in) by (namespace, service_name, protocol, port)", + "expr": "sum(kube_router_service_bps_in{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"}) by (namespace, service_name, protocol, port)", "format": "time_series", "instant": false, "intervalFactor": 2, @@ -366,7 +366,7 @@ "refId": "A" }, { - "expr": "- sum(kube_router_service_bps_out) by (namespace, service_name, protocol, port)", + "expr": "- sum(kube_router_service_bps_out{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"}) by (namespace, service_name, protocol, port)", "format": "time_series", "instant": false, "intervalFactor": 2, @@ -377,7 +377,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "BPS", + "title": "Traffic", "tooltip": { "shared": true, "sort": 0, @@ -394,11 +394,323 @@ "yaxes": [ { "format": "Bps", + "label": "Rate", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 164, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_router_controller_iptables_sync_time{instance=~\"[[selnode]]\"}) by (instance)", + "format": "time_series", 
+ "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Iptables sync", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ns", + "label": "Time", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_router_controller_ipvs_services_sync_time{instance=~\"[[selnode]]\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Ipvs service sync", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ns", + "label": "Time", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_router_controller_bgp_internal_peers_sync_time{instance=~\"[[selnode]]\"}) by (instance)", + "format": "time_series", + "intervalFactor": 5, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "BGP peer sync", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ns", + "label": "Time", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_router_controller_ipvs_metrics_export_time{instance=~\"[[selnode]]\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + 
], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Metrics export time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ns", + "label": "Time", + "logBase": 1, + "max": null, + "min": null, + "show": true }, { "format": "short", @@ -420,7 +732,248 @@ }, { "collapse": false, - "height": 281, + "height": 247, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_router_controller_ipvs_services{instance=~\"[[selnode]]\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Ipvs services", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "Per node", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 20, + "legend": { + "avg": 
false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "\nsort_desc(sum(kube_router_controller_bgp_peers{instance=~\"[[selnode]]\"}) by (instance))", + "format": "time_series", + "instant": false, + "intervalFactor": 5, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "BGP peers 5m avg", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Peers", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kube_router_controller_bgp_advertisements_received{instance=~\"[[selnode]]\"}[1m])) by (instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "BGP announcements received 5m avg", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Rate", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 177, "panels": [ { "aliasColors": {}, @@ -460,7 +1013,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(kube_router_service_bytes_in > 0) by (namespace, service_name, protocol, port))", + "expr": "sort_desc(sum(kube_router_service_bytes_in{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"} > 0) by (namespace, service_name, protocol, port))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -471,7 +1024,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total Bytes In", + "title": "Bytes In", "tooltip": { "shared": false, "sort": 0, @@ -489,8 +1042,9 @@ }, "yaxes": [ { + "decimals": null, "format": "decbytes", - "label": null, + "label": "Total", "logBase": 10, "max": null, "min": null, @@ -544,7 +1098,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(kube_router_service_bytes_out > 0) by (namespace, service_name, protocol, port))", + "expr": "sort_desc(sum(kube_router_service_bytes_out{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"} > 0) by (namespace, service_name, protocol, port))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -553,9 +1107,9 @@ } ], "thresholds": [], - "timeFrom": "1m", + "timeFrom": null, 
"timeShift": null, - "title": "Total Bytes Out", + "title": "Bytes Out", "tooltip": { "shared": false, "sort": 0, @@ -574,7 +1128,7 @@ "yaxes": [ { "format": "bytes", - "label": null, + "label": "Total", "logBase": 10, "max": null, "min": null, @@ -600,7 +1154,7 @@ }, { "collapse": false, - "height": 257, + "height": 154, "panels": [ { "aliasColors": {}, @@ -640,7 +1194,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(kube_router_service_packets_in > 0) by (namespace, service_name, protocol, port))", + "expr": "sort_desc(sum(kube_router_service_packets_in{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"} > 0) by (namespace, service_name, protocol, port))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -651,7 +1205,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total Packets In", + "title": "Packets In", "tooltip": { "shared": false, "sort": 0, @@ -670,7 +1224,7 @@ "yaxes": [ { "format": "short", - "label": null, + "label": "Total", "logBase": 10, "max": null, "min": null, @@ -724,7 +1278,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(kube_router_service_packets_out > 0) by (namespace, service_name, protocol, port))", + "expr": "sort_desc(sum(kube_router_service_packets_out{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"} > 0) by (namespace, service_name, protocol, port))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -735,7 +1289,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total Packets Out", + "title": "Packets Out", "tooltip": { "shared": false, "sort": 0, @@ -754,7 +1308,7 @@ "yaxes": [ { "format": "short", - "label": null, + "label": "Total", "logBase": 10, "max": null, "min": null, @@ -821,7 +1375,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(kube_router_service_total_connections > 0) by (namespace, service_name, protocol, port))", + "expr": 
"sort_desc(sum(kube_router_service_total_connections{namespace=~\"[[selnamespace]]\",instance=~\"[[selnode]]\"} > 0) by (namespace, service_name, protocol, port))", "format": "time_series", "instant": true, "intervalFactor": 3, @@ -832,7 +1386,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total Connections", + "title": "Connections", "tooltip": { "shared": false, "sort": 0, @@ -851,7 +1405,7 @@ "yaxes": [ { "format": "short", - "label": null, + "label": "Total", "logBase": 10, "max": null, "min": null, @@ -883,10 +1437,51 @@ "network" ], "templating": { - "list": [] + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "selnamespace", + "options": [], + "query": "label_values(kube_router_service_total_connections, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "selnode", + "options": [], + "query": "label_values(kube_router_service_total_connections, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] }, "time": { - "from": "now-30m", + "from": "now-5m", "to": "now" }, "timepicker": { @@ -916,5 +1511,5 @@ }, "timezone": "browser", "title": "kube-router", - "version": 27 + "version": 48 } \ No newline at end of file diff --git a/kube-router.go b/kube-router.go index 6ceb9b93..989bdb7a 100644 --- a/kube-router.go +++ b/kube-router.go @@ -24,6 +24,7 @@ func main() { flag.CommandLine.Parse([]string{}) flag.Set("logtostderr", "true") + flag.Set("v", config.VLevel) if config.HelpRequested { pflag.Usage()