From e53aef280c3fdce7ba858c56777c4f79427b80b6 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson Date: Sun, 4 Feb 2018 21:25:49 +0100 Subject: [PATCH] more work on healthchecks --- app/controllers/health_controller.go | 49 +++++++++++++------ app/controllers/network_policy_controller.go | 4 +- .../network_services_controller.go | 23 ++++++--- app/server.go | 2 + 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/app/controllers/health_controller.go b/app/controllers/health_controller.go index 3cddf5a3..6008921c 100644 --- a/app/controllers/health_controller.go +++ b/app/controllers/health_controller.go @@ -1,6 +1,7 @@ package controllers import ( + "fmt" "net/http" "strconv" "sync" @@ -42,15 +43,23 @@ func sendHeartBeat(channel chan<- *ControllerHeartbeat, controller string) { func (hc *HealthController) Handler(w http.ResponseWriter, req *http.Request) { if hc.Status.Healthy { w.WriteHeader(http.StatusOK) - w.Write([]byte("These aren't the droids you're looking for\n")) + w.Write([]byte("OK\n")) } else { w.WriteHeader(http.StatusInternalServerError) - w.Write([]byte("These are the droids you're looking for\n")) + statusText := fmt.Sprintf("Service controller last alive %s\n ago"+ + "Routing controller last alive: %s\n ago"+ + "Policy controller last alive: %s\n ago"+ + "Metrics controller last alive: %s\n ago", + time.Since(hc.Status.NetworkServicesControllerAlive), + time.Since(hc.Status.NetworkRoutingControllerAlive), + time.Since(hc.Status.NetworkPolicyControllerAlive), + time.Since(hc.Status.MetricsControllerAlive)) + w.Write([]byte(statusText)) } } func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) { - glog.Infof("Received heartbeat from %s", beat.Component) + glog.V(3).Infof("Received heartbeat from %s", beat.Component) switch component := beat.Component; component { case "NSC": hc.Status.NetworkServicesControllerAlive = time.Now() @@ -64,22 +73,35 @@ func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) { } func (hc *HealthController) CheckHealth() bool { - glog.V(4).Info("Checking components") health := true - if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+3*time.Second { - glog.Error("Network Policy Controller heartbeat timeout") - health = false + if hc.Config.RunFirewall { + if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+3*time.Second { + glog.Error("Network Policy Controller heartbeat missed") + health = false + } } - if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+3*time.Second { - glog.Error("Network Routing Controller heartbeat timeout") - health = false + if hc.Config.RunRouter { + if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+3*time.Second { + glog.Error("Network Routing Controller heartbeat missed") + health = false + } } - if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+3*time.Second { - glog.Error("NetworkService Controller heartbeat timeout") - health = false + if hc.Config.RunServiceProxy { + if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+3*time.Second { + glog.Error("NetworkService Controller heartbeat missed") + health = false + } } + + if hc.Config.MetricsEnabled { + if time.Since(hc.Status.MetricsControllerAlive) > 3*time.Second { + glog.Error("Metrics Controller heartbeat missed") + health = false + } + } + return health } @@ -90,7 +112,6 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh < srv := &http.Server{Addr: ":" + strconv.Itoa(int(hc.HealthPort)), Handler: http.DefaultServeMux} - // add prometheus handler on metrics path http.HandleFunc("/healthz", hc.Handler) go func() { diff --git a/app/controllers/network_policy_controller.go b/app/controllers/network_policy_controller.go index 4b6a95d9..e4984c93 100644 --- a/app/controllers/network_policy_controller.go +++ b/app/controllers/network_policy_controller.go @@ -121,13 +121,13 @@ func (npc *NetworkPolicyController) Run(healthChan chan<- *ControllerHeartbeat, err := npc.Sync() if err != nil { glog.Errorf("Error during periodic sync: " + err.Error()) + } else { + sendHeartBeat(healthChan, "NPC") } } else { continue } - sendHeartBeat(healthChan, "NPC") - select { case <-stopCh: glog.Infof("Shutting down network policies controller") diff --git a/app/controllers/network_services_controller.go b/app/controllers/network_services_controller.go index 26a39a72..8e8dfcd8 100644 --- a/app/controllers/network_services_controller.go +++ b/app/controllers/network_services_controller.go @@ -130,13 +130,16 @@ func (nsc *NetworkServicesController) Run(healthChan chan<- *ControllerHeartbeat if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() { glog.V(1).Info("Performing periodic sync of ipvs services") - nsc.sync() + err := nsc.sync() + if err != nil { + glog.Errorf("Error during periodic ipvs sync: " + err.Error()) + } else { + sendHeartBeat(healthChan, "NSC") + } } else { continue } - sendHeartBeat(healthChan, "NSC") - select { case <-stopCh: glog.Info("Shutting down network services controller") @@ -146,20 +149,28 @@ func (nsc *NetworkServicesController) Run(healthChan chan<- *ControllerHeartbeat } } -func (nsc *NetworkServicesController) sync() { +func (nsc *NetworkServicesController) sync() error { + var err error nsc.mu.Lock() defer nsc.mu.Unlock() nsc.serviceMap = buildServicesInfo() nsc.endpointsMap = buildEndpointsInfo() - err := nsc.syncHairpinIptablesRules() + err = nsc.syncHairpinIptablesRules() if err != nil { glog.Errorf("Error syncing hairpin iptable rules: %s", err.Error()) } - nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap) + + err = nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap) + if err != nil { + glog.Errorf("Error syncing IPVS services: %s", err.Error()) + return err + } + if nsc.MetricsEnabled { nsc.publishMetrics(nsc.serviceMap) } + return nil } func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error { diff --git a/app/server.go b/app/server.go index a6a150dd..8fa53f3f 100644 --- a/app/server.go +++ b/app/server.go @@ -111,7 +111,9 @@ func (kr *KubeRouter) stopApiWatchers() { func (kr *KubeRouter) Run() error { var err error var wg sync.WaitGroup + healthChan := make(chan *controllers.ControllerHeartbeat, 10) + defer close(healthChan) stopCh := make(chan struct{})