more work on healthchecks

This commit is contained in:
Joakim Karlsson 2018-02-04 21:25:49 +01:00
parent 77cb340cfd
commit e53aef280c
4 changed files with 56 additions and 22 deletions

View File

@ -1,6 +1,7 @@
package controllers
import (
"fmt"
"net/http"
"strconv"
"sync"
@ -42,15 +43,23 @@ func sendHeartBeat(channel chan<- *ControllerHeartbeat, controller string) {
// Handler serves the health check endpoint.
//
// While hc.Status.Healthy is true it answers 200 with the body "OK\n".
// Otherwise it answers 500 with one line per controller stating how long ago
// that controller's last heartbeat timestamp was recorded.
func (hc *HealthController) Handler(w http.ResponseWriter, req *http.Request) {
	if hc.Status.Healthy {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("OK\n"))
		return
	}

	w.WriteHeader(http.StatusInternalServerError)
	// Each entry ends with " ago\n" so that every controller is reported on
	// its own line; placing the newline before "ago" would glue the word onto
	// the start of the following entry.
	statusText := fmt.Sprintf("Service controller last alive: %s ago\n"+
		"Routing controller last alive: %s ago\n"+
		"Policy controller last alive: %s ago\n"+
		"Metrics controller last alive: %s ago\n",
		time.Since(hc.Status.NetworkServicesControllerAlive),
		time.Since(hc.Status.NetworkRoutingControllerAlive),
		time.Since(hc.Status.NetworkPolicyControllerAlive),
		time.Since(hc.Status.MetricsControllerAlive))
	w.Write([]byte(statusText))
}
func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
glog.Infof("Received heartbeat from %s", beat.Component)
glog.V(3).Infof("Received heartbeat from %s", beat.Component)
switch component := beat.Component; component {
case "NSC":
hc.Status.NetworkServicesControllerAlive = time.Now()
@ -64,22 +73,35 @@ func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
}
func (hc *HealthController) CheckHealth() bool {
glog.V(4).Info("Checking components")
health := true
if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+3*time.Second {
glog.Error("Network Policy Controller heartbeat timeout")
health = false
if hc.Config.RunFirewall {
if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+3*time.Second {
glog.Error("Network Policy Controller heartbeat missed")
health = false
}
}
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+3*time.Second {
glog.Error("Network Routing Controller heartbeat timeout")
health = false
if hc.Config.RunRouter {
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+3*time.Second {
glog.Error("Network Routing Controller heartbeat missed")
health = false
}
}
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+3*time.Second {
glog.Error("NetworkService Controller heartbeat timeout")
health = false
if hc.Config.RunServiceProxy {
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+3*time.Second {
glog.Error("NetworkService Controller heartbeat missed")
health = false
}
}
if hc.Config.MetricsEnabled {
if time.Since(hc.Status.MetricsControllerAlive) > 3*time.Second {
glog.Error("Metrics Controller heartbeat missed")
health = false
}
}
return health
}
@ -90,7 +112,6 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
srv := &http.Server{Addr: ":" + strconv.Itoa(int(hc.HealthPort)), Handler: http.DefaultServeMux}
// add prometheus handler on metrics path
http.HandleFunc("/healthz", hc.Handler)
go func() {

View File

@ -121,13 +121,13 @@ func (npc *NetworkPolicyController) Run(healthChan chan<- *ControllerHeartbeat,
err := npc.Sync()
if err != nil {
glog.Errorf("Error during periodic sync: " + err.Error())
} else {
sendHeartBeat(healthChan, "NPC")
}
} else {
continue
}
sendHeartBeat(healthChan, "NPC")
select {
case <-stopCh:
glog.Infof("Shutting down network policies controller")

View File

@ -130,13 +130,16 @@ func (nsc *NetworkServicesController) Run(healthChan chan<- *ControllerHeartbeat
if watchers.PodWatcher.HasSynced() && watchers.NetworkPolicyWatcher.HasSynced() {
glog.V(1).Info("Performing periodic sync of ipvs services")
nsc.sync()
err := nsc.sync()
if err != nil {
glog.Errorf("Error during periodic ipvs sync: " + err.Error())
} else {
sendHeartBeat(healthChan, "NSC")
}
} else {
continue
}
sendHeartBeat(healthChan, "NSC")
select {
case <-stopCh:
glog.Info("Shutting down network services controller")
@ -146,20 +149,28 @@ func (nsc *NetworkServicesController) Run(healthChan chan<- *ControllerHeartbeat
}
}
func (nsc *NetworkServicesController) sync() {
func (nsc *NetworkServicesController) sync() error {
var err error
nsc.mu.Lock()
defer nsc.mu.Unlock()
nsc.serviceMap = buildServicesInfo()
nsc.endpointsMap = buildEndpointsInfo()
err := nsc.syncHairpinIptablesRules()
err = nsc.syncHairpinIptablesRules()
if err != nil {
glog.Errorf("Error syncing hairpin iptable rules: %s", err.Error())
}
nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
err = nsc.syncIpvsServices(nsc.serviceMap, nsc.endpointsMap)
if err != nil {
glog.Errorf("Error syncing IPVS services: %s", err.Error())
return err
}
if nsc.MetricsEnabled {
nsc.publishMetrics(nsc.serviceMap)
}
return nil
}
func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoMap) error {

View File

@ -111,7 +111,9 @@ func (kr *KubeRouter) stopApiWatchers() {
func (kr *KubeRouter) Run() error {
var err error
var wg sync.WaitGroup
healthChan := make(chan *controllers.ControllerHeartbeat, 10)
defer close(healthChan)
stopCh := make(chan struct{})