state: add HA health prober

Ping HA subnet routers each probe cycle and mark unresponsive nodes
unhealthy. Reconnecting a node clears its unhealthy state since the
fresh Noise session proves basic connectivity.

Updates #2129
Updates #2902
This commit is contained in:
Kristoffer Dalby 2026-04-15 13:41:09 +00:00
parent 786ce2dce8
commit 90e65ccd63
3 changed files with 203 additions and 0 deletions

View File

@ -0,0 +1,139 @@
package state
import (
"context"
"sync"
"time"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/types/change"
"github.com/juanfont/headscale/hscontrol/util/zlog/zf"
"github.com/rs/zerolog/log"
"tailscale.com/tailcfg"
"tailscale.com/util/set"
)
// HAHealthProber periodically pings HA subnet router nodes and
// triggers failover when a primary stops responding.
type HAHealthProber struct {
	// state provides ping registration/cancellation and the
	// primary-route tracker used to flip node health.
	state *State
	// cfg supplies the probe timeout (and interval) settings.
	cfg types.HARouteConfig
	// serverURL is the base URL prepended to the ping-response
	// callback path handed to probed nodes.
	serverURL string
	// isConnected reports whether a node currently has an active
	// map session; offline nodes are skipped rather than probed.
	isConnected func(types.NodeID) bool
}
// NewHAHealthProber creates a prober that uses the given State for
// ping tracking and primary route management.
// isConnected should return true if a node has an active map session.
func NewHAHealthProber(
	s *State,
	cfg types.HARouteConfig,
	serverURL string,
	isConnected func(types.NodeID) bool,
) *HAHealthProber {
	prober := HAHealthProber{
		state:       s,
		cfg:         cfg,
		serverURL:   serverURL,
		isConnected: isConnected,
	}

	return &prober
}
// ProbeOnce pings all HA subnet router nodes. PingNode changes are
// dispatched immediately via dispatch so nodes can respond before the
// timeout. Health-related policy changes are also dispatched inline.
//
// Each node's result is awaited in its own goroutine; ProbeOnce blocks
// until every outstanding probe has resolved (response, timeout, or
// context cancellation).
func (p *HAHealthProber) ProbeOnce(
	ctx context.Context,
	dispatch func(...change.Change),
) {
	haNodes := p.state.primaryRoutes.HANodes()
	if len(haNodes) == 0 {
		return
	}

	// Deduplicate node IDs across prefixes.
	seen := make(set.Set[types.NodeID])
	var nodeIDs []types.NodeID
	for _, nodes := range haNodes {
		for _, id := range nodes {
			if !seen.Contains(id) {
				seen.Add(id)
				nodeIDs = append(nodeIDs, id)
			}
		}
	}

	log.Debug().
		Int("haNodes", len(nodeIDs)).
		Msg("HA health prober starting probe cycle")

	var wg sync.WaitGroup
	for _, id := range nodeIDs {
		if !p.isConnected(id) {
			log.Debug().
				Uint64(zf.NodeID, id.Uint64()).
				Msg("HA probe: skipping offline node")
			continue
		}

		pingID, responseCh := p.state.RegisterPing(id)
		callbackURL := p.serverURL + "/machine/ping-response?id=" + pingID
		dispatch(change.PingNode(id, &tailcfg.PingRequest{
			URL: callbackURL,
		}))

		// BUG FIX: the previous implementation shared a single
		// time.After channel across all goroutines. time.After fires
		// exactly once, so only the first waiter observed the timeout;
		// every other unresponsive node's goroutine blocked until ctx
		// cancellation and was never marked unhealthy. Each probe now
		// gets its own timer.
		timer := time.NewTimer(p.cfg.ProbeTimeout)

		wg.Go(func() {
			defer timer.Stop()

			select {
			case latency := <-responseCh:
				log.Debug().
					Uint64(zf.NodeID, id.Uint64()).
					Dur("latency", latency).
					Msg("HA probe: node responded")
				// SetNodeHealthy reports whether the primary set
				// changed; only then is a policy change dispatched.
				if p.state.primaryRoutes.SetNodeHealthy(id, true) {
					dispatch(change.PolicyChange())
					log.Info().
						Uint64(zf.NodeID, id.Uint64()).
						Msg("HA probe: node recovered, recalculating primaries")
				}
			case <-timer.C:
				p.state.CancelPing(pingID)
				// The node may have dropped its map session while we
				// waited; Connect will clear unhealthy state when it
				// returns, so don't mark it here.
				if !p.isConnected(id) {
					log.Debug().
						Uint64(zf.NodeID, id.Uint64()).
						Msg("HA probe: node went offline during probe, skipping")
					return
				}
				log.Warn().
					Uint64(zf.NodeID, id.Uint64()).
					Dur("timeout", p.cfg.ProbeTimeout).
					Msg("HA probe: node did not respond")
				if p.state.primaryRoutes.SetNodeHealthy(id, false) {
					dispatch(change.PolicyChange())
					log.Info().
						Uint64(zf.NodeID, id.Uint64()).
						Msg("HA probe: node unhealthy, triggering failover")
				}
			case <-ctx.Done():
				p.state.CancelPing(pingID)
			}
		})
	}
	wg.Wait()
}

View File

@ -574,6 +574,10 @@ func (s *State) Connect(id types.NodeID) ([]change.Change, uint64) {
log.Info().EmbedObject(node).Msg("node connected")
// Reconnecting clears any prior unhealthy state — the node proved
// basic connectivity by establishing the Noise session.
s.primaryRoutes.ClearUnhealthy(id)
// Use the node's current routes for primary route update.
// AllApprovedRoutes() returns only the intersection of announced and approved routes.
routeChange := s.primaryRoutes.SetRoutes(id, node.AllApprovedRoutes()...)
@ -1184,6 +1188,11 @@ func (s *State) RoutesForPeer(
return reduced
}
// PrimaryRoutes returns the primary routes tracker.
//
// NOTE(review): this hands out State's internal tracker as shared
// mutable state (the HA health prober mutates it via SetNodeHealthy);
// presumably PrimaryRoutes is internally synchronized — confirm before
// adding callers outside State.
func (s *State) PrimaryRoutes() *routes.PrimaryRoutes {
	return s.primaryRoutes
}
// PrimaryRoutesString returns a string representation of all primary routes.
func (s *State) PrimaryRoutesString() string {
return s.primaryRoutes.String()

View File

@ -60,6 +60,22 @@ type EphemeralConfig struct {
InactivityTimeout time.Duration
}
// HARouteConfig contains configuration for HA subnet router health probing.
type HARouteConfig struct {
	// ProbeInterval is how often HA subnet routers are probed.
	// A zero or negative duration disables probing.
	// When positive, config validation requires it to be at least 2s.
	ProbeInterval time.Duration
	// ProbeTimeout is the maximum time to wait for a probe response
	// before declaring a node unhealthy. Must be less than ProbeInterval.
	// Config validation requires it to be at least 1s.
	ProbeTimeout time.Duration
}
// RouteConfig contains configuration for route behaviour.
type RouteConfig struct {
	// HA holds the health-probing settings for HA subnet routers,
	// loaded from the node.routes.ha.* config keys.
	HA HARouteConfig
}
// NodeConfig contains configuration for node lifecycle and expiry.
type NodeConfig struct {
// Expiry is the default key expiry duration for non-tagged nodes.
@ -70,6 +86,9 @@ type NodeConfig struct {
// Ephemeral contains configuration for ephemeral node lifecycle.
Ephemeral EphemeralConfig
// Routes contains configuration for route behaviour.
Routes RouteConfig
}
// Config contains the initial Headscale configuration.
@ -414,6 +433,8 @@ func LoadConfig(path string, isFile bool) error {
viper.SetDefault("node.expiry", "0")
viper.SetDefault("node.ephemeral.inactivity_timeout", "120s")
viper.SetDefault("node.routes.ha.probe_interval", "10s")
viper.SetDefault("node.routes.ha.probe_timeout", "5s")
viper.SetDefault("tuning.notifier_send_timeout", "800ms")
viper.SetDefault("tuning.batch_change_delay", "800ms")
@ -576,6 +597,34 @@ func validateServerConfig() error {
}
}
// Validate HA health probing parameters
if haInterval := viper.GetDuration(
"node.routes.ha.probe_interval",
); haInterval > 0 {
if haInterval < 2*time.Second {
errorText += fmt.Sprintf(
"Fatal config error: node.routes.ha.probe_interval (%s) must be >= 2s\n",
haInterval,
)
}
haTimeout := viper.GetDuration("node.routes.ha.probe_timeout")
if haTimeout < 1*time.Second {
errorText += fmt.Sprintf(
"Fatal config error: node.routes.ha.probe_timeout (%s) must be >= 1s\n",
haTimeout,
)
}
if haTimeout >= haInterval {
errorText += fmt.Sprintf(
"Fatal config error: node.routes.ha.probe_timeout (%s) must be less than node.routes.ha.probe_interval (%s)\n",
haTimeout,
haInterval,
)
}
}
// Validate tuning parameters
if size := viper.GetInt("tuning.node_store_batch_size"); size <= 0 {
errorText += fmt.Sprintf(
@ -1129,6 +1178,12 @@ func LoadServerConfig() (*Config, error) {
Ephemeral: EphemeralConfig{
InactivityTimeout: resolveEphemeralInactivityTimeout(),
},
Routes: RouteConfig{
HA: HARouteConfig{
ProbeInterval: viper.GetDuration("node.routes.ha.probe_interval"),
ProbeTimeout: viper.GetDuration("node.routes.ha.probe_timeout"),
},
},
},
Database: databaseConfig(),