From 9f7c8e9a07bd6682e78ea7ddec57215375c6d802 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Tue, 28 Apr 2026 16:04:12 +0000 Subject: [PATCH] state: clear Unhealthy when node leaves HA candidate set Restore the legacy auto-clear at write boundaries that drop HA candidacy: Disconnect, SetApprovedRoutes(empty), and UpdateNodeFromMapRequest shrinking advertised routes to empty. Plus a defensive guard in SetNodeUnhealthy. Updates #3203 --- hscontrol/servertest/ha_health_test.go | 97 ++++++++++++++++++++++++++ hscontrol/state/state.go | 17 +++++ 2 files changed, 114 insertions(+) diff --git a/hscontrol/servertest/ha_health_test.go b/hscontrol/servertest/ha_health_test.go index 268fe91b..b09ed7d3 100644 --- a/hscontrol/servertest/ha_health_test.go +++ b/hscontrol/servertest/ha_health_test.go @@ -186,6 +186,103 @@ func TestHAHealthProbe_ConnectClearsUnhealthy(t *testing.T) { "reconnect should clear unhealthy state") } +// TestHAHealthProbe_SetApprovedRoutesEmptyClearsUnhealthy verifies +// that clearing a node's approved routes also clears any stale +// Unhealthy bit, mirroring the legacy routes.SetRoutes(empty) +// auto-clear. Without this, a probe timeout that lands just before +// SetApprovedRoutes would surface as a stale unhealthy node forever. +func TestHAHealthProbe_SetApprovedRoutesEmptyClearsUnhealthy(t *testing.T) { + t.Parallel() + + srv := servertest.NewServer(t) + user := srv.CreateUser(t, "ha-clear-approve") + + route := netip.MustParsePrefix("10.100.0.0/24") + + c1 := servertest.NewClient(t, srv, "ha-ca-r1", servertest.WithUser(user)) + c2 := servertest.NewClient(t, srv, "ha-ca-r2", servertest.WithUser(user)) + + c1.WaitForPeers(t, 1, 10*time.Second) + c2.WaitForPeers(t, 1, 10*time.Second) + + nodeID1 := advertiseAndApproveRoute(t, srv, c1, route) + advertiseAndApproveRoute(t, srv, c2, route) + + srv.State().SetNodeUnhealthy(nodeID1, true) + require.False(t, srv.State().IsNodeHealthy(nodeID1)) + + _, _, err := srv.State().SetApprovedRoutes(nodeID1, nil) + require.NoError(t, err) + + assert.True(t, srv.State().IsNodeHealthy(nodeID1), + "clearing approved routes should drop stale Unhealthy bit") +} + +// TestHAHealthProbe_DisconnectClearsUnhealthy verifies that +// Disconnect resets a stale Unhealthy bit. An offline node is not an +// HA candidate; carrying the bit forward leaks into DebugRoutes. +// +// The poll handler waits a 10s grace period before calling +// state.Disconnect, so the assertion is wrapped in Eventually with a +// generous timeout. +func TestHAHealthProbe_DisconnectClearsUnhealthy(t *testing.T) { + t.Parallel() + + srv := servertest.NewServer(t) + user := srv.CreateUser(t, "ha-clear-disc") + + route := netip.MustParsePrefix("10.101.0.0/24") + + c1 := servertest.NewClient(t, srv, "ha-cd-r1", servertest.WithUser(user)) + c2 := servertest.NewClient(t, srv, "ha-cd-r2", servertest.WithUser(user)) + + c1.WaitForPeers(t, 1, 10*time.Second) + c2.WaitForPeers(t, 1, 10*time.Second) + + nodeID1 := advertiseAndApproveRoute(t, srv, c1, route) + advertiseAndApproveRoute(t, srv, c2, route) + + srv.State().SetNodeUnhealthy(nodeID1, true) + require.False(t, srv.State().IsNodeHealthy(nodeID1)) + + c1.Disconnect(t) + + assert.Eventually(t, func() bool { + return srv.State().IsNodeHealthy(nodeID1) + }, 15*time.Second, 200*time.Millisecond, + "disconnect should drop stale Unhealthy bit") +} + +// TestHAHealthProbe_SetUnhealthyNoRoutesIsNoOp verifies the +// defensive guard for the still-online-but-no-routes case: a probe +// that fires after SetApprovedRoutes(empty) should not be allowed +// to install a stale Unhealthy bit either. +func TestHAHealthProbe_SetUnhealthyNoRoutesIsNoOp(t *testing.T) { + t.Parallel() + + srv := servertest.NewServer(t) + user := srv.CreateUser(t, "ha-guard-noroutes") + + route := netip.MustParsePrefix("10.103.0.0/24") + + c1 := servertest.NewClient(t, srv, "ha-gn-r1", servertest.WithUser(user)) + c2 := servertest.NewClient(t, srv, "ha-gn-r2", servertest.WithUser(user)) + + c1.WaitForPeers(t, 1, 10*time.Second) + c2.WaitForPeers(t, 1, 10*time.Second) + + nodeID1 := advertiseAndApproveRoute(t, srv, c1, route) + advertiseAndApproveRoute(t, srv, c2, route) + + _, _, err := srv.State().SetApprovedRoutes(nodeID1, nil) + require.NoError(t, err) + + srv.State().SetNodeUnhealthy(nodeID1, true) + + assert.True(t, srv.State().IsNodeHealthy(nodeID1), + "SetNodeUnhealthy on node with no approved routes should be a no-op") +} + // TestHAHealthProbe_NoHARoutes verifies that the prober is a no-op // when no HA configuration exists. func TestHAHealthProbe_NoHARoutes(t *testing.T) { diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go index dd88d177..ffcbc1da 100644 --- a/hscontrol/state/state.go +++ b/hscontrol/state/state.go @@ -599,6 +599,9 @@ func (s *State) Disconnect(id types.NodeID, epoch uint64) ([]change.Change, erro now := time.Now() n.LastSeen = &now n.IsOnline = new(false) + // Offline nodes are not HA candidates; drop any stale + // Unhealthy bit so it does not surface in DebugRoutes. + n.Unhealthy = false }) if !ok { @@ -899,6 +902,12 @@ func (s *State) SetApprovedRoutes(nodeID types.NodeID, routes []netip.Prefix) (t n, ok := s.nodeStore.UpdateNode(nodeID, func(node *types.Node) { node.ApprovedRoutes = routes + // A node with no approved routes is no longer an HA + // candidate; drop any stale Unhealthy bit (mirrors the + // legacy routes.SetRoutes(empty) auto-clear). + if len(node.AllApprovedRoutes()) == 0 { + node.Unhealthy = false + } }) if !ok { @@ -2562,6 +2571,14 @@ func (s *State) UpdateNodeFromMapRequest(id types.NodeID, req tailcfg.MapRequest Msg("applying route approval results") } } + + // AllApprovedRoutes is announced ∩ approved; a Hostinfo + // update that shrinks the announced set can drop the node + // out of HA candidacy without touching ApprovedRoutes. + // Clear any stale Unhealthy bit in that case. + if len(currentNode.AllApprovedRoutes()) == 0 { + currentNode.Unhealthy = false + } }) if !ok {