state: clear Unhealthy when node leaves HA candidate set

Restore the legacy auto-clear at write boundaries that drop HA
candidacy: Disconnect, SetApprovedRoutes(empty), and
UpdateNodeFromMapRequest shrinking advertised routes to empty.
Plus a defensive guard in SetNodeUnhealthy.

Updates #3203
This commit is contained in:
Kristoffer Dalby 2026-04-28 16:04:12 +00:00
parent 66ac785c22
commit 9f7c8e9a07
2 changed files with 114 additions and 0 deletions

View File

@ -186,6 +186,103 @@ func TestHAHealthProbe_ConnectClearsUnhealthy(t *testing.T) {
"reconnect should clear unhealthy state")
}
// TestHAHealthProbe_SetApprovedRoutesEmptyClearsUnhealthy verifies
// that clearing a node's approved routes also clears any stale
// Unhealthy bit, mirroring the legacy routes.SetRoutes(empty)
// auto-clear. Without this, a probe timeout that lands just before
// SetApprovedRoutes would surface as a stale unhealthy node forever.
func TestHAHealthProbe_SetApprovedRoutesEmptyClearsUnhealthy(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-clear-approve")
route := netip.MustParsePrefix("10.100.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-ca-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-ca-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
advertiseAndApproveRoute(t, srv, c2, route)
srv.State().SetNodeUnhealthy(nodeID1, true)
require.False(t, srv.State().IsNodeHealthy(nodeID1))
_, _, err := srv.State().SetApprovedRoutes(nodeID1, nil)
require.NoError(t, err)
assert.True(t, srv.State().IsNodeHealthy(nodeID1),
"clearing approved routes should drop stale Unhealthy bit")
}
// TestHAHealthProbe_DisconnectClearsUnhealthy verifies that
// Disconnect resets a stale Unhealthy bit. An offline node is not an
// HA candidate; carrying the bit forward leaks into DebugRoutes.
//
// The poll handler waits a 10s grace period before calling
// state.Disconnect, so the assertion is wrapped in Eventually with a
// generous timeout.
func TestHAHealthProbe_DisconnectClearsUnhealthy(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-clear-disc")
route := netip.MustParsePrefix("10.101.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-cd-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-cd-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
advertiseAndApproveRoute(t, srv, c2, route)
srv.State().SetNodeUnhealthy(nodeID1, true)
require.False(t, srv.State().IsNodeHealthy(nodeID1))
c1.Disconnect(t)
assert.Eventually(t, func() bool {
return srv.State().IsNodeHealthy(nodeID1)
}, 15*time.Second, 200*time.Millisecond,
"disconnect should drop stale Unhealthy bit")
}
// TestHAHealthProbe_SetUnhealthyNoRoutesIsNoOp verifies the
// defensive guard for the still-online-but-no-routes case: a probe
// that fires after SetApprovedRoutes(empty) should not be allowed
// to install a stale Unhealthy bit either.
func TestHAHealthProbe_SetUnhealthyNoRoutesIsNoOp(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-guard-noroutes")
route := netip.MustParsePrefix("10.103.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-gn-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-gn-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
advertiseAndApproveRoute(t, srv, c2, route)
_, _, err := srv.State().SetApprovedRoutes(nodeID1, nil)
require.NoError(t, err)
srv.State().SetNodeUnhealthy(nodeID1, true)
assert.True(t, srv.State().IsNodeHealthy(nodeID1),
"SetNodeUnhealthy on node with no approved routes should be a no-op")
}
// TestHAHealthProbe_NoHARoutes verifies that the prober is a no-op
// when no HA configuration exists.
func TestHAHealthProbe_NoHARoutes(t *testing.T) {

View File

@ -599,6 +599,9 @@ func (s *State) Disconnect(id types.NodeID, epoch uint64) ([]change.Change, erro
now := time.Now()
n.LastSeen = &now
n.IsOnline = new(false)
// Offline nodes are not HA candidates; drop any stale
// Unhealthy bit so it does not surface in DebugRoutes.
n.Unhealthy = false
})
if !ok {
@ -899,6 +902,12 @@ func (s *State) SetApprovedRoutes(nodeID types.NodeID, routes []netip.Prefix) (t
n, ok := s.nodeStore.UpdateNode(nodeID, func(node *types.Node) {
node.ApprovedRoutes = routes
// A node with no approved routes is no longer an HA
// candidate; drop any stale Unhealthy bit (mirrors the
// legacy routes.SetRoutes(empty) auto-clear).
if len(node.AllApprovedRoutes()) == 0 {
node.Unhealthy = false
}
})
if !ok {
@ -2562,6 +2571,14 @@ func (s *State) UpdateNodeFromMapRequest(id types.NodeID, req tailcfg.MapRequest
Msg("applying route approval results")
}
}
// AllApprovedRoutes is announced ∩ approved; a Hostinfo
// update that shrinks the announced set can drop the node
// out of HA candidacy without touching ApprovedRoutes.
// Clear any stale Unhealthy bit in that case.
if len(currentNode.AllApprovedRoutes()) == 0 {
currentNode.Unhealthy = false
}
})
if !ok {