From 94ec607bcabd1b7b92abb30051f2e57df4d809fc Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby
Date: Thu, 30 Apr 2026 08:39:31 +0000
Subject: [PATCH] state: per-goroutine deadline in HA probe cycle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`time.After(ProbeTimeout)` returned a single channel shared by every
probe goroutine in the cycle. Only the first goroutine to receive the
deadline tick drains the channel; any other goroutine still waiting on
its `responseCh` is then stuck forever, `wg.Wait()` never returns, and
the scheduler loop in `app.go` stalls on the next tick.

The condition fires whenever two or more nodes time out in the same
cycle — common under cable-pull, where IsOnline lags reality and both
routers stay in the candidate set with half-open TCP connections.

Move the timer inside each goroutine so every probe has its own
deadline.

Updates #3234
---
 hscontrol/state/ha_health.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hscontrol/state/ha_health.go b/hscontrol/state/ha_health.go
index edea5080..2b3e247f 100644
--- a/hscontrol/state/ha_health.go
+++ b/hscontrol/state/ha_health.go
@@ -71,8 +71,6 @@ func (p *HAHealthProber) ProbeOnce(
 
 	var wg sync.WaitGroup
 
-	deadline := time.After(p.cfg.ProbeTimeout)
-
 	for _, id := range nodeIDs {
 		if !p.isConnected(id) {
 			log.Debug().
@@ -90,6 +88,9 @@
 		}))
 
 		wg.Go(func() {
+			timer := time.NewTimer(p.cfg.ProbeTimeout)
+			defer timer.Stop()
+
 			select {
 			case latency := <-responseCh:
 				log.Debug().
@@ -105,7 +106,7 @@
 					Msg("HA probe: node recovered, recalculating primaries")
 				}
 
-			case <-deadline:
+			case <-timer.C:
 				p.state.CancelPing(pingID)
 
 				if !p.isConnected(id) {