From 94ec607bcabd1b7b92abb30051f2e57df4d809fc Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby
Date: Thu, 30 Apr 2026 08:39:31 +0000
Subject: [PATCH] state: per-goroutine deadline in HA probe cycle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`time.After(ProbeTimeout)` returned a single channel shared by every
probe goroutine in the cycle. Only the first goroutine to receive the
deadline tick drains the channel; any other goroutine still waiting on
its `responseCh` is then stuck forever, `wg.Wait()` never returns, and
the scheduler loop in `app.go` stalls on the next tick.

The condition fires whenever two or more nodes time out in the same
cycle — common under cable-pull, where IsOnline lags reality and both
routers stay in the candidate set with half-open TCP connections.

Move the timer inside each goroutine so every probe has its own
deadline.

Updates #3234
---
 hscontrol/state/ha_health.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hscontrol/state/ha_health.go b/hscontrol/state/ha_health.go
index edea5080..2b3e247f 100644
--- a/hscontrol/state/ha_health.go
+++ b/hscontrol/state/ha_health.go
@@ -71,8 +71,6 @@ func (p *HAHealthProber) ProbeOnce(
 
 	var wg sync.WaitGroup
 
-	deadline := time.After(p.cfg.ProbeTimeout)
-
 	for _, id := range nodeIDs {
 		if !p.isConnected(id) {
 			log.Debug().
@@ -90,6 +88,9 @@
 		}))
 
 		wg.Go(func() {
+			timer := time.NewTimer(p.cfg.ProbeTimeout)
+			defer timer.Stop()
+
 			select {
 			case latency := <-responseCh:
 				log.Debug().
@@ -105,7 +106,7 @@
 					Msg("HA probe: node recovered, recalculating primaries")
 				}
 
-			case <-deadline:
+			case <-timer.C:
 				p.state.CancelPing(pingID)
 
 				if !p.isConnected(id) {