state: preserve previous primary when all HA advertisers unhealthy

electPrimaryRoutes' all-unhealthy fallback picked candidates[0]
(lowest NodeID) regardless of who was prev. Under cable-pull
semantics IsOnline lags reality (long-poll TCP half-open), so
both routers stay in candidates and both go Unhealthy via the
prober — the fallback then churned primary to a node that was
itself unreachable.

Prefer prev when still in candidates; fall through to
candidates[0] only when prev is gone. Anti-blackhole holds.

Update the property test reference model and split the unit
test into existence (KeepsAPrimary) and identity
(PreservesPrevious) cases.

Fixes #3203
This commit is contained in:
Kristoffer Dalby 2026-04-29 13:34:51 +00:00
parent 27c9113af8
commit 3d5c0af4e7
3 changed files with 49 additions and 13 deletions

View File

@ -620,9 +620,12 @@ func snapshotFromNodes(
// electPrimaryRoutes picks the primary advertiser for each non-exit
// prefix. The previous primary is preserved when it is still online
// and healthy (anti-flap); otherwise the lowest-NodeID healthy
// advertiser wins, falling back to the lowest-NodeID candidate when
// every advertiser is unhealthy so peers see *some* primary instead
// of none.
// advertiser wins. When every advertiser is unhealthy the previous
// primary is preserved if still a candidate, falling back to the
// lowest-NodeID candidate so peers see *some* primary instead of
// none. Anti-flap in the all-unhealthy case matters under cable-pull
// where IsOnline lags reality and a naive lowest-ID fallback churns
// primaries to a node that is itself unreachable (issue #3203).
func electPrimaryRoutes(
nodes map[types.NodeID]types.Node,
prev map[netip.Prefix]types.NodeID,
@ -675,7 +678,12 @@ func electPrimaryRoutes(
}
if !found && len(candidates) >= 1 {
selected = candidates[0]
if cur, ok := prev[prefix]; ok && slices.Contains(candidates, cur) {
selected = cur
} else {
selected = candidates[0]
}
found = true
}

View File

@ -98,7 +98,12 @@ func (m *primariesModel) updatePrimaries() {
}
if !found && len(nodes) >= 1 {
selected = nodes[0]
if cur, ok := m.primary[p]; ok && slices.Contains(nodes, cur) {
selected = cur
} else {
selected = nodes[0]
}
found = true
}

View File

@ -200,18 +200,41 @@ func TestPrimaries_RecoveryFromUnhealthyNoFlap(t *testing.T) {
f.requirePrimary(mp("192.168.1.0/24"), 2)
}
func TestPrimaries_AllUnhealthyKeepsLowestIDPrimary(t *testing.T) {
// When every advertiser is unhealthy the algorithm degrades to
// the lowest-ID advertiser rather than going dark — peers should
// still see *some* primary so connectivity can recover when one
// of them flips healthy.
func TestPrimaries_AllUnhealthyKeepsAPrimary(t *testing.T) {
// Anti-blackhole: when every advertiser is unhealthy the
// algorithm keeps *some* primary so peers can recover once one
// flips healthy. The specific node is the prev primary when
// reachable (see PreservesPrevious); this test only pins the
// existence rule.
prefix := mp("192.168.1.0/24")
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.advertise(1, prefix)
f.advertise(2, prefix)
f.unhealthy(1)
f.unhealthy(2)
f.requirePrimary(mp("192.168.1.0/24"), 1)
_, ok := f.ns.PrimaryRouteFor(prefix)
require.True(t, ok, "all-unhealthy must still produce some primary")
}
func TestPrimaries_AllUnhealthyPreservesPrevious(t *testing.T) {
// Issue #3203: once a failover has moved primary to a higher-ID
// node, a subsequent all-unhealthy state must NOT churn primary
// back to the lowest-ID candidate. Under cable-pull semantics
// both nodes can linger as IsOnline=true (half-open TCP) and
// both go Unhealthy — naive `candidates[0]` would flap the
// primary to a node that is itself unreachable.
prefix := mp("10.0.0.0/24")
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, prefix)
f.advertise(2, prefix)
f.requirePrimary(prefix, 1)
f.unhealthy(1)
f.requirePrimary(prefix, 2)
f.unhealthy(2)
f.requirePrimary(prefix, 2)
}
func TestPrimaries_ExitRouteNotElected(t *testing.T) {