From 3d5c0af4e703482aff8f979ff5bde330a7bf97f5 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 29 Apr 2026 13:34:51 +0000 Subject: [PATCH] state: preserve previous primary when all HA advertisers unhealthy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit electPrimaryRoutes' all-unhealthy fallback picked candidates[0] (lowest NodeID) regardless of who was prev. Under cable-pull semantics IsOnline lags reality (long-poll TCP half-open), so both routers stay in candidates and both go Unhealthy via the prober — the fallback then churned primary to a node that was itself unreachable. Prefer prev when still in candidates; fall through to candidates[0] only when prev is gone. Anti-blackhole holds. Update the property test reference model and split the unit test into existence (KeepsAPrimary) and identity (PreservesPrevious) cases. Fixes #3203 --- hscontrol/state/node_store.go | 16 ++++++--- hscontrol/state/primaries_property_test.go | 7 +++- hscontrol/state/primaries_test.go | 39 +++++++++++++++++----- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/hscontrol/state/node_store.go b/hscontrol/state/node_store.go index 6d78b5fb..a2066f15 100644 --- a/hscontrol/state/node_store.go +++ b/hscontrol/state/node_store.go @@ -620,9 +620,12 @@ func snapshotFromNodes( // electPrimaryRoutes picks the primary advertiser for each non-exit // prefix. The previous primary is preserved when it is still online // and healthy (anti-flap); otherwise the lowest-NodeID healthy -// advertiser wins, falling back to the lowest-NodeID candidate when -// every advertiser is unhealthy so peers see *some* primary instead -// of none. +// advertiser wins. When every advertiser is unhealthy the previous +// primary is preserved if still a candidate, falling back to the +// lowest-NodeID candidate so peers see *some* primary instead of +// none. Anti-flap in the all-unhealthy case matters under cable-pull +// where IsOnline lags reality and a naive lowest-ID fallback churns +// primaries to a node that is itself unreachable (issue #3203). func electPrimaryRoutes( nodes map[types.NodeID]types.Node, prev map[netip.Prefix]types.NodeID, @@ -675,7 +678,12 @@ func electPrimaryRoutes( } if !found && len(candidates) >= 1 { - selected = candidates[0] + if cur, ok := prev[prefix]; ok && slices.Contains(candidates, cur) { + selected = cur + } else { + selected = candidates[0] + } + found = true } diff --git a/hscontrol/state/primaries_property_test.go b/hscontrol/state/primaries_property_test.go index 9f93ff57..2c583f76 100644 --- a/hscontrol/state/primaries_property_test.go +++ b/hscontrol/state/primaries_property_test.go @@ -98,7 +98,12 @@ func (m *primariesModel) updatePrimaries() { } if !found && len(nodes) >= 1 { - selected = nodes[0] + if cur, ok := m.primary[p]; ok && slices.Contains(nodes, cur) { + selected = cur + } else { + selected = nodes[0] + } + found = true } diff --git a/hscontrol/state/primaries_test.go b/hscontrol/state/primaries_test.go index 38f9bb53..ab6116a3 100644 --- a/hscontrol/state/primaries_test.go +++ b/hscontrol/state/primaries_test.go @@ -200,18 +200,41 @@ func TestPrimaries_RecoveryFromUnhealthyNoFlap(t *testing.T) { f.requirePrimary(mp("192.168.1.0/24"), 2) } -func TestPrimaries_AllUnhealthyKeepsLowestIDPrimary(t *testing.T) { - // When every advertiser is unhealthy the algorithm degrades to - // the lowest-ID advertiser rather than going dark — peers should - // still see *some* primary so connectivity can recover when one - // of them flips healthy. +func TestPrimaries_AllUnhealthyKeepsAPrimary(t *testing.T) { + // Anti-blackhole: when every advertiser is unhealthy the + // algorithm keeps *some* primary so peers can recover once one + // flips healthy. The specific node is the prev primary when + // reachable (see PreservesPrevious); this test only pins the + // existence rule. + prefix := mp("192.168.1.0/24") f := newPrimariesFixture(t, 1, 2) - f.advertise(1, mp("192.168.1.0/24")) - f.advertise(2, mp("192.168.1.0/24")) + f.advertise(1, prefix) + f.advertise(2, prefix) f.unhealthy(1) f.unhealthy(2) - f.requirePrimary(mp("192.168.1.0/24"), 1) + _, ok := f.ns.PrimaryRouteFor(prefix) + require.True(t, ok, "all-unhealthy must still produce some primary") +} + +func TestPrimaries_AllUnhealthyPreservesPrevious(t *testing.T) { + // Issue #3203: once a failover has moved primary to a higher-ID + // node, a subsequent all-unhealthy state must NOT churn primary + // back to the lowest-ID candidate. Under cable-pull semantics + // both nodes can linger as IsOnline=true (half-open TCP) and + // both go Unhealthy — naive `candidates[0]` would flap the + // primary to a node that is itself unreachable. + prefix := mp("10.0.0.0/24") + f := newPrimariesFixture(t, 1, 2) + f.advertise(1, prefix) + f.advertise(2, prefix) + f.requirePrimary(prefix, 1) + + f.unhealthy(1) + f.requirePrimary(prefix, 2) + + f.unhealthy(2) + f.requirePrimary(prefix, 2) } func TestPrimaries_ExitRouteNotElected(t *testing.T) {