mirror of
https://github.com/juanfont/headscale.git
synced 2026-05-05 03:56:10 +02:00
state: preserve previous primary when all HA advertisers unhealthy
electPrimaryRoutes' all-unhealthy fallback picked candidates[0] (lowest NodeID) regardless of who was prev. Under cable-pull semantics IsOnline lags reality (long-poll TCP half-open), so both routers stay in candidates and both go Unhealthy via the prober — the fallback then churned primary to a node that was itself unreachable. Prefer prev when still in candidates; fall through to candidates[0] only when prev is gone. Anti-blackhole holds. Update the property test reference model and split the unit test into existence (KeepsAPrimary) and identity (PreservesPrevious) cases. Fixes #3203
This commit is contained in:
parent
27c9113af8
commit
3d5c0af4e7
@ -620,9 +620,12 @@ func snapshotFromNodes(
|
||||
// electPrimaryRoutes picks the primary advertiser for each non-exit
|
||||
// prefix. The previous primary is preserved when it is still online
|
||||
// and healthy (anti-flap); otherwise the lowest-NodeID healthy
|
||||
// advertiser wins, falling back to the lowest-NodeID candidate when
|
||||
// every advertiser is unhealthy so peers see *some* primary instead
|
||||
// of none.
|
||||
// advertiser wins. When every advertiser is unhealthy the previous
|
||||
// primary is preserved if still a candidate, falling back to the
|
||||
// lowest-NodeID candidate so peers see *some* primary instead of
|
||||
// none. Anti-flap in the all-unhealthy case matters under cable-pull
|
||||
// where IsOnline lags reality and a naive lowest-ID fallback churns
|
||||
// primaries to a node that is itself unreachable (issue #3203).
|
||||
func electPrimaryRoutes(
|
||||
nodes map[types.NodeID]types.Node,
|
||||
prev map[netip.Prefix]types.NodeID,
|
||||
@ -675,7 +678,12 @@ func electPrimaryRoutes(
|
||||
}
|
||||
|
||||
if !found && len(candidates) >= 1 {
|
||||
selected = candidates[0]
|
||||
if cur, ok := prev[prefix]; ok && slices.Contains(candidates, cur) {
|
||||
selected = cur
|
||||
} else {
|
||||
selected = candidates[0]
|
||||
}
|
||||
|
||||
found = true
|
||||
}
|
||||
|
||||
|
||||
@ -98,7 +98,12 @@ func (m *primariesModel) updatePrimaries() {
|
||||
}
|
||||
|
||||
if !found && len(nodes) >= 1 {
|
||||
selected = nodes[0]
|
||||
if cur, ok := m.primary[p]; ok && slices.Contains(nodes, cur) {
|
||||
selected = cur
|
||||
} else {
|
||||
selected = nodes[0]
|
||||
}
|
||||
|
||||
found = true
|
||||
}
|
||||
|
||||
|
||||
@ -200,18 +200,41 @@ func TestPrimaries_RecoveryFromUnhealthyNoFlap(t *testing.T) {
|
||||
f.requirePrimary(mp("192.168.1.0/24"), 2)
|
||||
}
|
||||
|
||||
func TestPrimaries_AllUnhealthyKeepsLowestIDPrimary(t *testing.T) {
|
||||
// When every advertiser is unhealthy the algorithm degrades to
|
||||
// the lowest-ID advertiser rather than going dark — peers should
|
||||
// still see *some* primary so connectivity can recover when one
|
||||
// of them flips healthy.
|
||||
func TestPrimaries_AllUnhealthyKeepsAPrimary(t *testing.T) {
|
||||
// Anti-blackhole: when every advertiser is unhealthy the
|
||||
// algorithm keeps *some* primary so peers can recover once one
|
||||
// flips healthy. The specific node is the prev primary when
|
||||
// reachable (see PreservesPrevious); this test only pins the
|
||||
// existence rule.
|
||||
prefix := mp("192.168.1.0/24")
|
||||
f := newPrimariesFixture(t, 1, 2)
|
||||
f.advertise(1, mp("192.168.1.0/24"))
|
||||
f.advertise(2, mp("192.168.1.0/24"))
|
||||
f.advertise(1, prefix)
|
||||
f.advertise(2, prefix)
|
||||
f.unhealthy(1)
|
||||
f.unhealthy(2)
|
||||
|
||||
f.requirePrimary(mp("192.168.1.0/24"), 1)
|
||||
_, ok := f.ns.PrimaryRouteFor(prefix)
|
||||
require.True(t, ok, "all-unhealthy must still produce some primary")
|
||||
}
|
||||
|
||||
func TestPrimaries_AllUnhealthyPreservesPrevious(t *testing.T) {
|
||||
// Issue #3203: once a failover has moved primary to a higher-ID
|
||||
// node, a subsequent all-unhealthy state must NOT churn primary
|
||||
// back to the lowest-ID candidate. Under cable-pull semantics
|
||||
// both nodes can linger as IsOnline=true (half-open TCP) and
|
||||
// both go Unhealthy — naive `candidates[0]` would flap the
|
||||
// primary to a node that is itself unreachable.
|
||||
prefix := mp("10.0.0.0/24")
|
||||
f := newPrimariesFixture(t, 1, 2)
|
||||
f.advertise(1, prefix)
|
||||
f.advertise(2, prefix)
|
||||
f.requirePrimary(prefix, 1)
|
||||
|
||||
f.unhealthy(1)
|
||||
f.requirePrimary(prefix, 2)
|
||||
|
||||
f.unhealthy(2)
|
||||
f.requirePrimary(prefix, 2)
|
||||
}
|
||||
|
||||
func TestPrimaries_ExitRouteNotElected(t *testing.T) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user