servertest, integration: cover HA both-offline recovery

Three regression tests for the user scenario: an in-process
Disconnect/Reconnect, a tailscale-down/up integration test, and
an iptables -j DROP cable-pull integration test.

Updates #3203
This commit is contained in:
Kristoffer Dalby 2026-04-28 12:16:46 +00:00
parent 9f7c8e9a07
commit 863fa2f815
3 changed files with 558 additions and 0 deletions

View File

@ -251,6 +251,8 @@ jobs:
- TestSubnetRouteACLFiltering
- TestGrantViaSubnetSteering
- TestHASubnetRouterPingFailover
- TestHASubnetRouterFailoverBothOffline
- TestHASubnetRouterFailoverBothOfflineCablePull
- TestHeadscale
- TestTailscaleNodesJoiningHeadcale
- TestSSHOneUserToAll

View File

@ -3,10 +3,12 @@ package servertest_test
import (
"context"
"net/netip"
"slices"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
@ -211,6 +213,134 @@ func TestRoutes(t *testing.T) {
}
}
})
// Reproduces https://github.com/juanfont/headscale/issues/3203:
// HA tracking loses the secondary subnet router after all routers serving
// the route have been offline simultaneously and one of them returns.
//
// Two assertions split the failure surface:
// R1 — server-side primary route state restores after reconnect.
// R2 — observer's netmap shows the reconnected router online with
// the route in its primary set.
// If R1 fails the bug is in state.Connect / primaryRoutes; if R1 passes
// and R2 fails the bug is in change broadcast / mapBatcher.
//
// Caveat: servertest's Reconnect re-registers via TryLogin in addition
// to starting a new poll session. Production reconnects after a brief
// network outage may bypass re-registration. If this test passes on
// main, fall back to the integration variant noted in the plan
// (TestHASubnetRouterFailover with all routers offline simultaneously).
t.Run("ha_secondary_recovers_after_all_offline", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha3203-user")
route := netip.MustParsePrefix("10.0.0.0/24")
r1 := servertest.NewClient(t, srv, "ha3203-router1",
servertest.WithUser(user))
r2 := servertest.NewClient(t, srv, "ha3203-router2",
servertest.WithUser(user))
obs := servertest.NewClient(t, srv, "ha3203-observer",
servertest.WithUser(user))
obs.WaitForPeers(t, 2, 10*time.Second)
// Both routers advertise the same route via their hostinfo.
advertise := func(c *servertest.TestClient, name string) {
t.Helper()
c.Direct().SetHostinfo(&tailcfg.Hostinfo{
BackendLogID: "servertest-" + name,
Hostname: name,
RoutableIPs: []netip.Prefix{route},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = c.Direct().SendUpdate(ctx)
}
advertise(r1, "ha3203-router1")
advertise(r2, "ha3203-router2")
// Approve the route on both routers explicitly. Auto-approvers
// would also work but introduce a policy dependency the harness
// does not currently set up here.
approve := func(name string) {
t.Helper()
id := findNodeID(t, srv, name)
_, ch, err := srv.State().SetApprovedRoutes(id, []netip.Prefix{route})
require.NoError(t, err)
srv.App.Change(ch)
}
approve("ha3203-router1")
approve("ha3203-router2")
// Sanity: r1 starts as primary (lower NodeID by registration order).
r1ID := findNodeID(t, srv, "ha3203-router1")
r2ID := findNodeID(t, srv, "ha3203-router2")
hasRoute := func(id types.NodeID) bool {
return slices.Contains(srv.State().GetNodePrimaryRoutes(id), route)
}
assert.Eventually(t, func() bool { return hasRoute(r1ID) },
10*time.Second, 100*time.Millisecond,
"r1 should be primary initially")
// 1. Take r1 offline. After the 10s grace period, r2 should take over.
r1.Disconnect(t)
assert.Eventually(t, func() bool { return hasRoute(r2ID) && !hasRoute(r1ID) },
20*time.Second, 200*time.Millisecond,
"r2 should take over as primary after r1 offline")
// 2. Take r2 offline. With both routers gone, no primary should remain.
r2.Disconnect(t)
assert.Eventually(t, func() bool { return !hasRoute(r1ID) && !hasRoute(r2ID) },
20*time.Second, 200*time.Millisecond,
"no primary should be assigned while both routers are offline")
// 3. Reconnect r2 (cable plugged back in).
r2.Reconnect(t)
// Hostinfo is part of the controlclient.Direct state; the Reconnect
// helper re-registers via TryLogin which carries the same Hostinfo
// that was set above. Push it again to be sure the announced route
// is registered in the new session.
advertise(r2, "ha3203-router2")
// R1: server-side state must restore r2 as primary.
assert.Eventually(t, func() bool { return hasRoute(r2ID) },
15*time.Second, 200*time.Millisecond,
"R1: r2 should be re-registered as primary after reconnect — issue #3203")
// R2: observer must see r2 online with the route in its primary set.
obs.WaitForCondition(t, "R2: observer sees r2 online with primary route",
15*time.Second,
func(nm *netmap.NetworkMap) bool {
for _, p := range nm.Peers {
hi := p.Hostinfo()
if !hi.Valid() || hi.Hostname() != "ha3203-router2" {
continue
}
online, known := p.Online().GetOk()
if !known || !online {
return false
}
for i := range p.PrimaryRoutes().Len() {
if p.PrimaryRoutes().At(i) == route {
return true
}
}
}
return false
})
})
}
// findNodeID is defined in issues_test.go.

View File

@ -3871,3 +3871,429 @@ func TestHASubnetRouterPingFailover(t *testing.T) {
assertTracerouteViaIPWithCollect(c, tr, ip)
}, propagationTime, 200*time.Millisecond, "traceroute should still go through router 2 after recovery")
}
// TestHASubnetRouterFailoverBothOffline reproduces issue #3203:
// HA tracking loses the secondary subnet router after all routers serving
// the route have been offline simultaneously and one of them returns.
// See https://github.com/juanfont/headscale/issues/3203.
//
// Existing TestHASubnetRouterFailover keeps subRouter3 online across both
// failover steps, so the all-offline transition is uncovered. This test
// uses two routers and walks them both offline before bringing r2 back.
//
// Two assertion sets split the failure surface:
// - R1: server-side primary route table restores after reconnect.
// If R1 fails, the bug is in state.Connect / primaryRoutes.
// - R2: client's view shows r2 online with the route in PrimaryRoutes.
// If R1 passes and R2 fails, the bug is in change broadcast /
// mapBatcher / multiChannelNodeConn.
func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(60 * time.Second)
spec := ScenarioSpec{
NodesPerUser: 2,
Users: []string{"user1", "user2"},
Networks: map[string]NetworkSpec{
"usernet1": {Users: []string{"user1"}},
"usernet2": {Users: []string{"user2"}},
},
ExtraService: map[string][]extraServiceFunc{
"usernet1": {Webservice},
},
Versions: []string{"head"},
}
scenario, err := NewScenario(spec)
require.NoErrorf(t, err, "failed to create scenario: %s", err)
err = scenario.CreateHeadscaleEnv(
[]tsic.Option{tsic.WithAcceptRoutes()},
hsic.WithTestName("rt-haboth"),
)
requireNoErrHeadscaleEnv(t, err)
allClients, err := scenario.ListTailscaleClients()
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
headscale, err := scenario.Headscale()
requireNoErrGetHeadscale(t, err)
prefp, err := scenario.SubnetOfNetwork("usernet1")
require.NoError(t, err)
pref := *prefp
t.Logf("usernet1 prefix: %s", pref.String())
usernet1, err := scenario.Network("usernet1")
require.NoError(t, err)
services, err := scenario.Services("usernet1")
require.NoError(t, err)
require.Len(t, services, 1)
web := services[0]
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
sort.SliceStable(allClients, func(i, j int) bool {
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
})
subRouter1 := allClients[0]
subRouter2 := allClients[1]
client := allClients[2]
t.Logf("Router 1: %s, Router 2: %s, Client: %s",
subRouter1.Hostname(), subRouter2.Hostname(), client.Hostname())
// Advertise the same route on both routers.
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
_, _, err = r.Execute([]string{
"tailscale", "set", "--advertise-routes=" + pref.String(),
})
require.NoErrorf(t, err, "failed to advertise route on %s", r.Hostname())
}
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
var nodes []*v1.Node
assert.EventuallyWithT(t, func(c *assert.CollectT) {
nodes, err = headscale.ListNodes()
assert.NoError(c, err)
assert.Len(c, nodes, 4)
}, propagationTime, 200*time.Millisecond, "nodes should be registered")
// Approve the route on both routers explicitly.
_, err = headscale.ApproveRoutes(
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
[]netip.Prefix{pref},
)
require.NoError(t, err)
_, err = headscale.ApproveRoutes(
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
[]netip.Prefix{pref},
)
require.NoError(t, err)
nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
// Sanity: r1 starts as primary (lower NodeID).
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Equal(c, map[string]types.NodeID{
pref.String(): nodeID1,
}, pr.PrimaryRoutes, "router 1 should be primary initially")
}, propagationTime, 200*time.Millisecond, "waiting for HA setup")
// Confirm initial connectivity through r1.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r1")
t.Log("=== Step 1: r1 goes offline. r2 should take over. ===")
require.NoError(t, subRouter1.Down())
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Equal(c, map[string]types.NodeID{
pref.String(): nodeID2,
}, pr.PrimaryRoutes, "r2 should be primary after r1 offline")
}, propagationTime, 500*time.Millisecond, "waiting for failover to r2")
t.Log("=== Step 2: r2 also goes offline. No primary should remain. ===")
require.NoError(t, subRouter2.Down())
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Empty(c, pr.PrimaryRoutes,
"no primary should be assigned while both routers are offline")
}, propagationTime, 500*time.Millisecond, "waiting for both routers to be offline")
t.Log("=== Step 3: r2 returns. ===")
t.Log(" R1: server-side primary route state must restore r2 as primary.")
t.Log(" R2: client must observe r2 online with the route in PrimaryRoutes.")
require.NoError(t, subRouter2.Up())
// R1 — server side.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Equal(c, map[string]types.NodeID{
pref.String(): nodeID2,
}, pr.PrimaryRoutes,
"R1: r2 should be re-registered as primary after reconnect — issue #3203")
}, propagationTime, 500*time.Millisecond, "R1: waiting for server-side primary restore")
// R2 — client view.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
clientStatus, err := client.Status()
assert.NoError(c, err)
srs2 := subRouter2.MustStatus()
peer := clientStatus.Peer[srs2.Self.PublicKey]
if !assert.NotNil(c, peer, "r2 peer should be in client status") {
return
}
assert.True(c, peer.Online,
"R2: client should see r2 online after reconnect — issue #3203")
if assert.NotNil(c, peer.PrimaryRoutes,
"R2: r2 should have PrimaryRoutes set in client status") {
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref,
"R2: client's view of r2 should include the route as primary")
}
requirePeerSubnetRoutesWithCollect(c, peer, []netip.Prefix{pref})
}, propagationTime, 500*time.Millisecond, "R2: waiting for client to see r2 with primary route")
// End-to-end traffic should reach the webservice via r2.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
tr, err := client.Traceroute(webip)
assert.NoError(c, err)
ip, err := subRouter2.IPv4()
if !assert.NoError(c, err, "failed to get IPv4 for r2") {
return
}
assertTracerouteViaIPWithCollect(c, tr, ip)
}, propagationTime, 200*time.Millisecond, "traceroute should go through r2 after recovery")
}
// TestHASubnetRouterFailoverBothOfflineCablePull is a stricter variant of
// TestHASubnetRouterFailoverBothOffline that simulates a cable pull rather
// than a graceful tailscale down. The two differ in what the server sees:
//
// - tailscale down: poll connection closes cleanly; defer fires
// immediately; grace period starts and ends predictably.
// - cable pull: server's noise long-poll is wedged in a half-open TCP
// connection until kernel keepalives time out (often >60 s). When
// the cable returns, two server-side longpoll sessions can overlap.
//
// Issue #3203 reports the bug after cable pulls; this variant blocks all
// traffic between the router container and headscale via iptables and then
// removes the block to mimic that behaviour.
func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
spec := ScenarioSpec{
NodesPerUser: 2,
Users: []string{"user1", "user2"},
Networks: map[string]NetworkSpec{
"usernet1": {Users: []string{"user1"}},
"usernet2": {Users: []string{"user2"}},
},
ExtraService: map[string][]extraServiceFunc{
"usernet1": {Webservice},
},
Versions: []string{"head"},
}
scenario, err := NewScenario(spec)
require.NoErrorf(t, err, "failed to create scenario: %s", err)
err = scenario.CreateHeadscaleEnv(
[]tsic.Option{
tsic.WithAcceptRoutes(),
tsic.WithPackages("iptables"),
},
hsic.WithTestName("rt-hacable"),
)
requireNoErrHeadscaleEnv(t, err)
allClients, err := scenario.ListTailscaleClients()
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
headscale, err := scenario.Headscale()
requireNoErrGetHeadscale(t, err)
prefp, err := scenario.SubnetOfNetwork("usernet1")
require.NoError(t, err)
pref := *prefp
usernet1, err := scenario.Network("usernet1")
require.NoError(t, err)
services, err := scenario.Services("usernet1")
require.NoError(t, err)
require.Len(t, services, 1)
web := services[0]
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
sort.SliceStable(allClients, func(i, j int) bool {
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
})
subRouter1 := allClients[0]
subRouter2 := allClients[1]
client := allClients[2]
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
_, _, err = r.Execute([]string{
"tailscale", "set", "--advertise-routes=" + pref.String(),
})
require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
}
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
var nodes []*v1.Node
assert.EventuallyWithT(t, func(c *assert.CollectT) {
nodes, err = headscale.ListNodes()
assert.NoError(c, err)
assert.Len(c, nodes, 4)
}, propagationTime, 200*time.Millisecond, "nodes registered")
_, err = headscale.ApproveRoutes(
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
[]netip.Prefix{pref},
)
require.NoError(t, err)
_, err = headscale.ApproveRoutes(
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
[]netip.Prefix{pref},
)
require.NoError(t, err)
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
// Sanity: r1 starts as primary.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.NotEmpty(c, pr.PrimaryRoutes, "a primary should exist")
}, propagationTime, 200*time.Millisecond, "HA setup")
hsIP := headscale.GetIPInNetwork(usernet1)
// "Cable pull" — drop all traffic in BOTH directions to/from headscale.
// Unlike the NEW-state-only filter used by TestHASubnetRouterPingFailover,
// this also breaks the existing ESTABLISHED long-poll, mimicking a
// physically severed link.
cablePull := func(r TailscaleClient) {
t.Helper()
for _, chain := range []string{"OUTPUT", "INPUT"} {
_, _, err := r.Execute([]string{
"iptables", "-A", chain,
"-d", hsIP, "-j", "DROP",
})
require.NoErrorf(t, err, "iptables -A %s on %s", chain, r.Hostname())
_, _, err = r.Execute([]string{
"iptables", "-A", chain,
"-s", hsIP, "-j", "DROP",
})
require.NoErrorf(t, err, "iptables -A %s -s on %s", chain, r.Hostname())
}
}
cableReplug := func(r TailscaleClient) {
t.Helper()
for _, chain := range []string{"OUTPUT", "INPUT"} {
_, _, err := r.Execute([]string{
"iptables", "-D", chain,
"-d", hsIP, "-j", "DROP",
})
require.NoErrorf(t, err, "iptables -D %s on %s", chain, r.Hostname())
_, _, err = r.Execute([]string{
"iptables", "-D", chain,
"-s", hsIP, "-j", "DROP",
})
require.NoErrorf(t, err, "iptables -D %s -s on %s", chain, r.Hostname())
}
}
t.Log("=== Cable-pull r1. Server should eventually fail r1 over to r2. ===")
cablePull(subRouter1)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Equal(c, map[string]types.NodeID{
pref.String(): nodeID2,
}, pr.PrimaryRoutes, "r2 should become primary after r1 cable pull")
}, propagationTime, 1*time.Second, "waiting for r2 promotion")
t.Log("=== Cable-pull r2 while r1 is still cable-pulled. ===")
cablePull(subRouter2)
// Some primary may transiently flip back to r1 (offline) here — see the
// user's "failover to n1 (offline)" observation in the issue. We do not
// assert on that intermediate state; we just assert recovery below.
t.Log("=== Reconnect r2 (cable plugged back in). ===")
cableReplug(subRouter2)
// R1 — server side primary table should restore r2 as primary.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
pr, err := headscale.PrimaryRoutes()
assert.NoError(c, err)
assert.Equal(c, map[string]types.NodeID{
pref.String(): nodeID2,
}, pr.PrimaryRoutes,
"R1: r2 should be re-registered as primary after cable replug — issue #3203")
}, propagationTime, 1*time.Second, "R1: waiting for r2 to be primary again")
// R2 — client should observe r2 online with the route.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
clientStatus, err := client.Status()
assert.NoError(c, err)
srs2 := subRouter2.MustStatus()
peer := clientStatus.Peer[srs2.Self.PublicKey]
if !assert.NotNil(c, peer) {
return
}
assert.True(c, peer.Online,
"R2: client should see r2 online — issue #3203")
if assert.NotNil(c, peer.PrimaryRoutes) {
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref)
}
}, propagationTime, 1*time.Second, "R2: waiting for client to see r2")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery")
}