From 863fa2f8157bf974b153a2d8f19eee12ad58bb3d Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Tue, 28 Apr 2026 12:16:46 +0000 Subject: [PATCH] servertest, integration: cover HA both-offline recovery Three regression tests for the user scenario: an in-process Disconnect/Reconnect, a tailscale-down/up integration test, and an iptables -j DROP cable-pull integration test. Updates #3203 --- .github/workflows/test-integration.yaml | 2 + hscontrol/servertest/routes_test.go | 130 ++++++++ integration/route_test.go | 426 ++++++++++++++++++++++++ 3 files changed, 558 insertions(+) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index 81f46581..eb97ff1d 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -251,6 +251,8 @@ jobs: - TestSubnetRouteACLFiltering - TestGrantViaSubnetSteering - TestHASubnetRouterPingFailover + - TestHASubnetRouterFailoverBothOffline + - TestHASubnetRouterFailoverBothOfflineCablePull - TestHeadscale - TestTailscaleNodesJoiningHeadcale - TestSSHOneUserToAll diff --git a/hscontrol/servertest/routes_test.go b/hscontrol/servertest/routes_test.go index a271428b..85f81820 100644 --- a/hscontrol/servertest/routes_test.go +++ b/hscontrol/servertest/routes_test.go @@ -3,10 +3,12 @@ package servertest_test import ( "context" "net/netip" + "slices" "testing" "time" "github.com/juanfont/headscale/hscontrol/servertest" + "github.com/juanfont/headscale/hscontrol/types" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "tailscale.com/tailcfg" @@ -211,6 +213,134 @@ func TestRoutes(t *testing.T) { } } }) + + // Reproduces https://github.com/juanfont/headscale/issues/3203: + // HA tracking loses the secondary subnet router after all routers serving + // the route have been offline simultaneously and one of them returns. + // + // Two assertions split the failure surface: + // R1 — server-side primary route state restores after reconnect. 
+ // R2 — observer's netmap shows the reconnected router online with + // the route in its primary set. + // If R1 fails the bug is in state.Connect / primaryRoutes; if R1 passes + // and R2 fails the bug is in change broadcast / mapBatcher. + // + // Caveat: servertest's Reconnect re-registers via TryLogin in addition + // to starting a new poll session. Production reconnects after a brief + // network outage may bypass re-registration. If this test passes on + // main, fall back to the integration variant noted in the plan + // (TestHASubnetRouterFailover with all routers offline simultaneously). + t.Run("ha_secondary_recovers_after_all_offline", func(t *testing.T) { + t.Parallel() + + srv := servertest.NewServer(t) + user := srv.CreateUser(t, "ha3203-user") + + route := netip.MustParsePrefix("10.0.0.0/24") + + r1 := servertest.NewClient(t, srv, "ha3203-router1", + servertest.WithUser(user)) + r2 := servertest.NewClient(t, srv, "ha3203-router2", + servertest.WithUser(user)) + obs := servertest.NewClient(t, srv, "ha3203-observer", + servertest.WithUser(user)) + + obs.WaitForPeers(t, 2, 10*time.Second) + + // Both routers advertise the same route via their hostinfo. + advertise := func(c *servertest.TestClient, name string) { + t.Helper() + c.Direct().SetHostinfo(&tailcfg.Hostinfo{ + BackendLogID: "servertest-" + name, + Hostname: name, + RoutableIPs: []netip.Prefix{route}, + }) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + _ = c.Direct().SendUpdate(ctx) + } + advertise(r1, "ha3203-router1") + advertise(r2, "ha3203-router2") + + // Approve the route on both routers explicitly. Auto-approvers + // would also work but introduce a policy dependency the harness + // does not currently set up here. 
+ approve := func(name string) { + t.Helper() + id := findNodeID(t, srv, name) + + _, ch, err := srv.State().SetApprovedRoutes(id, []netip.Prefix{route}) + require.NoError(t, err) + srv.App.Change(ch) + } + approve("ha3203-router1") + approve("ha3203-router2") + + // Sanity: r1 starts as primary (lower NodeID by registration order). + r1ID := findNodeID(t, srv, "ha3203-router1") + r2ID := findNodeID(t, srv, "ha3203-router2") + + hasRoute := func(id types.NodeID) bool { + return slices.Contains(srv.State().GetNodePrimaryRoutes(id), route) + } + + assert.Eventually(t, func() bool { return hasRoute(r1ID) }, + 10*time.Second, 100*time.Millisecond, + "r1 should be primary initially") + + // 1. Take r1 offline. After the 10s grace period, r2 should take over. + r1.Disconnect(t) + assert.Eventually(t, func() bool { return hasRoute(r2ID) && !hasRoute(r1ID) }, + 20*time.Second, 200*time.Millisecond, + "r2 should take over as primary after r1 offline") + + // 2. Take r2 offline. With both routers gone, no primary should remain. + r2.Disconnect(t) + assert.Eventually(t, func() bool { return !hasRoute(r1ID) && !hasRoute(r2ID) }, + 20*time.Second, 200*time.Millisecond, + "no primary should be assigned while both routers are offline") + + // 3. Reconnect r2 (cable plugged back in). + r2.Reconnect(t) + + // Hostinfo is part of the controlclient.Direct state; the Reconnect + // helper re-registers via TryLogin which carries the same Hostinfo + // that was set above. Push it again to be sure the announced route + // is registered in the new session. + advertise(r2, "ha3203-router2") + + // R1: server-side state must restore r2 as primary. + assert.Eventually(t, func() bool { return hasRoute(r2ID) }, + 15*time.Second, 200*time.Millisecond, + "R1: r2 should be re-registered as primary after reconnect — issue #3203") + + // R2: observer must see r2 online with the route in its primary set. 
+ obs.WaitForCondition(t, "R2: observer sees r2 online with primary route", + 15*time.Second, + func(nm *netmap.NetworkMap) bool { + for _, p := range nm.Peers { + hi := p.Hostinfo() + if !hi.Valid() || hi.Hostname() != "ha3203-router2" { + continue + } + + online, known := p.Online().GetOk() + if !known || !online { + return false + } + + for i := range p.PrimaryRoutes().Len() { + if p.PrimaryRoutes().At(i) == route { + return true + } + } + } + + return false + }) + }) } // findNodeID is defined in issues_test.go. diff --git a/integration/route_test.go b/integration/route_test.go index e660dd35..6cc47c30 100644 --- a/integration/route_test.go +++ b/integration/route_test.go @@ -3871,3 +3871,429 @@ func TestHASubnetRouterPingFailover(t *testing.T) { assertTracerouteViaIPWithCollect(c, tr, ip) }, propagationTime, 200*time.Millisecond, "traceroute should still go through router 2 after recovery") } + +// TestHASubnetRouterFailoverBothOffline reproduces issue #3203: +// HA tracking loses the secondary subnet router after all routers serving +// the route have been offline simultaneously and one of them returns. +// See https://github.com/juanfont/headscale/issues/3203. +// +// Existing TestHASubnetRouterFailover keeps subRouter3 online across both +// failover steps, so the all-offline transition is uncovered. This test +// uses two routers and walks them both offline before bringing r2 back. +// +// Two assertion sets split the failure surface: +// - R1: server-side primary route table restores after reconnect. +// If R1 fails, the bug is in state.Connect / primaryRoutes. +// - R2: client's view shows r2 online with the route in PrimaryRoutes. +// If R1 passes and R2 fails, the bug is in change broadcast / +// mapBatcher / multiChannelNodeConn. 
+func TestHASubnetRouterFailoverBothOffline(t *testing.T) { + IntegrationSkip(t) + + propagationTime := integrationutil.ScaledTimeout(60 * time.Second) + + spec := ScenarioSpec{ + NodesPerUser: 2, + Users: []string{"user1", "user2"}, + Networks: map[string]NetworkSpec{ + "usernet1": {Users: []string{"user1"}}, + "usernet2": {Users: []string{"user2"}}, + }, + ExtraService: map[string][]extraServiceFunc{ + "usernet1": {Webservice}, + }, + Versions: []string{"head"}, + } + + scenario, err := NewScenario(spec) + require.NoErrorf(t, err, "failed to create scenario: %s", err) + + err = scenario.CreateHeadscaleEnv( + []tsic.Option{tsic.WithAcceptRoutes()}, + hsic.WithTestName("rt-haboth"), + ) + requireNoErrHeadscaleEnv(t, err) + + allClients, err := scenario.ListTailscaleClients() + requireNoErrListClients(t, err) + + err = scenario.WaitForTailscaleSync() + requireNoErrSync(t, err) + + headscale, err := scenario.Headscale() + requireNoErrGetHeadscale(t, err) + + prefp, err := scenario.SubnetOfNetwork("usernet1") + require.NoError(t, err) + + pref := *prefp + t.Logf("usernet1 prefix: %s", pref.String()) + + usernet1, err := scenario.Network("usernet1") + require.NoError(t, err) + + services, err := scenario.Services("usernet1") + require.NoError(t, err) + require.Len(t, services, 1) + + web := services[0] + webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1)) + weburl := fmt.Sprintf("http://%s/etc/hostname", webip) + + sort.SliceStable(allClients, func(i, j int) bool { + return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID + }) + + subRouter1 := allClients[0] + subRouter2 := allClients[1] + client := allClients[2] + + t.Logf("Router 1: %s, Router 2: %s, Client: %s", + subRouter1.Hostname(), subRouter2.Hostname(), client.Hostname()) + + // Advertise the same route on both routers. 
+	for _, r := range []TailscaleClient{subRouter1, subRouter2} {
+		_, _, err = r.Execute([]string{
+			"tailscale", "set", "--advertise-routes=" + pref.String(),
+		})
+		require.NoErrorf(t, err, "failed to advertise route on %s", r.Hostname())
+	}
+
+	err = scenario.WaitForTailscaleSync()
+	requireNoErrSync(t, err)
+
+	var nodes []*v1.Node
+
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		nodes, err = headscale.ListNodes()
+		assert.NoError(c, err)
+		assert.Len(c, nodes, 4)
+	}, propagationTime, 200*time.Millisecond, "nodes should be registered")
+
+	// Approve the route on both routers explicitly.
+	_, err = headscale.ApproveRoutes(
+		MustFindNode(subRouter1.Hostname(), nodes).GetId(),
+		[]netip.Prefix{pref},
+	)
+	require.NoError(t, err)
+
+	_, err = headscale.ApproveRoutes(
+		MustFindNode(subRouter2.Hostname(), nodes).GetId(),
+		[]netip.Prefix{pref},
+	)
+	require.NoError(t, err)
+
+	nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
+	nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
+
+	// Sanity: r1 starts as primary (lower NodeID).
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		pr, err := headscale.PrimaryRoutes()
+		assert.NoError(c, err)
+		assert.Equal(c, map[string]types.NodeID{
+			pref.String(): nodeID1,
+		}, pr.PrimaryRoutes, "router 1 should be primary initially")
+	}, propagationTime, 200*time.Millisecond, "waiting for HA setup")
+
+	// Confirm initial connectivity through r1.
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		result, err := client.Curl(weburl)
+		assert.NoError(c, err)
+		assert.Len(c, result, 13)
+	}, propagationTime, 200*time.Millisecond, "client reaches webservice via r1")
+
+	t.Log("=== Step 1: r1 goes offline. r2 should take over. ===")
+
+	require.NoError(t, subRouter1.Down())
+
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		pr, err := headscale.PrimaryRoutes()
+		assert.NoError(c, err)
+		assert.Equal(c, map[string]types.NodeID{
+			pref.String(): nodeID2,
+		}, pr.PrimaryRoutes, "r2 should be primary after r1 offline")
+	}, propagationTime, 500*time.Millisecond, "waiting for failover to r2")
+
+	t.Log("=== Step 2: r2 also goes offline. No primary should remain. ===")
+
+	require.NoError(t, subRouter2.Down())
+
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		pr, err := headscale.PrimaryRoutes()
+		assert.NoError(c, err)
+		assert.Empty(c, pr.PrimaryRoutes,
+			"no primary should be assigned while both routers are offline")
+	}, propagationTime, 500*time.Millisecond, "waiting for both routers to be offline")
+
+	t.Log("=== Step 3: r2 returns. ===")
+	t.Log("    R1: server-side primary route state must restore r2 as primary.")
+	t.Log("    R2: client must observe r2 online with the route in PrimaryRoutes.")
+
+	require.NoError(t, subRouter2.Up())
+
+	// R1 — server side.
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		pr, err := headscale.PrimaryRoutes()
+		assert.NoError(c, err)
+		assert.Equal(c, map[string]types.NodeID{
+			pref.String(): nodeID2,
+		}, pr.PrimaryRoutes,
+			"R1: r2 should be re-registered as primary after reconnect — issue #3203")
+	}, propagationTime, 500*time.Millisecond, "R1: waiting for server-side primary restore")
+
+	// R2 — client view.
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		clientStatus, err := client.Status()
+		assert.NoError(c, err)
+
+		srs2 := subRouter2.MustStatus()
+
+		peer := clientStatus.Peer[srs2.Self.PublicKey]
+		if !assert.NotNil(c, peer, "r2 peer should be in client status") {
+			return
+		}
+
+		assert.True(c, peer.Online,
+			"R2: client should see r2 online after reconnect — issue #3203")
+
+		if assert.NotNil(c, peer.PrimaryRoutes,
+			"R2: r2 should have PrimaryRoutes set in client status") {
+			assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref,
+				"R2: client's view of r2 should include the route as primary")
+		}
+
+		requirePeerSubnetRoutesWithCollect(c, peer, []netip.Prefix{pref})
+	}, propagationTime, 500*time.Millisecond, "R2: waiting for client to see r2 with primary route")
+
+	// End-to-end traffic should reach the webservice via r2.
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		result, err := client.Curl(weburl)
+		assert.NoError(c, err)
+		assert.Len(c, result, 13)
+	}, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery")
+
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		tr, err := client.Traceroute(webip)
+		assert.NoError(c, err)
+
+		ip, err := subRouter2.IPv4()
+		if !assert.NoError(c, err, "failed to get IPv4 for r2") {
+			return
+		}
+
+		assertTracerouteViaIPWithCollect(c, tr, ip)
+	}, propagationTime, 200*time.Millisecond, "traceroute should go through r2 after recovery")
+}
+
+// TestHASubnetRouterFailoverBothOfflineCablePull is a stricter variant of
+// TestHASubnetRouterFailoverBothOffline that simulates a cable pull rather
+// than a graceful tailscale down. The two differ in what the server sees:
+//
+//   - tailscale down: poll connection closes cleanly; defer fires
+//     immediately; grace period starts and ends predictably.
+//   - cable pull: server's noise long-poll is wedged in a half-open TCP
+//     connection until kernel keepalives time out (often >60 s). When the
+//     cable returns, two server-side longpoll sessions can overlap.
+//
+// Issue #3203 reports the bug after cable pulls; this variant blocks all
+// traffic between the router container and headscale via iptables and then
+// removes the block to mimic that behaviour.
+func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
+	IntegrationSkip(t)
+
+	propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
+
+	spec := ScenarioSpec{
+		NodesPerUser: 2,
+		Users:        []string{"user1", "user2"},
+		Networks: map[string]NetworkSpec{
+			"usernet1": {Users: []string{"user1"}},
+			"usernet2": {Users: []string{"user2"}},
+		},
+		ExtraService: map[string][]extraServiceFunc{
+			"usernet1": {Webservice},
+		},
+		Versions: []string{"head"},
+	}
+
+	scenario, err := NewScenario(spec)
+	require.NoErrorf(t, err, "failed to create scenario: %s", err)
+
+	err = scenario.CreateHeadscaleEnv(
+		[]tsic.Option{
+			tsic.WithAcceptRoutes(),
+			tsic.WithPackages("iptables"),
+		},
+		hsic.WithTestName("rt-hacable"),
+	)
+	requireNoErrHeadscaleEnv(t, err)
+
+	allClients, err := scenario.ListTailscaleClients()
+	requireNoErrListClients(t, err)
+
+	err = scenario.WaitForTailscaleSync()
+	requireNoErrSync(t, err)
+
+	headscale, err := scenario.Headscale()
+	requireNoErrGetHeadscale(t, err)
+
+	prefp, err := scenario.SubnetOfNetwork("usernet1")
+	require.NoError(t, err)
+
+	pref := *prefp
+
+	usernet1, err := scenario.Network("usernet1")
+	require.NoError(t, err)
+
+	services, err := scenario.Services("usernet1")
+	require.NoError(t, err)
+	require.Len(t, services, 1)
+
+	web := services[0]
+	webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
+	weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
+
+	sort.SliceStable(allClients, func(i, j int) bool {
+		return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
+	})
+
+	subRouter1 := allClients[0]
+	subRouter2 := allClients[1]
+	client := allClients[2]
+
+	for _, r := range []TailscaleClient{subRouter1, subRouter2} {
+		_, _, err = r.Execute([]string{
+			"tailscale", "set", "--advertise-routes=" + pref.String(),
+		})
+		require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
+	}
+
+	err = scenario.WaitForTailscaleSync()
+	requireNoErrSync(t, err)
+
+	var nodes []*v1.Node
+
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		nodes, err = headscale.ListNodes()
+		assert.NoError(c, err)
+		assert.Len(c, nodes, 4)
+	}, propagationTime, 200*time.Millisecond, "nodes registered")
+
+	_, err = headscale.ApproveRoutes(
+		MustFindNode(subRouter1.Hostname(), nodes).GetId(),
+		[]netip.Prefix{pref},
+	)
+	require.NoError(t, err)
+
+	_, err = headscale.ApproveRoutes(
+		MustFindNode(subRouter2.Hostname(), nodes).GetId(),
+		[]netip.Prefix{pref},
+	)
+	require.NoError(t, err)
+
+	nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
+
+	// Sanity: r1 starts as primary.
+	assert.EventuallyWithT(t, func(c *assert.CollectT) {
+		pr, err := headscale.PrimaryRoutes()
+		assert.NoError(c, err)
+		assert.NotEmpty(c, pr.PrimaryRoutes, "a primary should exist")
+	}, propagationTime, 200*time.Millisecond, "HA setup")
+
+	hsIP := headscale.GetIPInNetwork(usernet1)
+
+	// "Cable pull" — drop all traffic in BOTH directions to/from headscale.
+	// Unlike the NEW-state-only filter used by TestHASubnetRouterPingFailover,
+	// this also breaks the existing ESTABLISHED long-poll, mimicking a
+	// physically severed link.
+ cablePull := func(r TailscaleClient) { + t.Helper() + + for _, chain := range []string{"OUTPUT", "INPUT"} { + _, _, err := r.Execute([]string{ + "iptables", "-A", chain, + "-d", hsIP, "-j", "DROP", + }) + require.NoErrorf(t, err, "iptables -A %s on %s", chain, r.Hostname()) + _, _, err = r.Execute([]string{ + "iptables", "-A", chain, + "-s", hsIP, "-j", "DROP", + }) + require.NoErrorf(t, err, "iptables -A %s -s on %s", chain, r.Hostname()) + } + } + + cableReplug := func(r TailscaleClient) { + t.Helper() + + for _, chain := range []string{"OUTPUT", "INPUT"} { + _, _, err := r.Execute([]string{ + "iptables", "-D", chain, + "-d", hsIP, "-j", "DROP", + }) + require.NoErrorf(t, err, "iptables -D %s on %s", chain, r.Hostname()) + _, _, err = r.Execute([]string{ + "iptables", "-D", chain, + "-s", hsIP, "-j", "DROP", + }) + require.NoErrorf(t, err, "iptables -D %s -s on %s", chain, r.Hostname()) + } + } + + t.Log("=== Cable-pull r1. Server should eventually fail r1 over to r2. ===") + cablePull(subRouter1) + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + pr, err := headscale.PrimaryRoutes() + assert.NoError(c, err) + assert.Equal(c, map[string]types.NodeID{ + pref.String(): nodeID2, + }, pr.PrimaryRoutes, "r2 should become primary after r1 cable pull") + }, propagationTime, 1*time.Second, "waiting for r2 promotion") + + t.Log("=== Cable-pull r2 while r1 is still cable-pulled. ===") + cablePull(subRouter2) + + // Some primary may transiently flip back to r1 (offline) here — see the + // user's "failover to n1 (offline)" observation in the issue. We do not + // assert on that intermediate state; we just assert recovery below. + + t.Log("=== Reconnect r2 (cable plugged back in). ===") + cableReplug(subRouter2) + + // R1 — server side primary table should restore r2 as primary. 
+ assert.EventuallyWithT(t, func(c *assert.CollectT) { + pr, err := headscale.PrimaryRoutes() + assert.NoError(c, err) + assert.Equal(c, map[string]types.NodeID{ + pref.String(): nodeID2, + }, pr.PrimaryRoutes, + "R1: r2 should be re-registered as primary after cable replug — issue #3203") + }, propagationTime, 1*time.Second, "R1: waiting for r2 to be primary again") + + // R2 — client should observe r2 online with the route. + assert.EventuallyWithT(t, func(c *assert.CollectT) { + clientStatus, err := client.Status() + assert.NoError(c, err) + + srs2 := subRouter2.MustStatus() + + peer := clientStatus.Peer[srs2.Self.PublicKey] + if !assert.NotNil(c, peer) { + return + } + + assert.True(c, peer.Online, + "R2: client should see r2 online — issue #3203") + + if assert.NotNil(c, peer.PrimaryRoutes) { + assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref) + } + }, propagationTime, 1*time.Second, "R2: waiting for client to see r2") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery") +}