mirror of
https://github.com/juanfont/headscale.git
synced 2026-05-05 03:56:10 +02:00
servertest, integration: cover HA both-offline recovery
Three regression tests for the user scenario: an in-process Disconnect/Reconnect, a tailscale-down/up integration test, and an iptables -j DROP cable-pull integration test. Updates #3203
This commit is contained in:
parent
9f7c8e9a07
commit
863fa2f815
2
.github/workflows/test-integration.yaml
vendored
2
.github/workflows/test-integration.yaml
vendored
@ -251,6 +251,8 @@ jobs:
|
||||
- TestSubnetRouteACLFiltering
|
||||
- TestGrantViaSubnetSteering
|
||||
- TestHASubnetRouterPingFailover
|
||||
- TestHASubnetRouterFailoverBothOffline
|
||||
- TestHASubnetRouterFailoverBothOfflineCablePull
|
||||
- TestHeadscale
|
||||
- TestTailscaleNodesJoiningHeadcale
|
||||
- TestSSHOneUserToAll
|
||||
|
||||
@ -3,10 +3,12 @@ package servertest_test
|
||||
import (
|
||||
"context"
|
||||
"net/netip"
|
||||
"slices"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/juanfont/headscale/hscontrol/servertest"
|
||||
"github.com/juanfont/headscale/hscontrol/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"tailscale.com/tailcfg"
|
||||
@ -211,6 +213,134 @@ func TestRoutes(t *testing.T) {
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Reproduces https://github.com/juanfont/headscale/issues/3203:
|
||||
// HA tracking loses the secondary subnet router after all routers serving
|
||||
// the route have been offline simultaneously and one of them returns.
|
||||
//
|
||||
// Two assertions split the failure surface:
|
||||
// R1 — server-side primary route state restores after reconnect.
|
||||
// R2 — observer's netmap shows the reconnected router online with
|
||||
// the route in its primary set.
|
||||
// If R1 fails the bug is in state.Connect / primaryRoutes; if R1 passes
|
||||
// and R2 fails the bug is in change broadcast / mapBatcher.
|
||||
//
|
||||
// Caveat: servertest's Reconnect re-registers via TryLogin in addition
|
||||
// to starting a new poll session. Production reconnects after a brief
|
||||
// network outage may bypass re-registration. If this test passes on
|
||||
// main, fall back to the integration variant noted in the plan
|
||||
// (TestHASubnetRouterFailover with all routers offline simultaneously).
|
||||
t.Run("ha_secondary_recovers_after_all_offline", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha3203-user")
|
||||
|
||||
route := netip.MustParsePrefix("10.0.0.0/24")
|
||||
|
||||
r1 := servertest.NewClient(t, srv, "ha3203-router1",
|
||||
servertest.WithUser(user))
|
||||
r2 := servertest.NewClient(t, srv, "ha3203-router2",
|
||||
servertest.WithUser(user))
|
||||
obs := servertest.NewClient(t, srv, "ha3203-observer",
|
||||
servertest.WithUser(user))
|
||||
|
||||
obs.WaitForPeers(t, 2, 10*time.Second)
|
||||
|
||||
// Both routers advertise the same route via their hostinfo.
|
||||
advertise := func(c *servertest.TestClient, name string) {
|
||||
t.Helper()
|
||||
c.Direct().SetHostinfo(&tailcfg.Hostinfo{
|
||||
BackendLogID: "servertest-" + name,
|
||||
Hostname: name,
|
||||
RoutableIPs: []netip.Prefix{route},
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
_ = c.Direct().SendUpdate(ctx)
|
||||
}
|
||||
advertise(r1, "ha3203-router1")
|
||||
advertise(r2, "ha3203-router2")
|
||||
|
||||
// Approve the route on both routers explicitly. Auto-approvers
|
||||
// would also work but introduce a policy dependency the harness
|
||||
// does not currently set up here.
|
||||
approve := func(name string) {
|
||||
t.Helper()
|
||||
id := findNodeID(t, srv, name)
|
||||
|
||||
_, ch, err := srv.State().SetApprovedRoutes(id, []netip.Prefix{route})
|
||||
require.NoError(t, err)
|
||||
srv.App.Change(ch)
|
||||
}
|
||||
approve("ha3203-router1")
|
||||
approve("ha3203-router2")
|
||||
|
||||
// Sanity: r1 starts as primary (lower NodeID by registration order).
|
||||
r1ID := findNodeID(t, srv, "ha3203-router1")
|
||||
r2ID := findNodeID(t, srv, "ha3203-router2")
|
||||
|
||||
hasRoute := func(id types.NodeID) bool {
|
||||
return slices.Contains(srv.State().GetNodePrimaryRoutes(id), route)
|
||||
}
|
||||
|
||||
assert.Eventually(t, func() bool { return hasRoute(r1ID) },
|
||||
10*time.Second, 100*time.Millisecond,
|
||||
"r1 should be primary initially")
|
||||
|
||||
// 1. Take r1 offline. After the 10s grace period, r2 should take over.
|
||||
r1.Disconnect(t)
|
||||
assert.Eventually(t, func() bool { return hasRoute(r2ID) && !hasRoute(r1ID) },
|
||||
20*time.Second, 200*time.Millisecond,
|
||||
"r2 should take over as primary after r1 offline")
|
||||
|
||||
// 2. Take r2 offline. With both routers gone, no primary should remain.
|
||||
r2.Disconnect(t)
|
||||
assert.Eventually(t, func() bool { return !hasRoute(r1ID) && !hasRoute(r2ID) },
|
||||
20*time.Second, 200*time.Millisecond,
|
||||
"no primary should be assigned while both routers are offline")
|
||||
|
||||
// 3. Reconnect r2 (cable plugged back in).
|
||||
r2.Reconnect(t)
|
||||
|
||||
// Hostinfo is part of the controlclient.Direct state; the Reconnect
|
||||
// helper re-registers via TryLogin which carries the same Hostinfo
|
||||
// that was set above. Push it again to be sure the announced route
|
||||
// is registered in the new session.
|
||||
advertise(r2, "ha3203-router2")
|
||||
|
||||
// R1: server-side state must restore r2 as primary.
|
||||
assert.Eventually(t, func() bool { return hasRoute(r2ID) },
|
||||
15*time.Second, 200*time.Millisecond,
|
||||
"R1: r2 should be re-registered as primary after reconnect — issue #3203")
|
||||
|
||||
// R2: observer must see r2 online with the route in its primary set.
|
||||
obs.WaitForCondition(t, "R2: observer sees r2 online with primary route",
|
||||
15*time.Second,
|
||||
func(nm *netmap.NetworkMap) bool {
|
||||
for _, p := range nm.Peers {
|
||||
hi := p.Hostinfo()
|
||||
if !hi.Valid() || hi.Hostname() != "ha3203-router2" {
|
||||
continue
|
||||
}
|
||||
|
||||
online, known := p.Online().GetOk()
|
||||
if !known || !online {
|
||||
return false
|
||||
}
|
||||
|
||||
for i := range p.PrimaryRoutes().Len() {
|
||||
if p.PrimaryRoutes().At(i) == route {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// findNodeID is defined in issues_test.go.
|
||||
|
||||
@ -3871,3 +3871,429 @@ func TestHASubnetRouterPingFailover(t *testing.T) {
|
||||
assertTracerouteViaIPWithCollect(c, tr, ip)
|
||||
}, propagationTime, 200*time.Millisecond, "traceroute should still go through router 2 after recovery")
|
||||
}
|
||||
|
||||
// TestHASubnetRouterFailoverBothOffline reproduces issue #3203:
|
||||
// HA tracking loses the secondary subnet router after all routers serving
|
||||
// the route have been offline simultaneously and one of them returns.
|
||||
// See https://github.com/juanfont/headscale/issues/3203.
|
||||
//
|
||||
// Existing TestHASubnetRouterFailover keeps subRouter3 online across both
|
||||
// failover steps, so the all-offline transition is uncovered. This test
|
||||
// uses two routers and walks them both offline before bringing r2 back.
|
||||
//
|
||||
// Two assertion sets split the failure surface:
|
||||
// - R1: server-side primary route table restores after reconnect.
|
||||
// If R1 fails, the bug is in state.Connect / primaryRoutes.
|
||||
// - R2: client's view shows r2 online with the route in PrimaryRoutes.
|
||||
// If R1 passes and R2 fails, the bug is in change broadcast /
|
||||
// mapBatcher / multiChannelNodeConn.
|
||||
func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
|
||||
IntegrationSkip(t)
|
||||
|
||||
propagationTime := integrationutil.ScaledTimeout(60 * time.Second)
|
||||
|
||||
spec := ScenarioSpec{
|
||||
NodesPerUser: 2,
|
||||
Users: []string{"user1", "user2"},
|
||||
Networks: map[string]NetworkSpec{
|
||||
"usernet1": {Users: []string{"user1"}},
|
||||
"usernet2": {Users: []string{"user2"}},
|
||||
},
|
||||
ExtraService: map[string][]extraServiceFunc{
|
||||
"usernet1": {Webservice},
|
||||
},
|
||||
Versions: []string{"head"},
|
||||
}
|
||||
|
||||
scenario, err := NewScenario(spec)
|
||||
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
||||
|
||||
err = scenario.CreateHeadscaleEnv(
|
||||
[]tsic.Option{tsic.WithAcceptRoutes()},
|
||||
hsic.WithTestName("rt-haboth"),
|
||||
)
|
||||
requireNoErrHeadscaleEnv(t, err)
|
||||
|
||||
allClients, err := scenario.ListTailscaleClients()
|
||||
requireNoErrListClients(t, err)
|
||||
|
||||
err = scenario.WaitForTailscaleSync()
|
||||
requireNoErrSync(t, err)
|
||||
|
||||
headscale, err := scenario.Headscale()
|
||||
requireNoErrGetHeadscale(t, err)
|
||||
|
||||
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
||||
require.NoError(t, err)
|
||||
|
||||
pref := *prefp
|
||||
t.Logf("usernet1 prefix: %s", pref.String())
|
||||
|
||||
usernet1, err := scenario.Network("usernet1")
|
||||
require.NoError(t, err)
|
||||
|
||||
services, err := scenario.Services("usernet1")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, services, 1)
|
||||
|
||||
web := services[0]
|
||||
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
||||
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
||||
|
||||
sort.SliceStable(allClients, func(i, j int) bool {
|
||||
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
||||
})
|
||||
|
||||
subRouter1 := allClients[0]
|
||||
subRouter2 := allClients[1]
|
||||
client := allClients[2]
|
||||
|
||||
t.Logf("Router 1: %s, Router 2: %s, Client: %s",
|
||||
subRouter1.Hostname(), subRouter2.Hostname(), client.Hostname())
|
||||
|
||||
// Advertise the same route on both routers.
|
||||
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
||||
_, _, err = r.Execute([]string{
|
||||
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
||||
})
|
||||
require.NoErrorf(t, err, "failed to advertise route on %s", r.Hostname())
|
||||
}
|
||||
|
||||
err = scenario.WaitForTailscaleSync()
|
||||
requireNoErrSync(t, err)
|
||||
|
||||
var nodes []*v1.Node
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
nodes, err = headscale.ListNodes()
|
||||
assert.NoError(c, err)
|
||||
assert.Len(c, nodes, 4)
|
||||
}, propagationTime, 200*time.Millisecond, "nodes should be registered")
|
||||
|
||||
// Approve the route on both routers explicitly.
|
||||
_, err = headscale.ApproveRoutes(
|
||||
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
||||
[]netip.Prefix{pref},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = headscale.ApproveRoutes(
|
||||
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
||||
[]netip.Prefix{pref},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
|
||||
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
||||
|
||||
// Sanity: r1 starts as primary (lower NodeID).
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Equal(c, map[string]types.NodeID{
|
||||
pref.String(): nodeID1,
|
||||
}, pr.PrimaryRoutes, "router 1 should be primary initially")
|
||||
}, propagationTime, 200*time.Millisecond, "waiting for HA setup")
|
||||
|
||||
// Confirm initial connectivity through r1.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
result, err := client.Curl(weburl)
|
||||
assert.NoError(c, err)
|
||||
assert.Len(c, result, 13)
|
||||
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r1")
|
||||
|
||||
t.Log("=== Step 1: r1 goes offline. r2 should take over. ===")
|
||||
|
||||
require.NoError(t, subRouter1.Down())
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Equal(c, map[string]types.NodeID{
|
||||
pref.String(): nodeID2,
|
||||
}, pr.PrimaryRoutes, "r2 should be primary after r1 offline")
|
||||
}, propagationTime, 500*time.Millisecond, "waiting for failover to r2")
|
||||
|
||||
t.Log("=== Step 2: r2 also goes offline. No primary should remain. ===")
|
||||
|
||||
require.NoError(t, subRouter2.Down())
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Empty(c, pr.PrimaryRoutes,
|
||||
"no primary should be assigned while both routers are offline")
|
||||
}, propagationTime, 500*time.Millisecond, "waiting for both routers to be offline")
|
||||
|
||||
t.Log("=== Step 3: r2 returns. ===")
|
||||
t.Log(" R1: server-side primary route state must restore r2 as primary.")
|
||||
t.Log(" R2: client must observe r2 online with the route in PrimaryRoutes.")
|
||||
|
||||
require.NoError(t, subRouter2.Up())
|
||||
|
||||
// R1 — server side.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Equal(c, map[string]types.NodeID{
|
||||
pref.String(): nodeID2,
|
||||
}, pr.PrimaryRoutes,
|
||||
"R1: r2 should be re-registered as primary after reconnect — issue #3203")
|
||||
}, propagationTime, 500*time.Millisecond, "R1: waiting for server-side primary restore")
|
||||
|
||||
// R2 — client view.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
clientStatus, err := client.Status()
|
||||
assert.NoError(c, err)
|
||||
|
||||
srs2 := subRouter2.MustStatus()
|
||||
|
||||
peer := clientStatus.Peer[srs2.Self.PublicKey]
|
||||
if !assert.NotNil(c, peer, "r2 peer should be in client status") {
|
||||
return
|
||||
}
|
||||
|
||||
assert.True(c, peer.Online,
|
||||
"R2: client should see r2 online after reconnect — issue #3203")
|
||||
|
||||
if assert.NotNil(c, peer.PrimaryRoutes,
|
||||
"R2: r2 should have PrimaryRoutes set in client status") {
|
||||
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref,
|
||||
"R2: client's view of r2 should include the route as primary")
|
||||
}
|
||||
|
||||
requirePeerSubnetRoutesWithCollect(c, peer, []netip.Prefix{pref})
|
||||
}, propagationTime, 500*time.Millisecond, "R2: waiting for client to see r2 with primary route")
|
||||
|
||||
// End-to-end traffic should reach the webservice via r2.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
result, err := client.Curl(weburl)
|
||||
assert.NoError(c, err)
|
||||
assert.Len(c, result, 13)
|
||||
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery")
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
tr, err := client.Traceroute(webip)
|
||||
assert.NoError(c, err)
|
||||
|
||||
ip, err := subRouter2.IPv4()
|
||||
if !assert.NoError(c, err, "failed to get IPv4 for r2") {
|
||||
return
|
||||
}
|
||||
|
||||
assertTracerouteViaIPWithCollect(c, tr, ip)
|
||||
}, propagationTime, 200*time.Millisecond, "traceroute should go through r2 after recovery")
|
||||
}
|
||||
|
||||
// TestHASubnetRouterFailoverBothOfflineCablePull is a stricter variant of
|
||||
// TestHASubnetRouterFailoverBothOffline that simulates a cable pull rather
|
||||
// than a graceful tailscale down. The two differ in what the server sees:
|
||||
//
|
||||
// - tailscale down: poll connection closes cleanly; defer fires
|
||||
// immediately; grace period starts and ends predictably.
|
||||
// - cable pull: server's noise long-poll is wedged in a half-open TCP
|
||||
// connection until kernel keepalives time out (often >60 s). When
|
||||
// the cable returns, two server-side longpoll sessions can overlap.
|
||||
//
|
||||
// Issue #3203 reports the bug after cable pulls; this variant blocks all
|
||||
// traffic between the router container and headscale via iptables and then
|
||||
// removes the block to mimic that behaviour.
|
||||
func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
|
||||
IntegrationSkip(t)
|
||||
|
||||
propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
|
||||
|
||||
spec := ScenarioSpec{
|
||||
NodesPerUser: 2,
|
||||
Users: []string{"user1", "user2"},
|
||||
Networks: map[string]NetworkSpec{
|
||||
"usernet1": {Users: []string{"user1"}},
|
||||
"usernet2": {Users: []string{"user2"}},
|
||||
},
|
||||
ExtraService: map[string][]extraServiceFunc{
|
||||
"usernet1": {Webservice},
|
||||
},
|
||||
Versions: []string{"head"},
|
||||
}
|
||||
|
||||
scenario, err := NewScenario(spec)
|
||||
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
||||
|
||||
err = scenario.CreateHeadscaleEnv(
|
||||
[]tsic.Option{
|
||||
tsic.WithAcceptRoutes(),
|
||||
tsic.WithPackages("iptables"),
|
||||
},
|
||||
hsic.WithTestName("rt-hacable"),
|
||||
)
|
||||
requireNoErrHeadscaleEnv(t, err)
|
||||
|
||||
allClients, err := scenario.ListTailscaleClients()
|
||||
requireNoErrListClients(t, err)
|
||||
|
||||
err = scenario.WaitForTailscaleSync()
|
||||
requireNoErrSync(t, err)
|
||||
|
||||
headscale, err := scenario.Headscale()
|
||||
requireNoErrGetHeadscale(t, err)
|
||||
|
||||
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
||||
require.NoError(t, err)
|
||||
|
||||
pref := *prefp
|
||||
|
||||
usernet1, err := scenario.Network("usernet1")
|
||||
require.NoError(t, err)
|
||||
|
||||
services, err := scenario.Services("usernet1")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, services, 1)
|
||||
|
||||
web := services[0]
|
||||
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
||||
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
||||
|
||||
sort.SliceStable(allClients, func(i, j int) bool {
|
||||
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
||||
})
|
||||
|
||||
subRouter1 := allClients[0]
|
||||
subRouter2 := allClients[1]
|
||||
client := allClients[2]
|
||||
|
||||
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
||||
_, _, err = r.Execute([]string{
|
||||
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
||||
})
|
||||
require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
|
||||
}
|
||||
|
||||
err = scenario.WaitForTailscaleSync()
|
||||
requireNoErrSync(t, err)
|
||||
|
||||
var nodes []*v1.Node
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
nodes, err = headscale.ListNodes()
|
||||
assert.NoError(c, err)
|
||||
assert.Len(c, nodes, 4)
|
||||
}, propagationTime, 200*time.Millisecond, "nodes registered")
|
||||
|
||||
_, err = headscale.ApproveRoutes(
|
||||
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
||||
[]netip.Prefix{pref},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = headscale.ApproveRoutes(
|
||||
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
||||
[]netip.Prefix{pref},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
||||
|
||||
// Sanity: r1 starts as primary.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.NotEmpty(c, pr.PrimaryRoutes, "a primary should exist")
|
||||
}, propagationTime, 200*time.Millisecond, "HA setup")
|
||||
|
||||
hsIP := headscale.GetIPInNetwork(usernet1)
|
||||
|
||||
// "Cable pull" — drop all traffic in BOTH directions to/from headscale.
|
||||
// Unlike the NEW-state-only filter used by TestHASubnetRouterPingFailover,
|
||||
// this also breaks the existing ESTABLISHED long-poll, mimicking a
|
||||
// physically severed link.
|
||||
cablePull := func(r TailscaleClient) {
|
||||
t.Helper()
|
||||
|
||||
for _, chain := range []string{"OUTPUT", "INPUT"} {
|
||||
_, _, err := r.Execute([]string{
|
||||
"iptables", "-A", chain,
|
||||
"-d", hsIP, "-j", "DROP",
|
||||
})
|
||||
require.NoErrorf(t, err, "iptables -A %s on %s", chain, r.Hostname())
|
||||
_, _, err = r.Execute([]string{
|
||||
"iptables", "-A", chain,
|
||||
"-s", hsIP, "-j", "DROP",
|
||||
})
|
||||
require.NoErrorf(t, err, "iptables -A %s -s on %s", chain, r.Hostname())
|
||||
}
|
||||
}
|
||||
|
||||
cableReplug := func(r TailscaleClient) {
|
||||
t.Helper()
|
||||
|
||||
for _, chain := range []string{"OUTPUT", "INPUT"} {
|
||||
_, _, err := r.Execute([]string{
|
||||
"iptables", "-D", chain,
|
||||
"-d", hsIP, "-j", "DROP",
|
||||
})
|
||||
require.NoErrorf(t, err, "iptables -D %s on %s", chain, r.Hostname())
|
||||
_, _, err = r.Execute([]string{
|
||||
"iptables", "-D", chain,
|
||||
"-s", hsIP, "-j", "DROP",
|
||||
})
|
||||
require.NoErrorf(t, err, "iptables -D %s -s on %s", chain, r.Hostname())
|
||||
}
|
||||
}
|
||||
|
||||
t.Log("=== Cable-pull r1. Server should eventually fail r1 over to r2. ===")
|
||||
cablePull(subRouter1)
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Equal(c, map[string]types.NodeID{
|
||||
pref.String(): nodeID2,
|
||||
}, pr.PrimaryRoutes, "r2 should become primary after r1 cable pull")
|
||||
}, propagationTime, 1*time.Second, "waiting for r2 promotion")
|
||||
|
||||
t.Log("=== Cable-pull r2 while r1 is still cable-pulled. ===")
|
||||
cablePull(subRouter2)
|
||||
|
||||
// Some primary may transiently flip back to r1 (offline) here — see the
|
||||
// user's "failover to n1 (offline)" observation in the issue. We do not
|
||||
// assert on that intermediate state; we just assert recovery below.
|
||||
|
||||
t.Log("=== Reconnect r2 (cable plugged back in). ===")
|
||||
cableReplug(subRouter2)
|
||||
|
||||
// R1 — server side primary table should restore r2 as primary.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
pr, err := headscale.PrimaryRoutes()
|
||||
assert.NoError(c, err)
|
||||
assert.Equal(c, map[string]types.NodeID{
|
||||
pref.String(): nodeID2,
|
||||
}, pr.PrimaryRoutes,
|
||||
"R1: r2 should be re-registered as primary after cable replug — issue #3203")
|
||||
}, propagationTime, 1*time.Second, "R1: waiting for r2 to be primary again")
|
||||
|
||||
// R2 — client should observe r2 online with the route.
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
clientStatus, err := client.Status()
|
||||
assert.NoError(c, err)
|
||||
|
||||
srs2 := subRouter2.MustStatus()
|
||||
|
||||
peer := clientStatus.Peer[srs2.Self.PublicKey]
|
||||
if !assert.NotNil(c, peer) {
|
||||
return
|
||||
}
|
||||
|
||||
assert.True(c, peer.Online,
|
||||
"R2: client should see r2 online — issue #3203")
|
||||
|
||||
if assert.NotNil(c, peer.PrimaryRoutes) {
|
||||
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref)
|
||||
}
|
||||
}, propagationTime, 1*time.Second, "R2: waiting for client to see r2")
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
result, err := client.Curl(weburl)
|
||||
assert.NoError(c, err)
|
||||
assert.Len(c, result, 13)
|
||||
}, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery")
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user