integration: HA cable-pull lifecycle test
Add DisconnectFromNetwork/ReconnectToNetwork to TailscaleClient, backed by pool.Client.DisconnectNetwork. The new test exercises single-router fail+recover on either side, a sequential dual failure, and a simultaneous dual failure. The dual-failure legs assert that the primary does not flap to a known-bad router; the single-router-return legs only check traffic, because docker network disconnect transiently fails probes on sibling routers. Fails on the parent commit; passes after the fix.

Updates #3203
This commit is contained in:
  parent 863fa2f815
  commit 7bb86f2c16
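For orientation before the diff, here is a minimal sketch of the call pattern the new methods enable. It is not part of this commit: the cablePullCycle helper, the package clause, and the import paths are assumptions for illustration, while TailscaleClient, DisconnectFromNetwork, and ReconnectToNetwork are the interface additions shown in the hunks below.

package integration // assumed: the integration-test package that declares TailscaleClient

import (
    "testing"

    "github.com/ory/dockertest/v3"
    "github.com/stretchr/testify/require"
)

// cablePullCycle is a hypothetical helper (not in this commit) showing the
// intended use of the new cable-pull primitives: daemon-level disconnect,
// assert failover happened, reconnect, assert traffic recovered.
func cablePullCycle(
    t *testing.T,
    router TailscaleClient, // interface extended by this commit
    network *dockertest.Network, // the user network the router sits on
    assertFailedOver func(), // e.g. a requirePrimary-style check for the other router
    assertRecovered func(), // e.g. a requireTrafficWorks-style check
) {
    t.Helper()

    // "Pull the cable": the container loses its interface on network and any
    // in-flight long-poll TCP connection is left half-open at the peer.
    require.NoError(t, router.DisconnectFromNetwork(network))
    assertFailedOver()

    // "Plug it back in" and confirm traffic flows again.
    require.NoError(t, router.ReconnectToNetwork(network))
    assertRecovered()
}
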
@ -91,6 +91,47 @@ func AddContainerToNetwork(
    return nil
}

// DisconnectContainerFromNetwork removes the container from network at
// the docker daemon level. Mirrors a physical cable pull: the
// container's network interface for that network disappears and any
// in-flight TCP connections are left half-open, exactly the failure
// mode iptables-based simulations cannot reproduce.
func DisconnectContainerFromNetwork(
    pool *dockertest.Pool,
    network *dockertest.Network,
    testContainer string,
) error {
    containers, err := pool.Client.ListContainers(docker.ListContainersOptions{
        All: true,
        Filters: map[string][]string{
            "name": {testContainer},
        },
    })
    if err != nil {
        return err
    }

    if len(containers) == 0 {
        return fmt.Errorf("%w: %s", ErrContainerNotFound, testContainer)
    }

    return pool.Client.DisconnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
        Container: containers[0].ID,
        Force:     true,
    })
}

// ReconnectContainerToNetwork is the inverse of
// DisconnectContainerFromNetwork — re-attaches the container to the
// network so traffic can flow again.
func ReconnectContainerToNetwork(
    pool *dockertest.Pool,
    network *dockertest.Network,
    testContainer string,
) error {
    return AddContainerToNetwork(pool, network, testContainer)
}

// RandomFreeHostPort asks the kernel for a free open port that is ready to use.
// (from https://github.com/phayes/freeport)
func RandomFreeHostPort() (int, error) {

@ -4297,3 +4297,255 @@ func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
        assert.Len(c, result, 13)
    }, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery")
}

// TestHASubnetRouterFailoverDockerDisconnect drives a multi-phase
// up/down/up/down lifecycle of two HA subnet routers using real
// docker network disconnects — the same failure primitive nblock
// observed when pulling a Proxmox interface in issue #3203.
// iptables-based simulations cannot reproduce this because the
// container's kernel still owns the socket; only daemon-level
// disconnect leaves the long-poll TCP half-open at the peer.
//
// Phases:
//  1. r1 starts as primary (lowest NodeID).
//  2. r1 alone fails and recovers — failover to r2, then traffic
//     resumes when r1 returns.
//  3. r2 alone fails and recovers — failover, then traffic resumes.
//  4. Sequential dual failure — the issue #3203 bug.
//     4a. r1 down → r2 promoted.
//     4b. r2 down → primary must NOT flap to offline r1.
//     4c. r2 up → r2 primary again, traffic resumes.
//  5. Simultaneous dual failure.
//     5a. r1 + r2 down → primary must NOT flap to offline r1.
//     5b. both up → primary stays r2, traffic resumes.
//
// The no-flap assertions in 4b and 5a are the regression barriers
// for #3203. Phases 2/3 are functional checks (failover works,
// traffic recovers) without strict identity assertions on the
// "return" leg, since `docker network disconnect` triggers bridge
// reconfiguration that can transiently affect probing of OTHER
// containers on the same network — a test-infrastructure quirk
// that does not occur with a real cable pull.
func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) {
    IntegrationSkip(t)

    propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
    flapWindow := integrationutil.ScaledTimeout(40 * time.Second)

    spec := ScenarioSpec{
        NodesPerUser: 2,
        Users:        []string{"user1", "user2"},
        Networks: map[string]NetworkSpec{
            "usernet1": {Users: []string{"user1"}},
            "usernet2": {Users: []string{"user2"}},
        },
        ExtraService: map[string][]extraServiceFunc{
            "usernet1": {Webservice},
        },
        Versions: []string{"head"},
    }

    scenario, err := NewScenario(spec)
    require.NoErrorf(t, err, "failed to create scenario: %s", err)

    err = scenario.CreateHeadscaleEnv(
        []tsic.Option{tsic.WithAcceptRoutes()},
        hsic.WithTestName("rt-hadocker"),
    )
    requireNoErrHeadscaleEnv(t, err)

    allClients, err := scenario.ListTailscaleClients()
    requireNoErrListClients(t, err)

    err = scenario.WaitForTailscaleSync()
    requireNoErrSync(t, err)

    headscale, err := scenario.Headscale()
    requireNoErrGetHeadscale(t, err)

    prefp, err := scenario.SubnetOfNetwork("usernet1")
    require.NoError(t, err)

    pref := *prefp

    usernet1, err := scenario.Network("usernet1")
    require.NoError(t, err)

    services, err := scenario.Services("usernet1")
    require.NoError(t, err)
    require.Len(t, services, 1)

    web := services[0]
    webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
    weburl := fmt.Sprintf("http://%s/etc/hostname", webip)

    sort.SliceStable(allClients, func(i, j int) bool {
        return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
    })

    subRouter1 := allClients[0]
    subRouter2 := allClients[1]
    client := allClients[2]

    for _, r := range []TailscaleClient{subRouter1, subRouter2} {
        _, _, err = r.Execute([]string{
            "tailscale", "set", "--advertise-routes=" + pref.String(),
        })
        require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
    }

    err = scenario.WaitForTailscaleSync()
    requireNoErrSync(t, err)

    var nodes []*v1.Node

    assert.EventuallyWithT(t, func(c *assert.CollectT) {
        nodes, err = headscale.ListNodes()
        assert.NoError(c, err)
        assert.Len(c, nodes, 4)
    }, propagationTime, 200*time.Millisecond, "nodes registered")

    _, err = headscale.ApproveRoutes(
        MustFindNode(subRouter1.Hostname(), nodes).GetId(),
        []netip.Prefix{pref},
    )
    require.NoError(t, err)

    _, err = headscale.ApproveRoutes(
        MustFindNode(subRouter2.Hostname(), nodes).GetId(),
        []netip.Prefix{pref},
    )
    require.NoError(t, err)

    nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
    nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())

    // requirePrimary blocks until headscale reports want as the
    // primary advertiser for pref.
    requirePrimary := func(want types.NodeID, msg string) {
        t.Helper()
        assert.EventuallyWithT(t, func(c *assert.CollectT) {
            pr, err := headscale.PrimaryRoutes()
            assert.NoError(c, err)
            assert.Equal(c, map[string]types.NodeID{
                pref.String(): want,
            }, pr.PrimaryRoutes, msg)
        }, propagationTime, 1*time.Second, msg)
    }

    // requireTrafficWorks asserts the client can reach the webservice
    // across the tailnet (i.e. via whichever router is primary).
    requireTrafficWorks := func(msg string) {
        t.Helper()
        assert.EventuallyWithT(t, func(c *assert.CollectT) {
            result, err := client.Curl(weburl)
            assert.NoError(c, err)
            assert.Len(c, result, 13)
        }, propagationTime, 1*time.Second, msg)
    }

    // requirePrimaryStable asserts primary == want for the entire
    // window. Catches transient flaps and verifies anti-flap on
    // prev-primary return.
    requirePrimaryStable := func(want types.NodeID, window time.Duration, msg string) {
        t.Helper()
        require.Never(t, func() bool {
            pr, err := headscale.PrimaryRoutes()
            if err != nil {
                return false
            }

            owner, ok := pr.PrimaryRoutes[pref.String()]

            return !ok || owner != want
        }, window, 1*time.Second, msg)
    }

    // ============================================================
    // Phase 1: initial state — r1 (lowest NodeID) is primary.
    // ============================================================
    t.Log("=== Phase 1: initial state — r1 should be primary. ===")
    requirePrimary(nodeID1, "phase 1: r1 primary at start")
    requireTrafficWorks("phase 1: client reaches webservice via r1")

    // ============================================================
    // Phase 2: r1 alone fails and returns. Failover to r2, traffic
    // resumes; reconnect r1 and verify traffic still flows. We do
    // not assert primary identity across the r1-return leg because
    // docker bridge reconfiguration can transiently fail probes on
    // r2 (real cable pulls do not have this side effect).
    // ============================================================
    t.Log("=== Phase 2a: cable-pull r1, expect failover to r2. ===")
    require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
        "phase 2a: docker disconnect r1")
    requirePrimary(nodeID2, "phase 2a: r2 promoted after r1 down")
    requireTrafficWorks("phase 2a: client reaches webservice via r2")

    t.Log("=== Phase 2b: reconnect r1, traffic should still flow. ===")
    require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
        "phase 2b: docker reconnect r1")
    requireTrafficWorks("phase 2b: client still reaches webservice")

    // ============================================================
    // Phase 3: r2 alone fails and returns. Same caveats as phase 2
    // on identity assertions during the return leg.
    // ============================================================
    t.Log("=== Phase 3a: cable-pull r2, traffic should fail over. ===")
    require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
        "phase 3a: docker disconnect r2")
    requireTrafficWorks("phase 3a: client reaches webservice via remaining router")

    t.Log("=== Phase 3b: reconnect r2, traffic should still flow. ===")
    require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
        "phase 3b: docker reconnect r2")
    requireTrafficWorks("phase 3b: client still reaches webservice")

    // ============================================================
    // Phase 4: sequential dual failure — the issue #3203 bug. The
    // flap target is r1 because under cable-pull both routers
    // linger as IsOnline=true (half-open TCP), both go Unhealthy,
    // and electPrimaryRoutes' all-unhealthy fallback selects the
    // lowest NodeID regardless of who was prev primary.
    // ============================================================
    t.Log("=== Phase 4a: cable-pull r1, expect failover to r2. ===")
    require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
        "phase 4a: docker disconnect r1")
    requirePrimary(nodeID2, "phase 4a: r2 promoted after r1 down")

    t.Log("=== Phase 4b: cable-pull r2, primary must NOT flap to offline r1. ===")
    require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
        "phase 4b: docker disconnect r2")
    requirePrimaryStable(nodeID2, flapWindow,
        "phase 4b: primary must not flap to offline r1 (issue #3203)")

    t.Log("=== Phase 4c: reconnect r2, r2 should resume as primary. ===")
    require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
        "phase 4c: docker reconnect r2")
    requirePrimary(nodeID2, "phase 4c: r2 primary after reconnect")
    requireTrafficWorks("phase 4c: client reaches webservice via r2 after recovery")

    t.Log("=== Phase 4d: reconnect r1, traffic should still flow. ===")
    require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
        "phase 4d: docker reconnect r1")
    requireTrafficWorks("phase 4d: client still reaches webservice")

    // ============================================================
    // Phase 5: simultaneous dual failure (whole-segment outage).
    // prev going in is r2 — primary must not flap to offline r1.
    // ============================================================
    t.Log("=== Phase 5a: cable-pull r1 and r2 simultaneously. ===")
    require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
        "phase 5a: docker disconnect r1")
    require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
        "phase 5a: docker disconnect r2")
    requirePrimaryStable(nodeID2, flapWindow,
        "phase 5a: primary must not flap to offline r1 (issue #3203)")

    t.Log("=== Phase 5b: reconnect both, r2 should remain primary. ===")
    require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
        "phase 5b: docker reconnect r1")
    require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
        "phase 5b: docker reconnect r2")
    requirePrimary(nodeID2, "phase 5b: r2 primary after both reconnect")
    requireTrafficWorks("phase 5b: client reaches webservice via r2")
}

@ -58,6 +58,8 @@ type TailscaleClient interface {
    ReadFile(path string) ([]byte, error)
    PacketFilter() ([]filter.Match, error)
    ConnectToNetwork(network *dockertest.Network) error
    DisconnectFromNetwork(network *dockertest.Network) error
    ReconnectToNetwork(network *dockertest.Network) error

    // FailingPeersAsString returns a formatted-ish multi-line-string of peers in the client
    // and a bool indicating if the clients online count and peer count is equal.

@ -807,6 +807,21 @@ func (t *TailscaleInContainer) Down() error {
    return nil
}

// DisconnectFromNetwork detaches the container from network at the
// docker daemon level. The container's network interface for that
// network disappears and any in-flight TCP connection is left
// half-open at the peer — the same failure mode a real cable pull
// produces, which iptables-based simulations cannot reproduce.
func (t *TailscaleInContainer) DisconnectFromNetwork(network *dockertest.Network) error {
    return dockertestutil.DisconnectContainerFromNetwork(t.pool, network, t.hostname)
}

// ReconnectToNetwork is the inverse of DisconnectFromNetwork: it
// re-attaches the container to network so traffic can flow again.
func (t *TailscaleInContainer) ReconnectToNetwork(network *dockertest.Network) error {
    return dockertestutil.ReconnectContainerToNetwork(t.pool, network, t.hostname)
}

// IPs returns the netip.Addr of the Tailscale instance.
func (t *TailscaleInContainer) IPs() ([]netip.Addr, error) {
    if len(t.ips) != 0 {