integration: retry transient docker network ops

Libnetwork endpoint cleanup is eventually consistent. A back-to-back
disconnect+connect on the same network can race the endpoint teardown
and fail with a transient "endpoint ... already exists" error. Wrap
the ConnectNetwork/DisconnectNetwork daemon calls in bounded
exponential backoff so TestHASubnetRouterFailoverDockerDisconnect no
longer flakes on the phase 4c reconnect.

Fixes #3234
Kristoffer Dalby 2026-04-30 08:18:50 +00:00
parent 3d5c0af4e7
commit 155e42f892
2 changed files with 83 additions and 11 deletions
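
For orientation before the diff: a minimal, self-contained sketch of the retry pattern this change introduces, using the same cenkalti/backoff/v5 calls as the new retryDockerOp helper. The connectOnce function and the attempt counter are illustrative stand-ins for pool.Client.ConnectNetwork and the libnetwork race, not code from this commit.

// Sketch only: connectOnce is a hypothetical stand-in for the flaky daemon
// call (pool.Client.ConnectNetwork / DisconnectNetwork in the real helper).
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v5"
)

func connectOnce(attempt *int) error {
	*attempt++
	if *attempt < 3 {
		// Transient libnetwork race: the old endpoint is still being torn down.
		return errors.New("endpoint with name foo already exists in network bar")
	}
	return nil
}

func main() {
	attempt := 0
	// Retry with exponential backoff, bounded to 30s of total elapsed time,
	// mirroring the retryDockerOp helper added in the diff below.
	_, err := backoff.Retry(context.Background(), func() (struct{}, error) {
		return struct{}{}, connectOnce(&attempt)
	}, backoff.WithBackOff(backoff.NewExponentialBackOff()),
		backoff.WithMaxElapsedTime(30*time.Second))
	fmt.Printf("attempts=%d err=%v\n", attempt, err)
}

The helper deliberately treats every error as retryable and relies on the 30-second elapsed-time cap, plus the caller's context (exercised by the second unit test), to bound the worst case instead of trying to classify Docker daemon errors.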


@@ -1,11 +1,14 @@
package dockertestutil

import (
	"context"
	"errors"
	"fmt"
	"log"
	"net"
	"time"

	"github.com/cenkalti/backoff/v5"
	"github.com/juanfont/headscale/hscontrol/util"
	"github.com/ory/dockertest/v3"
	"github.com/ory/dockertest/v3/docker"
@@ -13,6 +16,20 @@ import (
var ErrContainerNotFound = errors.New("container not found")

// retryDockerOp absorbs eventual-consistency races in libnetwork endpoint cleanup.
func retryDockerOp(ctx context.Context, op func() error) error {
	_, err := backoff.Retry(ctx, func() (struct{}, error) {
		err := op()
		if err != nil {
			return struct{}{}, err
		}
		return struct{}{}, nil
	}, backoff.WithBackOff(backoff.NewExponentialBackOff()), backoff.WithMaxElapsedTime(30*time.Second))
	return err
}

func GetFirstOrCreateNetwork(pool *dockertest.Pool, name string) (*dockertest.Network, error) {
	return GetFirstOrCreateNetworkWithSubnet(pool, name, "")
}
@@ -72,13 +89,6 @@ func AddContainerToNetwork(
		return err
	}

	err = pool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
		Container: containers[0].ID,
	})
	if err != nil {
		return err
	}

	// TODO(kradalby): This doesn't work reliably, but calling the exact same functions
	// seem to work fine...
	// if container, ok := pool.ContainerByName("/" + testContainer); ok {
@@ -88,7 +98,11 @@ func AddContainerToNetwork(
	// }
	// }

	return nil
	return retryDockerOp(context.Background(), func() error {
		return pool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
			Container: containers[0].ID,
		})
	})
}

// DisconnectContainerFromNetwork removes the container from network at
@@ -115,9 +129,11 @@ func DisconnectContainerFromNetwork(
		return fmt.Errorf("%w: %s", ErrContainerNotFound, testContainer)
	}

	return pool.Client.DisconnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
		Container: containers[0].ID,
		Force: true,
	return retryDockerOp(context.Background(), func() error {
		return pool.Client.DisconnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
			Container: containers[0].ID,
			Force:     true,
		})
	})
}


@@ -0,0 +1,56 @@
package dockertestutil

import (
	"context"
	"errors"
	"sync/atomic"
	"testing"
	"time"
)

var (
	errTransientEndpointExists = errors.New("endpoint with name foo already exists in network bar")
	errPermanent               = errors.New("permanent error")
)

func TestRetryDockerOp_RecoversFromTransient(t *testing.T) {
	var attempts atomic.Int32

	op := func() error {
		if attempts.Add(1) < 3 {
			return errTransientEndpointExists
		}
		return nil
	}

	err := retryDockerOp(context.Background(), op)
	if err != nil {
		t.Fatalf("retryDockerOp should recover from 2 transient errors, got: %v", err)
	}

	if got := attempts.Load(); got != 3 {
		t.Fatalf("expected 3 attempts, got %d", got)
	}
}

func TestRetryDockerOp_RespectsContextCancellation(t *testing.T) {
	op := func() error {
		return errPermanent
	}

	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()

	start := time.Now()
	err := retryDockerOp(ctx, op)
	elapsed := time.Since(start)

	if err == nil {
		t.Fatal("retryDockerOp should fail when op always errors")
	}

	if elapsed > 5*time.Second {
		t.Fatalf("retryDockerOp should honour ctx deadline (~200ms), took %s", elapsed)
	}
}