From 155e42f892f28d1c9f17e1c0c3209a4f74fb49dc Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby
Date: Thu, 30 Apr 2026 08:18:50 +0000
Subject: [PATCH] integration: retry transient docker network ops

Libnetwork endpoint cleanup is eventually consistent: a back-to-back
disconnect+connect on the same network can race endpoint teardown and
fail with a transient "endpoint ... already exists" error. Wrap the
ConnectNetwork and DisconnectNetwork daemon calls in exponential
backoff bounded at 30 seconds so
TestHASubnetRouterFailoverDockerDisconnect no longer flakes on the
phase 4c reconnect.

Fixes #3234
---
 integration/dockertestutil/network.go      | 38 ++++++++++-----
 integration/dockertestutil/network_test.go | 56 ++++++++++++++++++++++
 2 files changed, 83 insertions(+), 11 deletions(-)
 create mode 100644 integration/dockertestutil/network_test.go

diff --git a/integration/dockertestutil/network.go b/integration/dockertestutil/network.go
index d224c43c..fe03fcfc 100644
--- a/integration/dockertestutil/network.go
+++ b/integration/dockertestutil/network.go
@@ -1,11 +1,14 @@
 package dockertestutil
 
 import (
+	"context"
 	"errors"
 	"fmt"
 	"log"
 	"net"
+	"time"
 
+	"github.com/cenkalti/backoff/v5"
 	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/ory/dockertest/v3"
 	"github.com/ory/dockertest/v3/docker"
@@ -13,6 +16,20 @@ import (
 
 var ErrContainerNotFound = errors.New("container not found")
 
+// retryDockerOp absorbs eventual-consistency races in libnetwork endpoint cleanup.
+func retryDockerOp(ctx context.Context, op func() error) error {
+	_, err := backoff.Retry(ctx, func() (struct{}, error) {
+		err := op()
+		if err != nil {
+			return struct{}{}, err
+		}
+
+		return struct{}{}, nil
+	}, backoff.WithBackOff(backoff.NewExponentialBackOff()), backoff.WithMaxElapsedTime(30*time.Second))
+
+	return err
+}
+
 func GetFirstOrCreateNetwork(pool *dockertest.Pool, name string) (*dockertest.Network, error) {
 	return GetFirstOrCreateNetworkWithSubnet(pool, name, "")
 }
@@ -72,13 +89,6 @@ func AddContainerToNetwork(
 		return err
 	}
 
-	err = pool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
-		Container: containers[0].ID,
-	})
-	if err != nil {
-		return err
-	}
-
 	// TODO(kradalby): This doesn't work reliably, but calling the exact same functions
 	// seem to work fine...
 	// if container, ok := pool.ContainerByName("/" + testContainer); ok {
@@ -88,7 +98,11 @@ func AddContainerToNetwork(
 	// 	}
 	// }
 
-	return nil
+	return retryDockerOp(context.Background(), func() error {
+		return pool.Client.ConnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
+			Container: containers[0].ID,
+		})
+	})
 }
 
 // DisconnectContainerFromNetwork removes the container from network at
@@ -115,9 +129,11 @@
 		return fmt.Errorf("%w: %s", ErrContainerNotFound, testContainer)
 	}
 
-	return pool.Client.DisconnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
-		Container: containers[0].ID,
-		Force:     true,
+	return retryDockerOp(context.Background(), func() error {
+		return pool.Client.DisconnectNetwork(network.Network.ID, docker.NetworkConnectionOptions{
+			Container: containers[0].ID,
+			Force:     true,
+		})
 	})
 }
 
diff --git a/integration/dockertestutil/network_test.go b/integration/dockertestutil/network_test.go
new file mode 100644
index 00000000..3f041329
--- /dev/null
+++ b/integration/dockertestutil/network_test.go
@@ -0,0 +1,56 @@
+package dockertestutil
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+var (
+	errTransientEndpointExists = errors.New("endpoint with name foo already exists in network bar")
+	errAlwaysFails             = errors.New("always failing error")
+)
+
+func TestRetryDockerOp_RecoversFromTransient(t *testing.T) {
+	var attempts atomic.Int32
+
+	op := func() error {
+		if attempts.Add(1) < 3 {
+			return errTransientEndpointExists
+		}
+
+		return nil
+	}
+
+	err := retryDockerOp(context.Background(), op)
+	if err != nil {
+		t.Fatalf("retryDockerOp should recover from 2 transient errors, got: %v", err)
+	}
+
+	if got := attempts.Load(); got != 3 {
+		t.Fatalf("expected 3 attempts, got %d", got)
+	}
+}
+
+func TestRetryDockerOp_RespectsContextDeadline(t *testing.T) {
+	op := func() error {
+		return errAlwaysFails
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+	defer cancel()
+
+	start := time.Now()
+	err := retryDockerOp(ctx, op)
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatal("retryDockerOp should fail when op always errors")
+	}
+
+	if elapsed > 5*time.Second {
+		t.Fatalf("retryDockerOp should honour ctx deadline (~200ms), took %s", elapsed)
+	}
+}
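
For reviewers: a minimal sketch of the disconnect+reconnect sequence this
patch hardens, assuming the helpers keep their existing (pool, network,
container-name) signatures. The reconnectRouter wrapper and the "ts-router"
container name are hypothetical and not part of this patch:

    package integration

    import (
    	"testing"

    	"github.com/juanfont/headscale/integration/dockertestutil"
    	"github.com/ory/dockertest/v3"
    )

    // reconnectRouter mirrors the phase 4c disconnect+reconnect that used
    // to flake. pool and network come from the surrounding test harness;
    // "ts-router" is a placeholder container name.
    func reconnectRouter(t *testing.T, pool *dockertest.Pool, network *dockertest.Network) {
    	t.Helper()

    	if err := dockertestutil.DisconnectContainerFromNetwork(pool, network, "ts-router"); err != nil {
    		t.Fatalf("disconnect: %v", err)
    	}

    	// An immediate reconnect used to race libnetwork's endpoint
    	// teardown; retryDockerOp now retries the ConnectNetwork call
    	// with exponential backoff for up to 30s before giving up.
    	if err := dockertestutil.AddContainerToNetwork(pool, network, "ts-router"); err != nil {
    		t.Fatalf("reconnect: %v", err)
    	}
    }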