talos/pkg/cluster/bootstrap.go
Andrey Smirnov 8ebaa60b71 fix: retry connection refused errors while bootstrapping a cluster
This fixes a random failure seen at least in the tests. As the nodes are
booting, one node might boot earlier than the others. As the client is using
all control plane endpoints with load-balancing, the check for apid might
succeed via one node, but the next request might hit a different endpoint
which still has a cached connection error, and we should retry that.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2020-10-28 08:32:58 -07:00

81 lines
1.9 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package cluster
import (
"context"
"fmt"
"io"
"sort"
"strings"
"time"
"github.com/talos-systems/go-retry/retry"
"google.golang.org/grpc/backoff"
"github.com/talos-systems/talos/pkg/machinery/client"
"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)
// APIBoostrapper bootstraps cluster via Talos API.
//
// Both fields are embedded, so their methods are promoted onto
// APIBoostrapper; Bootstrap calls s.Client() and s.NodesByType() through them.
//
// NOTE(review): the identifier is spelled "Boostrapper" (missing a "t"); it is
// exported, so renaming it would change the public API.
type APIBoostrapper struct {
// ClientProvider presumably supplies the Client method used to obtain the
// Talos API client — confirm against its declaration.
ClientProvider
// Info presumably supplies NodesByType for looking up cluster nodes by
// machine type — confirm against its declaration.
Info
}
// Bootstrap the cluster via the API.
//
// It selects the lexicographically first control plane node, waits (for up to
// five minutes) until a Version request to that node succeeds, and then sends
// the Bootstrap request to the same node. Progress messages are written to
// out.
//
// While waiting for the API, every Version error is retried unless ctx itself
// is already done, in which case retrying stops immediately instead of
// spinning until the five minute budget is exhausted. While bootstrapping,
// only errors whose text contains "connection refused" are retried (for up to
// the gRPC default maximum backoff delay); any other error aborts at once.
//
// It returns an error if the client cannot be obtained, if there are no
// control plane nodes, or if either retry loop fails.
//
// Bootstrap implements Bootstrapper interface.
func (s *APIBoostrapper) Bootstrap(ctx context.Context, out io.Writer) error {
	cli, err := s.Client()
	if err != nil {
		return err
	}

	controlPlaneNodes := s.NodesByType(machine.TypeControlPlane)
	if len(controlPlaneNodes) == 0 {
		return fmt.Errorf("no control plane nodes to bootstrap")
	}

	// Sort a copy: the slice returned by NodesByType may be shared with the
	// provider, and sorting it in place would silently reorder its data.
	sortedNodes := append([]string(nil), controlPlaneNodes...)
	sort.Strings(sortedNodes)

	// All requests below are pinned to the first node in sorted order.
	nodeCtx := client.WithNodes(ctx, sortedNodes[0])

	fmt.Fprintln(out, "waiting for API")

	err = retry.Constant(5*time.Minute, retry.WithUnits(500*time.Millisecond)).Retry(func() error {
		// Each attempt gets its own short deadline so one hung request does
		// not consume the whole retry budget.
		retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
		defer cancel()

		if _, versionErr := cli.Version(retryCtx); versionErr != nil {
			// If the caller's context is cancelled or expired, no further
			// attempt can succeed; stop retrying instead of spinning.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return retry.UnexpectedError(ctxErr)
			}

			return retry.ExpectedError(versionErr)
		}

		return nil
	})
	if err != nil {
		return err
	}

	fmt.Fprintln(out, "bootstrapping cluster")

	return retry.Constant(backoff.DefaultConfig.MaxDelay, retry.WithUnits(100*time.Millisecond)).Retry(func() error {
		retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
		defer cancel()

		if bootstrapErr := cli.Bootstrap(retryCtx); bootstrapErr != nil {
			// Only a refused connection is treated as transient here; every
			// other failure is reported to the caller right away.
			if strings.Contains(bootstrapErr.Error(), "connection refused") {
				return retry.ExpectedError(bootstrapErr)
			}

			return retry.UnexpectedError(bootstrapErr)
		}

		return nil
	})
}