talos/pkg/cluster/bootstrap.go
Andrey Smirnov b7d70cf625
feat: unify maintenance and regular APIs
Drop maintenance service and all the code supporting it directly.

Instead, move all network API termination into the `apid` service, which
can now work in more modes to support maintenance operations as
well.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
2026-03-17 17:00:35 +04:00

114 lines
3.7 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package cluster
import (
"context"
"errors"
"fmt"
"io"
"slices"
"strings"
"time"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/siderolabs/go-retry/retry"
"google.golang.org/grpc/backoff"
"google.golang.org/grpc/codes"
machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
"github.com/siderolabs/talos/pkg/machinery/client"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)
// APIBootstrapper bootstraps cluster via Talos API.
type APIBootstrapper struct {
// ClientProvider supplies the Talos API client used to reach the nodes (see Client() call in Bootstrap).
ClientProvider
// Info provides cluster topology lookup; Bootstrap uses NodesByType to find control plane nodes.
Info
}
// Bootstrap the cluster via the API.
//
// Bootstrap implements Bootstrapper interface.
//
// Flow: pick the control plane node with the lexicographically smallest first
// IP (deterministic choice across runs), wait up to 10 minutes for its Talos
// API to answer and for the machine to reach the Booting or Running stage,
// then issue the Bootstrap call, retrying transient transport/gRPC errors.
//
//nolint:gocyclo
func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error {
	cli, err := s.Client()
	if err != nil {
		return err
	}

	controlPlaneNodes := s.NodesByType(machine.TypeControlPlane)
	if len(controlPlaneNodes) == 0 {
		return errors.New("no control plane nodes to bootstrap")
	}

	// Sort by the first IP so the same node is always chosen for bootstrap.
	slices.SortFunc(controlPlaneNodes, func(a, b NodeInfo) int {
		return strings.Compare(a.IPs[0].String(), b.IPs[0].String())
	})

	nodeIP := controlPlaneNodes[0].IPs[0]
	nodeCtx := client.WithNode(ctx, nodeIP.String())

	fmt.Fprintln(out, "waiting for Talos API (to bootstrap the cluster)")

	err = retry.Constant(10*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
		retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
		defer cancel()

		// Errors are scoped to this closure with := (the original mutated the
		// captured outer err), so retries never leak state across attempts.
		if _, err := cli.Version(retryCtx); err != nil {
			return retry.ExpectedError(err)
		}

		machineStatus, err := safe.ReaderGetByID[*runtime.MachineStatus](retryCtx, cli.COSI, runtime.MachineStatusID)
		if err != nil {
			return retry.ExpectedError(err)
		}

		switch machineStatus.TypedSpec().Stage {
		case runtime.MachineStageBooting, runtime.MachineStageRunning:
			return nil
		case runtime.MachineStageUnknown, runtime.MachineStageMaintenance, runtime.MachineStageInstalling,
			runtime.MachineStageRebooting, runtime.MachineStageShuttingDown, runtime.MachineStageResetting, runtime.MachineStageUpgrading:
			return retry.ExpectedError(fmt.Errorf("machine in unexpected stage %s", machineStatus.TypedSpec().Stage))
		}

		// Unrecognized (future) stage: treat as ready, preserving original behavior.
		return nil
	})
	if err != nil {
		return err
	}

	fmt.Fprintln(out, "bootstrapping cluster")

	return retry.Constant(backoff.DefaultConfig.MaxDelay, retry.WithUnits(100*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
		retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
		defer cancel()

		if err := cli.Bootstrap(retryCtx, &machineapi.BootstrapRequest{}); err != nil {
			switch {
			// deadline exceeded in case it's verbatim context error
			case errors.Is(err, context.DeadlineExceeded):
				return retry.ExpectedError(err)
			// FailedPrecondition when time is not in sync yet on the server
			// DeadlineExceeded when the call fails in the gRPC stack either on the server or client side
			// Canceled is when apid restarts on transitioning maintenance -> ready
			case client.StatusCode(err) == codes.FailedPrecondition || client.StatusCode(err) == codes.DeadlineExceeded || client.StatusCode(err) == codes.Canceled:
				return retry.ExpectedError(err)
			// connection refused, including proxied connection refused via the endpoint to the node
			case strings.Contains(err.Error(), "connection refused"):
				return retry.ExpectedError(err)
			// connection timeout
			case strings.Contains(err.Error(), "error reading from server: EOF"):
				return retry.ExpectedError(err)
			}

			return err
		}

		return nil
	})
}