mirror of
https://github.com/siderolabs/talos.git
synced 2026-05-05 20:36:18 +02:00
fix(talosctl): ensure uncordon runs after reboot/upgrade errors
Use defer blocks and error joining to guarantee uncordon cleanup runs regardless of reboot/upgrade success or failure. This prevents nodes from staying cordoned when operations fail. Also adds gRPC keepalive parameters to prevent timeout issues during long operations.

Signed-off-by: Mateusz Urbanek <mateusz.urbanek@siderolabs.com>
(cherry picked from commit 3db14309e058cacc2ab8664944fc18f80a3bb747)
This commit is contained in:
parent
f9531d3529
commit
d34a61c8d1
@ -57,7 +57,7 @@ var rebootCmd = &cobra.Command{
|
||||
},
|
||||
}
|
||||
|
||||
func rebootRun(opts []client.RebootMode) error {
|
||||
func rebootRun(opts []client.RebootMode) (retErr error) {
|
||||
rep := reporter.New(
|
||||
reporter.WithOutputMode(rebootCmdFlags.progress.Value()),
|
||||
)
|
||||
@ -78,13 +78,15 @@ func rebootRun(opts []client.RebootMode) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...); err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() {
|
||||
if uncordonErr := WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
|
||||
return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
|
||||
}); uncordonErr != nil {
|
||||
retErr = errors.Join(retErr, uncordonErr)
|
||||
}
|
||||
}()
|
||||
|
||||
return WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
|
||||
return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
|
||||
})
|
||||
return rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...)
|
||||
}
|
||||
|
||||
func rebootInternal(wait, debug bool, timeout time.Duration, rep *reporter.Reporter, opts ...client.RebootMode) error {
|
||||
|
||||
@ -88,7 +88,7 @@ var talosUpgradeAPIVersionRange = semver.MustParseRange(">1.13.0-alpha.2 <2.0.0"
|
||||
// If the server returns codes.Unimplemented, it falls back to the legacy MachineService.Upgrade.
|
||||
//
|
||||
//nolint:gocyclo
|
||||
func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error {
|
||||
func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) (retErr error) {
|
||||
if upgradeCmdFlags.debug {
|
||||
upgradeCmdFlags.wait = true
|
||||
}
|
||||
@ -147,18 +147,23 @@ func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []s
|
||||
}
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if !upgradeCmdFlags.drain {
|
||||
return
|
||||
}
|
||||
|
||||
if len(nodeNames) > 0 {
|
||||
if uncordonErr := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); uncordonErr != nil {
|
||||
retErr = errors.Join(retErr, uncordonErr)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
err = rebootInternal(upgradeCmdFlags.wait, upgradeCmdFlags.debug, upgradeCmdFlags.timeout, rep, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during upgrade: %w", err)
|
||||
}
|
||||
|
||||
// Phase 3: uncordon.
|
||||
if upgradeCmdFlags.drain && len(nodeNames) > 0 {
|
||||
if err := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -161,7 +161,7 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error {
|
||||
// handle retryable errors
|
||||
|
||||
statusCode := client.StatusCode(err)
|
||||
if errors.Is(err, io.EOF) || statusCode == codes.Unavailable {
|
||||
if errors.Is(err, io.EOF) || statusCode == codes.Unavailable || statusCode == codes.Canceled {
|
||||
a.update(reporter.Update{
|
||||
Message: "unavailable, retrying...",
|
||||
Status: reporter.StatusError,
|
||||
|
||||
@ -23,6 +23,7 @@ import (
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/backoff"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/keepalive"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
"github.com/siderolabs/talos/cmd/talosctl/cmd/common"
|
||||
@ -238,6 +239,9 @@ func (a *Tracker) Run() error {
|
||||
// disable grpc backoff
|
||||
Backoff: backoff.Config{},
|
||||
MinConnectTimeout: 20 * time.Second,
|
||||
}), grpc.WithKeepaliveParams(keepalive.ClientParameters{
|
||||
Time: 10 * time.Second,
|
||||
Timeout: 5 * time.Second,
|
||||
}))
|
||||
if errors.Is(err, context.Canceled) {
|
||||
err = nil
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user