fix(talosctl): ensure uncordon runs after reboot/upgrade errors

Use defer blocks and errors.Join to guarantee that the uncordon cleanup
runs regardless of whether the reboot/upgrade succeeds or fails. This
prevents nodes from staying cordoned when an operation errors out.

Also add gRPC client keepalive parameters to prevent connection
timeouts during long-running operations.

Signed-off-by: Mateusz Urbanek <mateusz.urbanek@siderolabs.com>
(cherry picked from commit 3db14309e058cacc2ab8664944fc18f80a3bb747)
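
Note on the pattern: the fix relies on a named return value (retErr) so that a deferred closure can both observe and amend the function's result, with errors.Join combining the operation error and the cleanup error. A minimal standalone sketch of the mechanism (names here are illustrative, not from this diff):

package main

import (
	"errors"
	"fmt"
)

func cleanup() error {
	return errors.New("cleanup failed")
}

// run always executes cleanup via defer; if both the operation and the
// cleanup fail, errors.Join returns both errors to the caller.
func run() (retErr error) {
	defer func() {
		if cleanupErr := cleanup(); cleanupErr != nil {
			retErr = errors.Join(retErr, cleanupErr)
		}
	}()

	return errors.New("operation failed")
}

func main() {
	fmt.Println(run()) // prints "operation failed" and "cleanup failed" on separate lines
}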
4 changed files with 27 additions and 16 deletions


@@ -57,7 +57,7 @@ var rebootCmd = &cobra.Command{
 	},
 }
 
-func rebootRun(opts []client.RebootMode) error {
+func rebootRun(opts []client.RebootMode) (retErr error) {
 	rep := reporter.New(
 		reporter.WithOutputMode(rebootCmdFlags.progress.Value()),
 	)
@@ -78,13 +78,15 @@ func rebootRun(opts []client.RebootMode) error {
 		return err
 	}
 
-	if err := rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...); err != nil {
-		return err
-	}
+	defer func() {
+		if uncordonErr := WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
+			return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
+		}); uncordonErr != nil {
+			retErr = errors.Join(retErr, uncordonErr)
+		}
+	}()
 
-	return WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
-		return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
-	})
+	return rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...)
 }
 
 func rebootInternal(wait, debug bool, timeout time.Duration, rep *reporter.Reporter, opts ...client.RebootMode) error {
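
Worth noting: the deferred uncordon above re-enters WithClientAndNodes rather than reusing a context that the failed reboot may already have canceled. A related standard-library technique (a sketch of an alternative, not what this diff does) is detaching the cleanup context with context.WithoutCancel (Go 1.21+):

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
	defer cancel()

	<-ctx.Done() // the main operation's context has expired

	// A detached context survives the parent's cancellation, so deferred
	// cleanup can still make API calls; add a fresh deadline if needed.
	cleanupCtx := context.WithoutCancel(ctx)
	fmt.Println(ctx.Err(), cleanupCtx.Err()) // context deadline exceeded <nil>
}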


@@ -88,7 +88,7 @@ var talosUpgradeAPIVersionRange = semver.MustParseRange(">1.13.0-alpha.2 <2.0.0")
 // If the server returns codes.Unimplemented, it falls back to the legacy MachineService.Upgrade.
 //
 //nolint:gocyclo
-func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error {
+func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) (retErr error) {
 	if upgradeCmdFlags.debug {
 		upgradeCmdFlags.wait = true
 	}
@@ -147,18 +147,23 @@ func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error {
 		}
 	}
 
+	defer func() {
+		if !upgradeCmdFlags.drain {
+			return
+		}
+
+		if len(nodeNames) > 0 {
+			if uncordonErr := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); uncordonErr != nil {
+				retErr = errors.Join(retErr, uncordonErr)
+			}
+		}
+	}()
+
 	err = rebootInternal(upgradeCmdFlags.wait, upgradeCmdFlags.debug, upgradeCmdFlags.timeout, rep, opts...)
 	if err != nil {
 		return fmt.Errorf("error during upgrade: %w", err)
 	}
 
-	// Phase 3: uncordon.
-	if upgradeCmdFlags.drain && len(nodeNames) > 0 {
-		if err := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
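
Because the deferred block joins the uncordon error onto whatever the upgrade already returned, callers matching on a specific error are unaffected: errors.Is and errors.As traverse every error combined by errors.Join. A quick sketch with hypothetical sentinel errors:

package main

import (
	"errors"
	"fmt"
)

var (
	errUpgrade  = errors.New("upgrade failed")
	errUncordon = errors.New("uncordon failed")
)

func main() {
	err := errors.Join(errUpgrade, errUncordon)

	// Both original errors stay matchable through the joined error.
	fmt.Println(errors.Is(err, errUpgrade))  // true
	fmt.Println(errors.Is(err, errUncordon)) // true
}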


@@ -161,7 +161,7 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error {
 			// handle retryable errors
 			statusCode := client.StatusCode(err)
 
-			if errors.Is(err, io.EOF) || statusCode == codes.Unavailable {
+			if errors.Is(err, io.EOF) || statusCode == codes.Unavailable || statusCode == codes.Canceled {
 				a.update(reporter.Update{
 					Message: "unavailable, retrying...",
 					Status:  reporter.StatusError,
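
The tracker now treats codes.Canceled as retryable alongside io.EOF and codes.Unavailable, which matters once keepalive probes can tear down a dead connection mid-stream. client.StatusCode is a Talos helper; with plain gRPC, an equivalent classification might look like this (a sketch, not the project's helper):

package main

import (
	"errors"
	"fmt"
	"io"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// retryable reports whether an event-stream error is transient: the
// stream ended, the server was unreachable, or the RPC was canceled.
func retryable(err error) bool {
	if errors.Is(err, io.EOF) {
		return true
	}

	switch status.Code(err) {
	case codes.Unavailable, codes.Canceled:
		return true
	default:
		return false
	}
}

func main() {
	fmt.Println(retryable(io.EOF))                               // true
	fmt.Println(retryable(status.Error(codes.Canceled, "gone"))) // true
	fmt.Println(retryable(errors.New("fatal")))                  // false
}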


@@ -23,6 +23,7 @@ import (
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/backoff"
 	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/keepalive"
 	"google.golang.org/grpc/status"
 
 	"github.com/siderolabs/talos/cmd/talosctl/cmd/common"
@@ -238,6 +239,9 @@ func (a *Tracker) Run() error {
 		// disable grpc backoff
 		Backoff:           backoff.Config{},
 		MinConnectTimeout: 20 * time.Second,
-	}))
+	}), grpc.WithKeepaliveParams(keepalive.ClientParameters{
+		Time:    10 * time.Second,
+		Timeout: 5 * time.Second,
+	}))
 	if errors.Is(err, context.Canceled) {
 		err = nil
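
With these parameters the client pings an idle connection every 10 seconds and declares it dead if no acknowledgement arrives within 5, so a hung connection fails fast (and lands in the retry path above) instead of stalling a long reboot or upgrade. In isolation the option looks like this (target address and insecure credentials are placeholders for brevity):

package main

import (
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
)

func main() {
	conn, err := grpc.NewClient("node.example:50000", // hypothetical target
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time:    10 * time.Second, // ping after 10s of inactivity
			Timeout: 5 * time.Second,  // fail if no ack within 5s
		}))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close() //nolint:errcheck
}

One caveat: gRPC servers enforce a minimum keepalive interval and may close connections from clients that ping too aggressively, so Time should not be set below what the server's enforcement policy permits.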