From d34a61c8d1fd837477c0a8d42b02ce48dde9f971 Mon Sep 17 00:00:00 2001 From: Mateusz Urbanek Date: Thu, 23 Apr 2026 10:27:03 +0200 Subject: [PATCH] fix(talosctl): ensure uncordon runs after reboot/upgrade errors Use defer blocks and error joining to guarantee uncordon cleanup runs regardless of reboot/upgrade success or failure. Prevents nodes from staying cordoned when operations fail. Also added gRPC keepalive params to prevent timeout issues during long operations. Signed-off-by: Mateusz Urbanek (cherry picked from commit 3db14309e058cacc2ab8664944fc18f80a3bb747) --- cmd/talosctl/cmd/talos/reboot.go | 16 +++++++++------- cmd/talosctl/cmd/talos/upgrade.go | 21 +++++++++++++-------- cmd/talosctl/pkg/talos/action/node.go | 2 +- cmd/talosctl/pkg/talos/action/tracker.go | 4 ++++ 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/cmd/talosctl/cmd/talos/reboot.go b/cmd/talosctl/cmd/talos/reboot.go index 5eaa206b0..2192b5f94 100644 --- a/cmd/talosctl/cmd/talos/reboot.go +++ b/cmd/talosctl/cmd/talos/reboot.go @@ -57,7 +57,7 @@ var rebootCmd = &cobra.Command{ }, } -func rebootRun(opts []client.RebootMode) error { +func rebootRun(opts []client.RebootMode) (retErr error) { rep := reporter.New( reporter.WithOutputMode(rebootCmdFlags.progress.Value()), ) @@ -78,13 +78,15 @@ func rebootRun(opts []client.RebootMode) error { return err } - if err := rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...); err != nil { - return err - } + defer func() { + if uncordonErr := WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error { + return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep) + }); uncordonErr != nil { + retErr = errors.Join(retErr, uncordonErr) + } + }() - return WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error { - return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep) - }) + return rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...) } func rebootInternal(wait, debug bool, timeout time.Duration, rep *reporter.Reporter, opts ...client.RebootMode) error { diff --git a/cmd/talosctl/cmd/talos/upgrade.go b/cmd/talosctl/cmd/talos/upgrade.go index a8318d754..6e67eaf3e 100644 --- a/cmd/talosctl/cmd/talos/upgrade.go +++ b/cmd/talosctl/cmd/talos/upgrade.go @@ -88,7 +88,7 @@ var talosUpgradeAPIVersionRange = semver.MustParseRange(">1.13.0-alpha.2 <2.0.0" // If the server returns codes.Unimplemented, it falls back to the legacy MachineService.Upgrade. // //nolint:gocyclo -func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error { +func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) (retErr error) { if upgradeCmdFlags.debug { upgradeCmdFlags.wait = true } @@ -147,18 +147,23 @@ func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []s } } + defer func() { + if !upgradeCmdFlags.drain { + return + } + + if len(nodeNames) > 0 { + if uncordonErr := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); uncordonErr != nil { + retErr = errors.Join(retErr, uncordonErr) + } + } + }() + err = rebootInternal(upgradeCmdFlags.wait, upgradeCmdFlags.debug, upgradeCmdFlags.timeout, rep, opts...) if err != nil { return fmt.Errorf("error during upgrade: %w", err) } - // Phase 3: uncordon. - if upgradeCmdFlags.drain && len(nodeNames) > 0 { - if err := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); err != nil { - return err - } - } - return nil } diff --git a/cmd/talosctl/pkg/talos/action/node.go b/cmd/talosctl/pkg/talos/action/node.go index da504ee25..29e625eee 100644 --- a/cmd/talosctl/pkg/talos/action/node.go +++ b/cmd/talosctl/pkg/talos/action/node.go @@ -161,7 +161,7 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error { // handle retryable errors statusCode := client.StatusCode(err) - if errors.Is(err, io.EOF) || statusCode == codes.Unavailable { + if errors.Is(err, io.EOF) || statusCode == codes.Unavailable || statusCode == codes.Canceled { a.update(reporter.Update{ Message: "unavailable, retrying...", Status: reporter.StatusError, diff --git a/cmd/talosctl/pkg/talos/action/tracker.go b/cmd/talosctl/pkg/talos/action/tracker.go index c459589be..6d9e4dfd3 100644 --- a/cmd/talosctl/pkg/talos/action/tracker.go +++ b/cmd/talosctl/pkg/talos/action/tracker.go @@ -23,6 +23,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/backoff" "google.golang.org/grpc/codes" + "google.golang.org/grpc/keepalive" "google.golang.org/grpc/status" "github.com/siderolabs/talos/cmd/talosctl/cmd/common" @@ -238,6 +239,9 @@ func (a *Tracker) Run() error { // disable grpc backoff Backoff: backoff.Config{}, MinConnectTimeout: 20 * time.Second, + }), grpc.WithKeepaliveParams(keepalive.ClientParameters{ + Time: 10 * time.Second, + Timeout: 5 * time.Second, })) if errors.Is(err, context.Canceled) { err = nil