fix(talosctl): ensure uncordon runs after reboot/upgrade errors

Use defer blocks and errors.Join to guarantee that the uncordon cleanup
runs regardless of whether the reboot/upgrade succeeds or fails. This
prevents nodes from staying cordoned when an operation errors out.

Also add gRPC client keepalive parameters to prevent connection
timeouts during long-running operations.

Signed-off-by: Mateusz Urbanek <mateusz.urbanek@siderolabs.com>
(cherry picked from commit 3db14309e058cacc2ab8664944fc18f80a3bb747)
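
Note on the pattern: the fix relies on a named return value (retErr) so that a deferred closure can both observe and amend the function's result, with errors.Join combining the operation error and the cleanup error. A minimal standalone sketch of the mechanism (names here are illustrative, not from this diff):

package main

import (
	"errors"
	"fmt"
)

func cleanup() error {
	return errors.New("cleanup failed")
}

// run always executes cleanup via defer; if both the operation and the
// cleanup fail, errors.Join returns both errors to the caller.
func run() (retErr error) {
	defer func() {
		if cleanupErr := cleanup(); cleanupErr != nil {
			retErr = errors.Join(retErr, cleanupErr)
		}
	}()

	return errors.New("operation failed")
}

func main() {
	fmt.Println(run()) // prints "operation failed" and "cleanup failed" on separate lines
}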
4 changed files with 27 additions and 16 deletions


@@ -57,7 +57,7 @@ var rebootCmd = &cobra.Command{
 	},
 }
 
-func rebootRun(opts []client.RebootMode) error {
+func rebootRun(opts []client.RebootMode) (retErr error) {
 	rep := reporter.New(
 		reporter.WithOutputMode(rebootCmdFlags.progress.Value()),
 	)
@@ -78,13 +78,15 @@ func rebootRun(opts []client.RebootMode) error {
 		return err
 	}
 
-	if err := rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...); err != nil {
-		return err
-	}
+	defer func() {
+		if uncordonErr := WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
+			return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
+		}); uncordonErr != nil {
+			retErr = errors.Join(retErr, uncordonErr)
+		}
+	}()
 
-	return WithClientAndNodes(func(ctx context.Context, c *client.Client, _ []string) error {
-		return uncordonNodes(ctx, c, nodeNames, rebootCmdFlags.timeout, rep)
-	})
+	return rebootInternal(rebootCmdFlags.wait, rebootCmdFlags.debug, rebootCmdFlags.timeout, rep, opts...)
 }
 
 func rebootInternal(wait, debug bool, timeout time.Duration, rep *reporter.Reporter, opts ...client.RebootMode) error {
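
Worth noting: the deferred uncordon above re-enters WithClientAndNodes rather than reusing a context that the failed reboot may already have canceled. A related standard-library technique (a sketch of an alternative, not what this diff does) is detaching the cleanup context with context.WithoutCancel (Go 1.21+):

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
	defer cancel()

	<-ctx.Done() // the main operation's context has expired

	// A detached context survives the parent's cancellation, so deferred
	// cleanup can still make API calls; add a fresh deadline if needed.
	cleanupCtx := context.WithoutCancel(ctx)
	fmt.Println(ctx.Err(), cleanupCtx.Err()) // context deadline exceeded <nil>
}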


@@ -88,7 +88,7 @@ var talosUpgradeAPIVersionRange = semver.MustParseRange(">1.13.0-alpha.2 <2.0.0")
 // If the server returns codes.Unimplemented, it falls back to the legacy MachineService.Upgrade.
 //
 //nolint:gocyclo
-func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error {
+func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) (retErr error) {
 	if upgradeCmdFlags.debug {
 		upgradeCmdFlags.wait = true
 	}
@@ -147,18 +147,23 @@ func upgradeViaLifecycleService(ctx context.Context, c *client.Client, nodes []string) error {
 		}
 	}
 
+	defer func() {
+		if !upgradeCmdFlags.drain {
+			return
+		}
+
+		if len(nodeNames) > 0 {
+			if uncordonErr := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); uncordonErr != nil {
+				retErr = errors.Join(retErr, uncordonErr)
+			}
+		}
+	}()
+
 	err = rebootInternal(upgradeCmdFlags.wait, upgradeCmdFlags.debug, upgradeCmdFlags.timeout, rep, opts...)
 	if err != nil {
 		return fmt.Errorf("error during upgrade: %w", err)
 	}
 
-	// Phase 3: uncordon.
-	if upgradeCmdFlags.drain && len(nodeNames) > 0 {
-		if err := uncordonNodes(ctx, c, nodeNames, upgradeCmdFlags.timeout, rep); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
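
Because the deferred block joins the uncordon error onto whatever the upgrade already returned, callers matching on a specific error are unaffected: errors.Is and errors.As traverse every error combined by errors.Join. A quick sketch with hypothetical sentinel errors:

package main

import (
	"errors"
	"fmt"
)

var (
	errUpgrade  = errors.New("upgrade failed")
	errUncordon = errors.New("uncordon failed")
)

func main() {
	err := errors.Join(errUpgrade, errUncordon)

	// Both original errors stay matchable through the joined error.
	fmt.Println(errors.Is(err, errUpgrade))  // true
	fmt.Println(errors.Is(err, errUncordon)) // true
}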


@@ -161,7 +161,7 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error {
 			// handle retryable errors
 			statusCode := client.StatusCode(err)
 
-			if errors.Is(err, io.EOF) || statusCode == codes.Unavailable {
+			if errors.Is(err, io.EOF) || statusCode == codes.Unavailable || statusCode == codes.Canceled {
 				a.update(reporter.Update{
 					Message: "unavailable, retrying...",
 					Status:  reporter.StatusError,
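
The tracker now treats codes.Canceled as retryable alongside io.EOF and codes.Unavailable, which matters once keepalive probes can tear down a dead connection mid-stream. client.StatusCode is a Talos helper; with plain gRPC, an equivalent classification might look like this (a sketch, not the project's helper):

package main

import (
	"errors"
	"fmt"
	"io"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// retryable reports whether an event-stream error is transient: the
// stream ended, the server was unreachable, or the RPC was canceled.
func retryable(err error) bool {
	if errors.Is(err, io.EOF) {
		return true
	}

	switch status.Code(err) {
	case codes.Unavailable, codes.Canceled:
		return true
	default:
		return false
	}
}

func main() {
	fmt.Println(retryable(io.EOF))                               // true
	fmt.Println(retryable(status.Error(codes.Canceled, "gone"))) // true
	fmt.Println(retryable(errors.New("fatal")))                  // false
}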


@@ -23,6 +23,7 @@ import (
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/backoff"
 	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/keepalive"
 	"google.golang.org/grpc/status"
 
 	"github.com/siderolabs/talos/cmd/talosctl/cmd/common"
@@ -238,6 +239,9 @@ func (a *Tracker) Run() error {
 		// disable grpc backoff
 		Backoff:           backoff.Config{},
 		MinConnectTimeout: 20 * time.Second,
-	}))
+	}), grpc.WithKeepaliveParams(keepalive.ClientParameters{
+		Time:    10 * time.Second,
+		Timeout: 5 * time.Second,
+	}))
 	if errors.Is(err, context.Canceled) {
 		err = nil
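
With these parameters the client pings an idle connection every 10 seconds and declares it dead if no acknowledgement arrives within 5, so a hung connection fails fast (and lands in the retry path above) instead of stalling a long reboot or upgrade. In isolation the option looks like this (target address and insecure credentials are placeholders for brevity):

package main

import (
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
)

func main() {
	conn, err := grpc.NewClient("node.example:50000", // hypothetical target
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time:    10 * time.Second, // ping after 10s of inactivity
			Timeout: 5 * time.Second,  // fail if no ack within 5s
		}))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close() //nolint:errcheck
}

One caveat: gRPC servers enforce a minimum keepalive interval and may close connections from clients that ping too aggressively, so Time should not be set below what the server's enforcement policy permits.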