From 804762c597f3aebdc3521cefc93bbbf0ff4a57eb Mon Sep 17 00:00:00 2001 From: Utku Ozdemir Date: Wed, 9 Nov 2022 13:59:45 +0100 Subject: [PATCH] feat: add timeout to cli action tracking, track by default & refactor Add a timeout of 15 minutes to the trackable CLI actions reboot, reset, shutdown and upgrade and refactor the action tracking. Make waiting for these operations the default behavior (set `--wait` to `true` by default). Signed-off-by: Utku Ozdemir --- cmd/talosctl/cmd/talos/reboot.go | 17 ++-- cmd/talosctl/cmd/talos/reset.go | 15 ++-- cmd/talosctl/cmd/talos/shutdown.go | 14 +-- cmd/talosctl/cmd/talos/track.go | 23 +++++ cmd/talosctl/cmd/talos/upgrade.go | 15 ++-- cmd/talosctl/pkg/talos/action/node.go | 16 ++-- cmd/talosctl/pkg/talos/action/tracker.go | 109 +++++++++++++++-------- hack/structprotogen/main.go | 3 +- hack/structprotogen/proto/proto.go | 3 +- hack/structprotogen/types/types.go | 5 +- website/content/v1.3/reference/cli.md | 38 ++++---- 11 files changed, 166 insertions(+), 92 deletions(-) create mode 100644 cmd/talosctl/cmd/talos/track.go diff --git a/cmd/talosctl/cmd/talos/reboot.go b/cmd/talosctl/cmd/talos/reboot.go index a1b78e1ef..4ed0fd0f9 100644 --- a/cmd/talosctl/cmd/talos/reboot.go +++ b/cmd/talosctl/cmd/talos/reboot.go @@ -16,9 +16,8 @@ import ( ) var rebootCmdFlags struct { - mode string - wait bool - debug bool + trackableActionCmdFlags + mode string } // rebootCmd represents the reboot command. @@ -65,7 +64,14 @@ var rebootCmd = &cobra.Command{ return err } - return action.NewTracker(&GlobalArgs, action.MachineReadyEventFn, rebootGetActorID, postCheckFn, rebootCmdFlags.debug).Run() + return action.NewTracker( + &GlobalArgs, + action.MachineReadyEventFn, + rebootGetActorID, + action.WithPostCheck(postCheckFn), + action.WithDebug(rebootCmdFlags.debug), + action.WithTimeout(rebootCmdFlags.timeout), + ).Run() }, } @@ -84,7 +90,6 @@ func rebootGetActorID(ctx context.Context, c *client.Client) (string, error) { func init() { rebootCmd.Flags().StringVarP(&rebootCmdFlags.mode, "mode", "m", "default", "select the reboot mode: \"default\", \"powercycle\" (skips kexec)") - rebootCmd.Flags().BoolVar(&rebootCmdFlags.wait, "wait", false, "wait for the operation to complete, tracking its progress. always set to true when --debug is set") - rebootCmd.Flags().BoolVar(&rebootCmdFlags.debug, "debug", false, "debug operation from kernel logs. --no-wait is set to false when this flag is set") + rebootCmdFlags.addTrackActionFlags(rebootCmd) addCommand(rebootCmd) } diff --git a/cmd/talosctl/cmd/talos/reset.go b/cmd/talosctl/cmd/talos/reset.go index 419f9be7d..fe6de54c4 100644 --- a/cmd/talosctl/cmd/talos/reset.go +++ b/cmd/talosctl/cmd/talos/reset.go @@ -17,11 +17,10 @@ import ( ) var resetCmdFlags struct { + trackableActionCmdFlags graceful bool reboot bool systemLabelsToWipe []string - wait bool - debug bool } // resetCmd represents the reset command. @@ -70,7 +69,14 @@ var resetCmd = &cobra.Command{ cmd.SilenceErrors = true - return action.NewTracker(&GlobalArgs, action.StopAllServicesEventFn, actionFn, postCheckFn, resetCmdFlags.debug).Run() + return action.NewTracker( + &GlobalArgs, + action.StopAllServicesEventFn, + actionFn, + action.WithPostCheck(postCheckFn), + action.WithDebug(resetCmdFlags.debug), + action.WithTimeout(resetCmdFlags.timeout), + ).Run() }, } @@ -108,7 +114,6 @@ func init() { resetCmd.Flags().BoolVar(&resetCmdFlags.graceful, "graceful", true, "if true, attempt to cordon/drain node and leave etcd (if applicable)") resetCmd.Flags().BoolVar(&resetCmdFlags.reboot, "reboot", false, "if true, reboot the node after resetting instead of shutting down") resetCmd.Flags().StringSliceVar(&resetCmdFlags.systemLabelsToWipe, "system-labels-to-wipe", nil, "if set, just wipe selected system disk partitions by label but keep other partitions intact") - resetCmd.Flags().BoolVar(&resetCmdFlags.wait, "wait", false, "wait for the operation to complete, tracking its progress. always set to true when --debug is set") - resetCmd.Flags().BoolVar(&resetCmdFlags.debug, "debug", false, "debug operation from kernel logs. --no-wait is set to false when this flag is set") + resetCmdFlags.addTrackActionFlags(resetCmd) addCommand(resetCmd) } diff --git a/cmd/talosctl/cmd/talos/shutdown.go b/cmd/talosctl/cmd/talos/shutdown.go index ba36ad098..80576d485 100644 --- a/cmd/talosctl/cmd/talos/shutdown.go +++ b/cmd/talosctl/cmd/talos/shutdown.go @@ -16,9 +16,8 @@ import ( ) var shutdownCmdFlags struct { + trackableActionCmdFlags force bool - wait bool - debug bool } // shutdownCmd represents the shutdown command. @@ -52,7 +51,13 @@ var shutdownCmd = &cobra.Command{ cmd.SilenceErrors = true - return action.NewTracker(&GlobalArgs, action.StopAllServicesEventFn, shutdownGetActorID, nil, shutdownCmdFlags.debug).Run() + return action.NewTracker( + &GlobalArgs, + action.StopAllServicesEventFn, + shutdownGetActorID, + action.WithDebug(shutdownCmdFlags.debug), + action.WithTimeout(shutdownCmdFlags.timeout), + ).Run() }, } @@ -71,7 +76,6 @@ func shutdownGetActorID(ctx context.Context, c *client.Client) (string, error) { func init() { shutdownCmd.Flags().BoolVar(&shutdownCmdFlags.force, "force", false, "if true, force a node to shutdown without a cordon/drain") - shutdownCmd.Flags().BoolVar(&shutdownCmdFlags.wait, "wait", false, "wait for the operation to complete, tracking its progress. always set to true when --debug is set") - shutdownCmd.Flags().BoolVar(&shutdownCmdFlags.debug, "debug", false, "debug operation from kernel logs. --no-wait is set to false when this flag is set") + shutdownCmdFlags.addTrackActionFlags(shutdownCmd) addCommand(shutdownCmd) } diff --git a/cmd/talosctl/cmd/talos/track.go b/cmd/talosctl/cmd/talos/track.go new file mode 100644 index 000000000..8e92c8776 --- /dev/null +++ b/cmd/talosctl/cmd/talos/track.go @@ -0,0 +1,23 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package talos + +import ( + "time" + + "github.com/spf13/cobra" +) + +type trackableActionCmdFlags struct { + wait bool + debug bool + timeout time.Duration +} + +func (f *trackableActionCmdFlags) addTrackActionFlags(cmd *cobra.Command) { + cmd.Flags().BoolVar(&f.wait, "wait", true, "wait for the operation to complete, tracking its progress. always set to true when --debug is set") + cmd.Flags().BoolVar(&f.debug, "debug", false, "debug operation from kernel logs. --no-wait is set to false when this flag is set") + cmd.Flags().DurationVar(&f.timeout, "timeout", 30*time.Minute, "time to wait for the operation is complete if --debug or --wait is set") +} diff --git a/cmd/talosctl/cmd/talos/upgrade.go b/cmd/talosctl/cmd/talos/upgrade.go index ef1505a13..58c021b53 100644 --- a/cmd/talosctl/cmd/talos/upgrade.go +++ b/cmd/talosctl/cmd/talos/upgrade.go @@ -22,12 +22,11 @@ import ( ) var upgradeCmdFlags struct { + trackableActionCmdFlags upgradeImage string preserve bool stage bool force bool - wait bool - debug bool insecure bool } @@ -58,7 +57,14 @@ var upgradeCmd = &cobra.Command{ return err } - return action.NewTracker(&GlobalArgs, action.MachineReadyEventFn, upgradeGetActorID, postCheckFn, upgradeCmdFlags.debug).Run() + return action.NewTracker( + &GlobalArgs, + action.MachineReadyEventFn, + upgradeGetActorID, + action.WithPostCheck(postCheckFn), + action.WithDebug(upgradeCmdFlags.debug), + action.WithTimeout(upgradeCmdFlags.timeout), + ).Run() }, } @@ -136,8 +142,7 @@ func init() { upgradeCmd.Flags().BoolVarP(&upgradeCmdFlags.preserve, "preserve", "p", false, "preserve data") upgradeCmd.Flags().BoolVarP(&upgradeCmdFlags.stage, "stage", "s", false, "stage the upgrade to perform it after a reboot") upgradeCmd.Flags().BoolVarP(&upgradeCmdFlags.force, "force", "f", false, "force the upgrade (skip checks on etcd health and members, might lead to data loss)") - upgradeCmd.Flags().BoolVar(&upgradeCmdFlags.wait, "wait", false, "wait for the operation to complete, tracking its progress. always set to true when --debug is set") - upgradeCmd.Flags().BoolVar(&upgradeCmdFlags.debug, "debug", false, "debug operation from kernel logs. --no-wait is set to false when this flag is set") upgradeCmd.Flags().BoolVar(&upgradeCmdFlags.insecure, "insecure", false, "upgrade using the insecure (encrypted with no auth) maintenance service") + upgradeCmdFlags.addTrackActionFlags(upgradeCmd) addCommand(upgradeCmd) } diff --git a/cmd/talosctl/pkg/talos/action/node.go b/cmd/talosctl/pkg/talos/action/node.go index 27fff9ed7..57e7dc290 100644 --- a/cmd/talosctl/pkg/talos/action/node.go +++ b/cmd/talosctl/pkg/talos/action/node.go @@ -34,9 +34,9 @@ type nodeTracker struct { // tailDebugLogs starts tailing the dmesg of the node. func (a *nodeTracker) tailDebugLogs() error { - return retry.Constant(a.tracker.retryDuration).Retry(func() error { + return retry.Constant(a.tracker.timeout).RetryWithContext(a.ctx, func(ctx context.Context) error { err := func() error { - stream, err := a.cli.Dmesg(a.ctx, true, true) + stream, err := a.cli.Dmesg(ctx, true, true) if err != nil { return err } @@ -125,11 +125,11 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error { waitForActorID = true ) - return retry.Constant(a.tracker.retryDuration).Retry(func() error { + return retry.Constant(a.tracker.timeout).RetryWithContext(a.ctx, func(ctx context.Context) error { // retryable function err := func() error { eventCh := make(chan client.EventResult) - err := a.cli.EventsWatchV2(a.ctx, eventCh, client.WithTailEvents(tailEvents)) + err := a.cli.EventsWatchV2(ctx, eventCh, client.WithTailEvents(tailEvents)) if err != nil { return err } @@ -142,8 +142,8 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error { select { case actorID = <-actorIDCh: - case <-a.ctx.Done(): - return a.ctx.Err() + case <-ctx.Done(): + return ctx.Err() } a.update(reporter.Update{ @@ -188,10 +188,10 @@ func (a *nodeTracker) trackEventsWithRetry(actorIDCh chan string) error { } func (a *nodeTracker) runPostCheckWithRetry() error { - return retry.Constant(a.tracker.retryDuration).Retry(func() error { + return retry.Constant(a.tracker.timeout).RetryWithContext(a.ctx, func(ctx context.Context) error { // retryable function err := func() error { - err := a.tracker.postCheckFn(a.ctx, a.cli) + err := a.tracker.postCheckFn(ctx, a.cli) if err != nil { return err } diff --git a/cmd/talosctl/pkg/talos/action/tracker.go b/cmd/talosctl/pkg/talos/action/tracker.go index aefde9da0..837f90b79 100644 --- a/cmd/talosctl/pkg/talos/action/tracker.go +++ b/cmd/talosctl/pkg/talos/action/tracker.go @@ -11,10 +11,11 @@ import ( "os" "sort" "strings" - "sync" "time" + "github.com/hashicorp/go-multierror" "github.com/mattn/go-isatty" + "github.com/siderolabs/gen/containers" "github.com/siderolabs/gen/maps" "github.com/siderolabs/go-circular" "golang.org/x/sync/errgroup" @@ -64,32 +65,58 @@ type Tracker struct { reporter *reporter.Reporter nodeToLatestStatusUpdate map[string]reporter.Update reportCh chan nodeUpdate - retryDuration time.Duration + timeout time.Duration isTerminal bool debug bool cliContext *global.Args } +// TrackerOption is the functional option for the Tracker. +type TrackerOption func(*Tracker) + +// WithTimeout sets the timeout for the tracker. +func WithTimeout(timeout time.Duration) TrackerOption { + return func(t *Tracker) { + t.timeout = timeout + } +} + +// WithPostCheck sets the post check function. +func WithPostCheck(postCheckFn func(ctx context.Context, c *client.Client) error) TrackerOption { + return func(t *Tracker) { + t.postCheckFn = postCheckFn + } +} + +// WithDebug enables debug mode. +func WithDebug(debug bool) TrackerOption { + return func(t *Tracker) { + t.debug = debug + } +} + // NewTracker creates a new Tracker. func NewTracker( cliContext *global.Args, expectedEventFn func(event client.EventResult) bool, actionFn func(ctx context.Context, c *client.Client) (string, error), - postCheckFn func(ctx context.Context, c *client.Client) error, - debug bool, + opts ...TrackerOption, ) *Tracker { - return &Tracker{ + tracker := Tracker{ expectedEventFn: expectedEventFn, actionFn: actionFn, - postCheckFn: postCheckFn, nodeToLatestStatusUpdate: make(map[string]reporter.Update, len(cliContext.Nodes)), reporter: reporter.New(), reportCh: make(chan nodeUpdate), - retryDuration: 15 * time.Minute, isTerminal: isatty.IsTerminal(os.Stderr.Fd()), - debug: debug, cliContext: cliContext, } + + for _, option := range opts { + option(&tracker) + } + + return &tracker } // Run executes the action on nodes and tracks its progress by watching events with retries. @@ -97,39 +124,14 @@ func NewTracker( // //nolint:gocyclo func (a *Tracker) Run() error { - var failedNodesToDmesgs sync.Map + var failedNodesToDmesgs containers.ConcurrentMap[string, io.Reader] var eg errgroup.Group - defer func() { - eg.Wait() //nolint:errcheck + err := a.cliContext.WithClient(func(ctx context.Context, c *client.Client) error { + ctx, cancel := context.WithTimeout(ctx, a.timeout) + defer cancel() - var failedNodes []string - - failedNodesToDmesgs.Range(func(key, value any) bool { - failedNodes = append(failedNodes, key.(string)) - - return true - }) - - if a.debug && len(failedNodes) > 0 { - sort.Strings(failedNodes) - - fmt.Printf("console logs for nodes %v:\n", failedNodes) - - for _, node := range failedNodes { - dmesgReaderRaw, _ := failedNodesToDmesgs.Load(node) - dmesgReader := dmesgReaderRaw.(io.Reader) //nolint:errcheck - - _, err := io.Copy(os.Stdout, dmesgReader) - if err != nil { - fmt.Printf("%v: failed to print debug logs: %v\n", node, err) - } - } - } - }() - - return a.cliContext.WithClient(func(ctx context.Context, c *client.Client) error { if err := helpers.ClientVersionCheck(ctx, c); err != nil { return err } @@ -170,7 +172,7 @@ func (a *Tracker) Run() error { trackEg.Go(func() error { if trackErr := tracker.run(); trackErr != nil { if a.debug { - failedNodesToDmesgs.Store(node, dmesg.GetReader()) + failedNodesToDmesgs.Set(node, dmesg.GetReader()) } tracker.update(reporter.Update{ @@ -189,6 +191,35 @@ func (a *Tracker) Run() error { Backoff: backoff.Config{}, MinConnectTimeout: 20 * time.Second, })) + + err = multierror.Append(err, eg.Wait()) + + if !a.debug { + return err + } + + var failedNodes []string + + failedNodesToDmesgs.ForEach(func(key string, _ io.Reader) { + failedNodes = append(failedNodes, key) + }) + + if len(failedNodes) > 0 { + sort.Strings(failedNodes) + + fmt.Printf("console logs for nodes %q:\n", failedNodes) + + for _, node := range failedNodes { + dmesgReader, _ := failedNodesToDmesgs.Get(node) + + _, copyErr := io.Copy(os.Stdout, dmesgReader) + if copyErr != nil { + fmt.Printf("%q: failed to print debug logs: %v\n", node, copyErr) + } + } + } + + return err } // runReporter starts the (colored) stderr reporter. @@ -217,7 +248,7 @@ func (a *Tracker) runReporter(ctx context.Context) error { case update = <-a.reportCh: if !a.isTerminal { - fmt.Printf("%v: %v\n", update.node, update.update.Message) + fmt.Printf("%q: %v\n", update.node, update.update.Message) continue } diff --git a/hack/structprotogen/main.go b/hack/structprotogen/main.go index 166e7db9e..5c40554d4 100644 --- a/hack/structprotogen/main.go +++ b/hack/structprotogen/main.go @@ -13,12 +13,11 @@ import ( "path" "path/filepath" - "github.com/spf13/cobra" - "github.com/siderolabs/structprotogen/ast" "github.com/siderolabs/structprotogen/loader" "github.com/siderolabs/structprotogen/proto" "github.com/siderolabs/structprotogen/types" + "github.com/spf13/cobra" ) // rootCmd represents the base command when called without any subcommands. diff --git a/hack/structprotogen/proto/proto.go b/hack/structprotogen/proto/proto.go index 759f5875b..bd7739a37 100644 --- a/hack/structprotogen/proto/proto.go +++ b/hack/structprotogen/proto/proto.go @@ -12,10 +12,9 @@ import ( "regexp" "strings" - "gopkg.in/typ.v4/slices" - "github.com/siderolabs/structprotogen/sliceutil" "github.com/siderolabs/structprotogen/types" + "gopkg.in/typ.v4/slices" ) // Pkg represents a protobuf package. diff --git a/hack/structprotogen/types/types.go b/hack/structprotogen/types/types.go index 772c50311..9fe086bae 100644 --- a/hack/structprotogen/types/types.go +++ b/hack/structprotogen/types/types.go @@ -12,11 +12,10 @@ import ( "path" "strings" - "golang.org/x/tools/go/packages" - "gopkg.in/typ.v4/slices" - "github.com/siderolabs/structprotogen/ast" "github.com/siderolabs/structprotogen/sliceutil" + "golang.org/x/tools/go/packages" + "gopkg.in/typ.v4/slices" ) // PkgDecl is a struct which contains package path and tagged struct declarations. diff --git a/website/content/v1.3/reference/cli.md b/website/content/v1.3/reference/cli.md index 241deec2a..bd2058311 100644 --- a/website/content/v1.3/reference/cli.md +++ b/website/content/v1.3/reference/cli.md @@ -1901,10 +1901,11 @@ talosctl reboot [flags] ### Options ``` - --debug debug operation from kernel logs. --no-wait is set to false when this flag is set - -h, --help help for reboot - -m, --mode string select the reboot mode: "default", "powercycle" (skips kexec) (default "default") - --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set + --debug debug operation from kernel logs. --no-wait is set to false when this flag is set + -h, --help help for reboot + -m, --mode string select the reboot mode: "default", "powercycle" (skips kexec) (default "default") + --timeout duration time to wait for the operation is complete if --debug or --wait is set (default 30m0s) + --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set (default true) ``` ### Options inherited from parent commands @@ -1937,7 +1938,8 @@ talosctl reset [flags] -h, --help help for reset --reboot if true, reboot the node after resetting instead of shutting down --system-labels-to-wipe strings if set, just wipe selected system disk partitions by label but keep other partitions intact - --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set + --timeout duration time to wait for the operation is complete if --debug or --wait is set (default 30m0s) + --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set (default true) ``` ### Options inherited from parent commands @@ -2056,10 +2058,11 @@ talosctl shutdown [flags] ### Options ``` - --debug debug operation from kernel logs. --no-wait is set to false when this flag is set - --force if true, force a node to shutdown without a cordon/drain - -h, --help help for shutdown - --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set + --debug debug operation from kernel logs. --no-wait is set to false when this flag is set + --force if true, force a node to shutdown without a cordon/drain + -h, --help help for shutdown + --timeout duration time to wait for the operation is complete if --debug or --wait is set (default 30m0s) + --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set (default true) ``` ### Options inherited from parent commands @@ -2198,14 +2201,15 @@ talosctl upgrade [flags] ### Options ``` - --debug debug operation from kernel logs. --no-wait is set to false when this flag is set - -f, --force force the upgrade (skip checks on etcd health and members, might lead to data loss) - -h, --help help for upgrade - -i, --image string the container image to use for performing the install - --insecure upgrade using the insecure (encrypted with no auth) maintenance service - -p, --preserve preserve data - -s, --stage stage the upgrade to perform it after a reboot - --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set + --debug debug operation from kernel logs. --no-wait is set to false when this flag is set + -f, --force force the upgrade (skip checks on etcd health and members, might lead to data loss) + -h, --help help for upgrade + -i, --image string the container image to use for performing the install + --insecure upgrade using the insecure (encrypted with no auth) maintenance service + -p, --preserve preserve data + -s, --stage stage the upgrade to perform it after a reboot + --timeout duration time to wait for the operation is complete if --debug or --wait is set (default 30m0s) + --wait wait for the operation to complete, tracking its progress. always set to true when --debug is set (default true) ``` ### Options inherited from parent commands