From f836f145f31c47da27c4c06bea56888b6d01352c Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 18 Dec 2020 23:15:51 +0300 Subject: [PATCH] fix: synchronize bootkube timeouts and various boot timeouts When the bootkube service fails, it can clean up manifests after itself, but that only happens if we give it a chance to shut down cleanly. If the boot sequence times out, `machined` does an emergency reboot, which doesn't let `bootkube` do the cleanup. So this fix has two parts: * synchronize boot/bootstrap sequence timeouts with the bootkube asset timeout; * clean up bootkube-generated manifests on bootkube service startup. Also log errors in initial phases like `labelNodeAsMaster` to provide some feedback on why boot is stuck. Signed-off-by: Andrey Smirnov --- internal/app/bootkube/main.go | 39 ++++++++++++------- .../v1alpha1/v1alpha1_sequencer_tasks.go | 6 +-- pkg/kubernetes/kubernetes.go | 2 +- pkg/machinery/constants/constants.go | 10 +++++ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/internal/app/bootkube/main.go b/internal/app/bootkube/main.go index 53f9b37e8..2c9b0c93d 100644 --- a/internal/app/bootkube/main.go +++ b/internal/app/bootkube/main.go @@ -11,6 +11,7 @@ import ( "os" "path/filepath" + "github.com/hashicorp/go-multierror" "github.com/kubernetes-sigs/bootkube/pkg/bootkube" "github.com/kubernetes-sigs/bootkube/pkg/util" @@ -59,6 +60,10 @@ func run() error { return err } + // cleanup manifests which might have been left from previous bootkube run + cleanupManifests("bootkube-*") //nolint: errcheck + cleanupManifests("kube-system-pod-checkpointer-*") //nolint: errcheck + defaultRequiredPods := []string{ "kube-system/pod-checkpointer", "kube-system/kube-apiserver", @@ -92,19 +97,8 @@ func run() error { log.Printf("failed to cleanup bootkube assets dir %s", constants.AssetsDirectory) } - bootstrapWildcard := filepath.Join(constants.ManifestsDirectory, "bootstrap-*") - - var bootstrapFiles []string - - bootstrapFiles, err = 
filepath.Glob(bootstrapWildcard) - if err != nil { - log.Printf("error finding bootstrap files in manifests dir %s", constants.ManifestsDirectory) - } - - for _, bootstrapFile := range bootstrapFiles { - if err = os.Remove(bootstrapFile); err != nil { - log.Printf("error deleting bootstrap file in manifests dir : %s", err) - } + if err = cleanupManifests("bootstrap-*"); err != nil { + log.Printf("%s", err) } }() @@ -117,6 +111,25 @@ func run() error { return nil } +func cleanupManifests(wildcard string) error { + bootstrapWildcard := filepath.Join(constants.ManifestsDirectory, wildcard) + + bootstrapFiles, err := filepath.Glob(bootstrapWildcard) + if err != nil { + return fmt.Errorf("error finding bootstrap files in manifests dir %s", constants.ManifestsDirectory) + } + + var multiErr *multierror.Error + + for _, bootstrapFile := range bootstrapFiles { + if err = os.Remove(bootstrapFile); err != nil { + multiErr = multierror.Append(multiErr, fmt.Errorf("error deleting bootstrap file in manifests dir: %s", err)) + } + } + + return multiErr.ErrorOrNil() +} + func main() { if err := run(); err != nil { log.Fatalf("bootkube failed: %s", err) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index 8c14337c4..1f9ed80a5 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -732,7 +732,7 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu all = append(all, cond) } - ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout) defer cancel() @@ -1456,7 +1456,7 @@ func LabelNodeAsMaster(seq runtime.Sequence, data interface{}) (runtime.TaskExec return err } - err = retry.Constant(10*time.Minute, retry.WithUnits(3*time.Second)).Retry(func() error { + err = 
retry.Constant(constants.NodeReadyTimeout, retry.WithUnits(3*time.Second), retry.WithErrorLogging(true)).Retry(func() error { if err = h.LabelNodeAsMaster(hostname, !r.Config().Cluster().ScheduleOnMasters()); err != nil { return retry.ExpectedError(err) } @@ -1778,7 +1778,7 @@ func BootstrapKubernetes(seq runtime.Sequence, data interface{}) (runtime.TaskEx system.Services(r).LoadAndStart(svc) - ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout) defer cancel() return system.WaitForService(system.StateEventFinished, svc.ID(r)).Wait(ctx) diff --git a/pkg/kubernetes/kubernetes.go b/pkg/kubernetes/kubernetes.go index da28c4350..ebea56e25 100644 --- a/pkg/kubernetes/kubernetes.go +++ b/pkg/kubernetes/kubernetes.go @@ -251,7 +251,7 @@ func (h *Client) LabelNodeAsMaster(name string, taintNoSchedule bool) (err error // WaitUntilReady waits for a node to be ready. func (h *Client) WaitUntilReady(name string) error { - return retry.Exponential(3*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond)).Retry(func() error { + return retry.Exponential(10*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond), retry.WithErrorLogging(true)).Retry(func() error { attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 30*time.Second) defer attemptCtxCancel() diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index b3d36a5e9..eb805c6f6 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -377,6 +377,16 @@ const ( // InitializedKey is the key used to indicate if the cluster has been // initialized. InitializedKey = "initialized" + + // BootkubeAssetTimeout is the constant in bootkube implementation. + BootkubeAssetTimeout = 20 * time.Minute + + // BootkubeRunTimeout is the timeout to run bootkube. 
+ BootkubeRunTimeout = BootkubeAssetTimeout + 5*time.Minute + + // NodeReadyTimeout is the timeout to wait for the node to be ready (CNI to be running). + // For bootstrap API, this includes time to run bootkube. + NodeReadyTimeout = BootkubeRunTimeout ) // See https://linux.die.net/man/3/klogctl