fix: synchronize bootkube timeouts and various boot timeouts

When the bootkube service fails, it can clean up manifests after itself, but
this only happens if we give it a chance to shut down cleanly. If the boot
sequence times out, `machined` does an emergency reboot, which doesn't let
`bootkube` do the cleanup.

So this fix has two paths:

* synchronize boot/bootstrap sequence timeouts with bootkube asset
timeout;

* clean up bootkube-generated manifests on bootkube service startup.

Also log errors in initial phases like `labelNodeAsMaster` to provide
some feedback on why boot is stuck.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
Andrey Smirnov 2020-12-18 23:15:51 +03:00 committed by talos-bot
parent c4624078ce
commit f836f145f3
4 changed files with 40 additions and 17 deletions

View File

@ -11,6 +11,7 @@ import (
"os"
"path/filepath"
"github.com/hashicorp/go-multierror"
"github.com/kubernetes-sigs/bootkube/pkg/bootkube"
"github.com/kubernetes-sigs/bootkube/pkg/util"
@ -59,6 +60,10 @@ func run() error {
return err
}
// cleanup manifests which might have been left from previous bootkube run
cleanupManifests("bootkube-*") //nolint: errcheck
cleanupManifests("kube-system-pod-checkpointer-*") //nolint: errcheck
defaultRequiredPods := []string{
"kube-system/pod-checkpointer",
"kube-system/kube-apiserver",
@ -92,19 +97,8 @@ func run() error {
log.Printf("failed to cleanup bootkube assets dir %s", constants.AssetsDirectory)
}
bootstrapWildcard := filepath.Join(constants.ManifestsDirectory, "bootstrap-*")
var bootstrapFiles []string
bootstrapFiles, err = filepath.Glob(bootstrapWildcard)
if err != nil {
log.Printf("error finding bootstrap files in manifests dir %s", constants.ManifestsDirectory)
}
for _, bootstrapFile := range bootstrapFiles {
if err = os.Remove(bootstrapFile); err != nil {
log.Printf("error deleting bootstrap file in manifests dir : %s", err)
}
if err = cleanupManifests("bootstrap-*"); err != nil {
log.Printf("%s", err)
}
}()
@ -117,6 +111,25 @@ func run() error {
return nil
}
// cleanupManifests removes every file in constants.ManifestsDirectory whose
// name matches the given glob pattern (for example "bootstrap-*").
//
// Removal is best-effort: a failure to delete one file does not stop the
// remaining matches from being processed. Every removal failure is collected
// and the aggregate is returned; the result is nil when nothing failed,
// including when the pattern matched no files.
func cleanupManifests(wildcard string) error {
	pattern := filepath.Join(constants.ManifestsDirectory, wildcard)

	matches, err := filepath.Glob(pattern)
	if err != nil {
		// Keep the underlying cause in the chain instead of dropping it, and
		// report the actual pattern since callers pass more than "bootstrap-*".
		return fmt.Errorf("error finding files matching %q in manifests dir: %w", pattern, err)
	}

	var multiErr *multierror.Error

	for _, match := range matches {
		if err = os.Remove(match); err != nil {
			// %w (not %s) so callers can inspect the cause with errors.Is/As.
			multiErr = multierror.Append(multiErr, fmt.Errorf("error deleting file in manifests dir: %w", err))
		}
	}

	return multiErr.ErrorOrNil()
}
func main() {
if err := run(); err != nil {
log.Fatalf("bootkube failed: %s", err)

View File

@ -732,7 +732,7 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu
all = append(all, cond)
}
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout)
defer cancel()
@ -1456,7 +1456,7 @@ func LabelNodeAsMaster(seq runtime.Sequence, data interface{}) (runtime.TaskExec
return err
}
err = retry.Constant(10*time.Minute, retry.WithUnits(3*time.Second)).Retry(func() error {
err = retry.Constant(constants.NodeReadyTimeout, retry.WithUnits(3*time.Second), retry.WithErrorLogging(true)).Retry(func() error {
if err = h.LabelNodeAsMaster(hostname, !r.Config().Cluster().ScheduleOnMasters()); err != nil {
return retry.ExpectedError(err)
}
@ -1778,7 +1778,7 @@ func BootstrapKubernetes(seq runtime.Sequence, data interface{}) (runtime.TaskEx
system.Services(r).LoadAndStart(svc)
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout)
defer cancel()
return system.WaitForService(system.StateEventFinished, svc.ID(r)).Wait(ctx)

View File

@ -251,7 +251,7 @@ func (h *Client) LabelNodeAsMaster(name string, taintNoSchedule bool) (err error
// WaitUntilReady waits for a node to be ready.
func (h *Client) WaitUntilReady(name string) error {
return retry.Exponential(3*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond)).Retry(func() error {
return retry.Exponential(10*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond), retry.WithErrorLogging(true)).Retry(func() error {
attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 30*time.Second)
defer attemptCtxCancel()

View File

@ -377,6 +377,16 @@ const (
// InitializedKey is the key used to indicate if the cluster has been
// initialized.
InitializedKey = "initialized"
// BootkubeAssetTimeout is the constant in bootkube implementation.
BootkubeAssetTimeout = 20 * time.Minute
// BootkubeRunTimeout is the timeout to run bootkube.
BootkubeRunTimeout = BootkubeAssetTimeout + 5*time.Minute
// NodeReadyTimeout is the timeout to wait for the node to be ready (CNI to be running).
// For bootstrap API, this includes time to run bootkube.
NodeReadyTimeout = BootkubeRunTimeout
)
// See https://linux.die.net/man/3/klogctl