mirror of
https://github.com/siderolabs/talos.git
synced 2025-10-09 06:31:25 +02:00
fix: synchronize bootkube timeouts and various boot timeouts
When the bootkube service fails, it can clean up manifests after itself, but that only happens if it is given a chance to shut down cleanly. If the boot sequence times out, `machined` performs an emergency reboot, which doesn't let `bootkube` do the cleanup. So this fix takes two paths: * synchronize the boot/bootstrap sequence timeouts with the bootkube asset timeout; * clean up bootkube-generated manifests on bootkube service startup. It also logs errors in initial phases like `labelNodeAsMaster` to provide some feedback on why the boot is stuck. Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
parent
c4624078ce
commit
f836f145f3
@ -11,6 +11,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
|
||||||
|
"github.com/hashicorp/go-multierror"
|
||||||
"github.com/kubernetes-sigs/bootkube/pkg/bootkube"
|
"github.com/kubernetes-sigs/bootkube/pkg/bootkube"
|
||||||
"github.com/kubernetes-sigs/bootkube/pkg/util"
|
"github.com/kubernetes-sigs/bootkube/pkg/util"
|
||||||
|
|
||||||
@ -59,6 +60,10 @@ func run() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cleanup manifests which might have been left from previous bootkube run
|
||||||
|
cleanupManifests("bootkube-*") //nolint: errcheck
|
||||||
|
cleanupManifests("kube-system-pod-checkpointer-*") //nolint: errcheck
|
||||||
|
|
||||||
defaultRequiredPods := []string{
|
defaultRequiredPods := []string{
|
||||||
"kube-system/pod-checkpointer",
|
"kube-system/pod-checkpointer",
|
||||||
"kube-system/kube-apiserver",
|
"kube-system/kube-apiserver",
|
||||||
@ -92,19 +97,8 @@ func run() error {
|
|||||||
log.Printf("failed to cleanup bootkube assets dir %s", constants.AssetsDirectory)
|
log.Printf("failed to cleanup bootkube assets dir %s", constants.AssetsDirectory)
|
||||||
}
|
}
|
||||||
|
|
||||||
bootstrapWildcard := filepath.Join(constants.ManifestsDirectory, "bootstrap-*")
|
if err = cleanupManifests("bootstrap-*"); err != nil {
|
||||||
|
log.Printf("%s", err)
|
||||||
var bootstrapFiles []string
|
|
||||||
|
|
||||||
bootstrapFiles, err = filepath.Glob(bootstrapWildcard)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("error finding bootstrap files in manifests dir %s", constants.ManifestsDirectory)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, bootstrapFile := range bootstrapFiles {
|
|
||||||
if err = os.Remove(bootstrapFile); err != nil {
|
|
||||||
log.Printf("error deleting bootstrap file in manifests dir : %s", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@ -117,6 +111,25 @@ func run() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func cleanupManifests(wildcard string) error {
|
||||||
|
bootstrapWildcard := filepath.Join(constants.ManifestsDirectory, wildcard)
|
||||||
|
|
||||||
|
bootstrapFiles, err := filepath.Glob(bootstrapWildcard)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error finding bootstrap files in manifests dir %s", constants.ManifestsDirectory)
|
||||||
|
}
|
||||||
|
|
||||||
|
var multiErr *multierror.Error
|
||||||
|
|
||||||
|
for _, bootstrapFile := range bootstrapFiles {
|
||||||
|
if err = os.Remove(bootstrapFile); err != nil {
|
||||||
|
multiErr = multierror.Append(multiErr, fmt.Errorf("error deleting bootstrap file in manifests dir: %s", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return multiErr.ErrorOrNil()
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
if err := run(); err != nil {
|
if err := run(); err != nil {
|
||||||
log.Fatalf("bootkube failed: %s", err)
|
log.Fatalf("bootkube failed: %s", err)
|
||||||
|
@ -732,7 +732,7 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu
|
|||||||
all = append(all, cond)
|
all = append(all, cond)
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
|
ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout)
|
||||||
|
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
@ -1456,7 +1456,7 @@ func LabelNodeAsMaster(seq runtime.Sequence, data interface{}) (runtime.TaskExec
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = retry.Constant(10*time.Minute, retry.WithUnits(3*time.Second)).Retry(func() error {
|
err = retry.Constant(constants.NodeReadyTimeout, retry.WithUnits(3*time.Second), retry.WithErrorLogging(true)).Retry(func() error {
|
||||||
if err = h.LabelNodeAsMaster(hostname, !r.Config().Cluster().ScheduleOnMasters()); err != nil {
|
if err = h.LabelNodeAsMaster(hostname, !r.Config().Cluster().ScheduleOnMasters()); err != nil {
|
||||||
return retry.ExpectedError(err)
|
return retry.ExpectedError(err)
|
||||||
}
|
}
|
||||||
@ -1778,7 +1778,7 @@ func BootstrapKubernetes(seq runtime.Sequence, data interface{}) (runtime.TaskEx
|
|||||||
|
|
||||||
system.Services(r).LoadAndStart(svc)
|
system.Services(r).LoadAndStart(svc)
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
|
ctx, cancel := context.WithTimeout(ctx, constants.BootkubeRunTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
return system.WaitForService(system.StateEventFinished, svc.ID(r)).Wait(ctx)
|
return system.WaitForService(system.StateEventFinished, svc.ID(r)).Wait(ctx)
|
||||||
|
@ -251,7 +251,7 @@ func (h *Client) LabelNodeAsMaster(name string, taintNoSchedule bool) (err error
|
|||||||
|
|
||||||
// WaitUntilReady waits for a node to be ready.
|
// WaitUntilReady waits for a node to be ready.
|
||||||
func (h *Client) WaitUntilReady(name string) error {
|
func (h *Client) WaitUntilReady(name string) error {
|
||||||
return retry.Exponential(3*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond)).Retry(func() error {
|
return retry.Exponential(10*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond), retry.WithErrorLogging(true)).Retry(func() error {
|
||||||
attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 30*time.Second)
|
attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 30*time.Second)
|
||||||
defer attemptCtxCancel()
|
defer attemptCtxCancel()
|
||||||
|
|
||||||
|
@ -377,6 +377,16 @@ const (
|
|||||||
// InitializedKey is the key used to indicate if the cluster has been
|
// InitializedKey is the key used to indicate if the cluster has been
|
||||||
// initialized.
|
// initialized.
|
||||||
InitializedKey = "initialized"
|
InitializedKey = "initialized"
|
||||||
|
|
||||||
|
// BootkubeAssetTimeout is the constant in bootkube implementation.
|
||||||
|
BootkubeAssetTimeout = 20 * time.Minute
|
||||||
|
|
||||||
|
// BootkubeRunTimeout is the timeout to run bootkube.
|
||||||
|
BootkubeRunTimeout = BootkubeAssetTimeout + 5*time.Minute
|
||||||
|
|
||||||
|
// NodeReadyTimeout is the timeout to wait for the node to be ready (CNI to be running).
|
||||||
|
// For bootstrap API, this includes time to run bootkube.
|
||||||
|
NodeReadyTimeout = BootkubeRunTimeout
|
||||||
)
|
)
|
||||||
|
|
||||||
// See https://linux.die.net/man/3/klogctl
|
// See https://linux.die.net/man/3/klogctl
|
||||||
|
Loading…
x
Reference in New Issue
Block a user