The default manifests created by bootkube so far enabled the pod-checkpointer only for kube-apiserver. This seems to cause issues in the single-node control plane scenario: without the scheduler and the controller-manager, the node might fall into the `NodeAffinity` state. See https://github.com/talos-systems/bootkube-plugin/pull/23

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
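For context on the checkpointing behavior described above, here is a minimal, hypothetical sketch (not part of the Talos file that follows) of how one could list the kube-scheduler and kube-controller-manager pods and check whether they are opted into bootkube's pod-checkpointer. The annotation key checkpointer.alpha.coreos.com/checkpoint and the kubeconfig handling are assumptions made for illustration, not taken from this repository.

// Hypothetical sketch: report whether the scheduler and controller-manager pods
// carry the pod-checkpointer parent annotation (assumed key:
// "checkpointer.alpha.coreos.com/checkpoint").
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// assumption: kubeconfig is at the default location (~/.kube/config)
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// the same label selector the integration test below uses
	pods, err := clientset.CoreV1().Pods("kube-system").List(context.Background(), metav1.ListOptions{
		LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
	})
	if err != nil {
		panic(err)
	}

	for _, pod := range pods.Items {
		checkpointed := pod.Annotations["checkpointer.alpha.coreos.com/checkpoint"] == "true"
		fmt.Printf("%s: checkpointed=%v\n", pod.Name, checkpointed)
	}
}

With checkpoints enabled for kube-scheduler and kube-controller-manager, the kubelet can restore those components from on-disk checkpoints even when the scheduler itself is down, which is also why TestRecoverControlPlane in the file below is now skipped.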
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package api

import (
	"context"
	"sort"
	"testing"
	"time"

	"golang.org/x/sync/errgroup"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/talos-systems/talos/internal/integration/base"
	machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
	"github.com/talos-systems/talos/pkg/machinery/client"
	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)

// RecoverSuite verifies recovery of the control plane components via the recover API.
type RecoverSuite struct {
	base.K8sSuite

	ctx       context.Context
	ctxCancel context.CancelFunc
}

// SuiteName returns the name of the suite.
func (suite *RecoverSuite) SuiteName() string {
	return "api.RecoverSuite"
}

// SetupTest skips the suite in short mode and sets up a bounded test context.
func (suite *RecoverSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// make sure we abort at some point in time, but give enough room for Recovers
	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Minute)
}

// TearDownTest cancels the test context.
func (suite *RecoverSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}

// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	suite.T().Skip("with checkpoints enabled for kube-scheduler and kube-controller-manager this test no longer makes sense")

	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	for _, source := range []machineapi.RecoverRequest_Source{
		machineapi.RecoverRequest_APISERVER,
		machineapi.RecoverRequest_ETCD,
	} {
		source := source

		suite.Run(source.String(), func() {
			pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
				LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
			})

			suite.Assert().NoError(err)

			deletedPods := make(map[string]struct{})

			var eg errgroup.Group

			for _, pod := range pods.Items {
				pod := pod

				// record the pod name before launching the goroutine to avoid
				// concurrent writes to the deletedPods map
				deletedPods[pod.GetName()] = struct{}{}

				eg.Go(func() error {
					suite.T().Logf("Deleting %s", pod.GetName())

					return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
				})
			}

			suite.Assert().NoError(eg.Wait())

suite.T().Logf("Waiting for the pods to be deleted")
|
|
|
|
for len(pods.Items) > 0 {
|
|
pods, err = suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
|
|
LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
|
|
})
|
|
|
|
suite.Assert().NoError(err)
|
|
|
|
for _, pod := range pods.Items {
|
|
if _, ok := deletedPods[pod.GetName()]; !ok {
|
|
suite.T().Logf("Deleting %s", pod.GetName())
|
|
|
|
deletedPods[pod.GetName()] = struct{}{}
|
|
|
|
suite.Require().NoError(suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{}))
|
|
}
|
|
}
|
|
}
|
|
|
|
			nodes := suite.DiscoverNodes().NodesByType(machine.TypeControlPlane)
			suite.Require().NotEmpty(nodes)

			sort.Strings(nodes)

			node := nodes[0]

			suite.T().Log("Recovering control plane")

			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
			defer ctxCancel()

			nodeCtx := client.WithNodes(ctx, node)

			in := &machineapi.RecoverRequest{
				Source: source,
			}

			_, err = suite.Client.MachineClient.Recover(nodeCtx, in)

			suite.Assert().NoError(err)

			// NB: using `ctx` here to have client talking to init node by default
			suite.AssertClusterHealthy(ctx)
		})
	}
}

func init() {
	allSuites = append(allSuites, new(RecoverSuite))
}