Mirror of https://github.com/siderolabs/talos.git, synced 2025-08-08 23:57:06 +02:00
The Bootkube recover process (and `talosctl recover`) was regenerating assets on every `recover` run, forcing the control plane back to the state it had when the cluster was first created. This PR fixes that by running the recover process correctly. Recovery via etcd was also fixed to handle encrypted etcd data: it follows the way the `apiserver` handles encryption at rest, and since AES CBC is currently the only supported encryption method, the code simply follows the same path.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
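As background for the encryption-at-rest note above, here is a minimal, illustrative sketch (not the code in this PR) of reversing the kube-apiserver `aescbc` transform that an etcd-based recovery has to deal with: a stored value carries a `k8s:enc:aescbc:v1:<keyname>:` prefix followed by an IV and PKCS#7-padded ciphertext. The helper name, key, and key name below are hypothetical.

```go
package main

import (
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"errors"
	"fmt"
)

// decryptAESCBCValue strips the aescbc provider prefix and decrypts the payload.
// Layout assumed here: "k8s:enc:aescbc:v1:<keyname>:" + IV (one AES block) + ciphertext.
func decryptAESCBCValue(stored, key []byte, keyName string) ([]byte, error) {
	prefix := []byte("k8s:enc:aescbc:v1:" + keyName + ":")
	if !bytes.HasPrefix(stored, prefix) {
		return nil, errors.New("value is not encrypted with the expected aescbc key")
	}

	data := stored[len(prefix):]
	if len(data) < 2*aes.BlockSize || len(data)%aes.BlockSize != 0 {
		return nil, errors.New("unexpected ciphertext length")
	}

	block, err := aes.NewCipher(key) // key must be 16, 24, or 32 bytes
	if err != nil {
		return nil, err
	}

	iv, ciphertext := data[:aes.BlockSize], data[aes.BlockSize:]
	plaintext := make([]byte, len(ciphertext))
	cipher.NewCBCDecrypter(block, iv).CryptBlocks(plaintext, ciphertext)

	// Strip PKCS#7 padding.
	pad := int(plaintext[len(plaintext)-1])
	if pad == 0 || pad > aes.BlockSize {
		return nil, errors.New("invalid PKCS#7 padding")
	}

	return plaintext[:len(plaintext)-pad], nil
}

func main() {
	// Hypothetical 32-byte AES key and key name; real values come from the
	// apiserver's encryption configuration, not from literals.
	key := bytes.Repeat([]byte("0123456789abcdef"), 2)

	plaintext, err := decryptAESCBCValue([]byte("k8s:enc:aescbc:v1:key1:..."), key, "key1")
	fmt.Println(plaintext, err)
}
```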
136 lines
3.4 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package api

import (
	"context"
	"sort"
	"testing"
	"time"

	"golang.org/x/sync/errgroup"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/talos-systems/talos/internal/integration/base"
	machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
	"github.com/talos-systems/talos/pkg/machinery/client"
	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)

// RecoverSuite verifies recovery of the control plane via the Recover API.
type RecoverSuite struct {
	base.K8sSuite

	ctx       context.Context
	ctxCancel context.CancelFunc
}

// SuiteName ...
func (suite *RecoverSuite) SuiteName() string {
	return "api.RecoverSuite"
}

// SetupTest ...
func (suite *RecoverSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// make sure we abort at some point in time, but give enough room for Recovers
	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Minute)
}

// TearDownTest ...
func (suite *RecoverSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}

// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	for _, source := range []machineapi.RecoverRequest_Source{
		machineapi.RecoverRequest_APISERVER,
		machineapi.RecoverRequest_ETCD,
	} {
		source := source

		suite.Run(source.String(), func() {
			// list the kube-scheduler and kube-controller-manager pods to be deleted
			pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
				LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
			})

			suite.Assert().NoError(err)

			var eg errgroup.Group

			for _, pod := range pods.Items {
				pod := pod

				eg.Go(func() error {
					suite.T().Logf("Deleting %s", pod.GetName())

					return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
				})
			}

			suite.Assert().NoError(eg.Wait())

			suite.T().Logf("Waiting for the pods to be deleted")

			for len(pods.Items) > 0 {
				pods, err = suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
					LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
				})

				suite.Assert().NoError(err)
			}

			nodes := suite.DiscoverNodes().NodesByType(machine.TypeControlPlane)
			suite.Require().NotEmpty(nodes)

			sort.Strings(nodes)

			// recover via the first control plane node (sorted for stable selection)
			node := nodes[0]

			suite.T().Log("Recovering control plane")

			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
			defer ctxCancel()

			nodeCtx := client.WithNodes(ctx, node)

			in := &machineapi.RecoverRequest{
				Source: source,
			}

			_, err = suite.Client.MachineClient.Recover(nodeCtx, in)

			suite.Assert().NoError(err)

			// NB: using `ctx` here to have client talking to init node by default
			suite.AssertClusterHealthy(ctx)
		})
	}
}

func init() {
	allSuites = append(allSuites, new(RecoverSuite))
}
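For context, `init()` above only registers the suite into `allSuites`. A rough, self-contained sketch (not the repository's actual test runner) of how such registered suites could be driven from a single `go test` entry point; the variable declaration and `TestIntegration` function here are illustrative:

```go
// +build integration_api

package api

import (
	"testing"

	"github.com/stretchr/testify/suite"
)

// allSuites collects every registered integration suite; each suite file
// appends to it from its init().
var allSuites []suite.TestingSuite

// TestIntegration runs all registered suites under one `go test` invocation,
// e.g. `go test -tags integration_api ./...`.
func TestIntegration(t *testing.T) {
	for _, s := range allSuites {
		suite.Run(t, s)
	}
}
```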
|