talos/internal/integration/api/recover.go
Andrey Smirnov 6a7cc02648 fix: handle bootkube recover correctly, support recovery from etcd
The bootkube recover process (and `talosctl recover`) was regenerating assets
on every `recover` run, forcing the control plane back to the state it had
when the cluster was first created. This PR fixes that by running the
recover process correctly.

Recovery via etcd was fixed to handle encrypted etcd data: it follows the
way `apiserver` handles encryption at rest, and since AES-CBC is currently
the only supported encryption method, the code simply follows the same path.
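
For illustration only (not the code from this PR): a rough sketch of how a value encrypted by `apiserver` with the AES-CBC provider can be decrypted. The `decryptAESCBC` helper, the package name, and the key handling are hypothetical; the assumed payload layout (one IV block followed by PKCS#7-padded ciphertext, stored behind a `k8s:enc:aescbc:v1:<keyname>:` prefix) follows the upstream encryption-at-rest format.

package sketch // hypothetical package, for illustration only

import (
	"crypto/aes"
	"crypto/cipher"
	"errors"
)

// decryptAESCBC is a hypothetical helper sketching how an AES-CBC
// encrypted-at-rest value can be decrypted: data is the payload after the
// "k8s:enc:aescbc:v1:<keyname>:" prefix, i.e. one IV block followed by
// PKCS#7-padded ciphertext.
func decryptAESCBC(key, data []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}

	if len(data) < 2*aes.BlockSize || len(data)%aes.BlockSize != 0 {
		return nil, errors.New("invalid ciphertext length")
	}

	iv, ciphertext := data[:aes.BlockSize], data[aes.BlockSize:]

	plaintext := make([]byte, len(ciphertext))
	cipher.NewCBCDecrypter(block, iv).CryptBlocks(plaintext, ciphertext)

	// strip PKCS#7 padding
	pad := int(plaintext[len(plaintext)-1])
	if pad == 0 || pad > aes.BlockSize {
		return nil, errors.New("invalid padding")
	}

	return plaintext[:len(plaintext)-pad], nil
}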

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2020-08-18 14:24:14 -07:00

136 lines
3.4 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package api

import (
	"context"
	"sort"
	"testing"
	"time"

	"golang.org/x/sync/errgroup"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/talos-systems/talos/internal/integration/base"
	machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
	"github.com/talos-systems/talos/pkg/machinery/client"
	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)

// RecoverSuite verifies the recover API.
type RecoverSuite struct {
	base.K8sSuite

	ctx       context.Context
	ctxCancel context.CancelFunc
}

// SuiteName ...
func (suite *RecoverSuite) SuiteName() string {
	return "api.RecoverSuite"
}

// SetupTest ...
func (suite *RecoverSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// make sure we abort at some point in time, but give enough room for Recovers
	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Minute)
}

// TearDownTest ...
func (suite *RecoverSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}

// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	for _, source := range []machineapi.RecoverRequest_Source{
		machineapi.RecoverRequest_APISERVER,
		machineapi.RecoverRequest_ETCD,
	} {
		source := source

		suite.Run(source.String(), func() {
			pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
				LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
			})
			suite.Assert().NoError(err)

			var eg errgroup.Group

			for _, pod := range pods.Items {
				pod := pod

				eg.Go(func() error {
					suite.T().Logf("Deleting %s", pod.GetName())

					return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
				})
			}

			suite.Assert().NoError(eg.Wait())

			suite.T().Logf("Waiting for the pods to be deleted")

			// poll until the deleted pods are actually gone; sleep between
			// list calls so we don't hammer the API server in a tight loop
			for len(pods.Items) > 0 {
				time.Sleep(10 * time.Second)

				pods, err = suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
					LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
				})
				suite.Assert().NoError(err)
			}

			nodes := suite.DiscoverNodes().NodesByType(machine.TypeControlPlane)
			suite.Require().NotEmpty(nodes)

			sort.Strings(nodes)

			// send the recover request to the first (sorted) control plane node
			node := nodes[0]

			suite.T().Log("Recovering control plane")

			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
			defer ctxCancel()

			nodeCtx := client.WithNodes(ctx, node)

			in := &machineapi.RecoverRequest{
				Source: source,
			}

			_, err = suite.Client.MachineClient.Recover(nodeCtx, in)
			suite.Assert().NoError(err)

			// NB: using `ctx` here to have client talking to init node by default
			suite.AssertClusterHealthy(ctx)
		})
	}
}

func init() {
	allSuites = append(allSuites, new(RecoverSuite))
}