talos/internal/integration/api/recover.go
Andrew Rynhard 56d7bf19fe feat: add recovery API
This adds an API for recovering the self-hosted control plane.

Signed-off-by: Andrew Rynhard <andrew@andrewrynhard.com>
2020-05-04 19:38:30 -07:00

111 lines
2.6 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// +build integration_api
package api
import (
"context"
"sort"
"testing"
"time"
"golang.org/x/sync/errgroup"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/talos-systems/talos/api/machine"
"github.com/talos-systems/talos/internal/integration/base"
"github.com/talos-systems/talos/pkg/client"
)
// RecoverSuite verifies the control plane recovery API.
type RecoverSuite struct {
base.K8sSuite
// ctx is the per-test context; it is created in SetupTest.
ctx context.Context
// ctxCancel cancels ctx; it is invoked from TearDownTest.
ctxCancel context.CancelFunc
}
// SuiteName returns the fixed name of this suite.
func (suite *RecoverSuite) SuiteName() string {
	const name = "api.RecoverSuite"

	return name
}
// SetupTest skips the test in short mode; otherwise it creates the per-test
// context and stores it, together with its cancel function, on the suite.
func (suite *RecoverSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// Bound the whole test so it is eventually aborted, while still leaving
	// plenty of room for the recovery itself.
	testCtx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)

	suite.ctx = testCtx
	suite.ctxCancel = cancel
}
// TearDownTest releases the per-test context created by SetupTest.
//
// SetupTest only assigns the cancel function after its short-mode check, so
// ctxCancel can still be nil when the setup was skipped. Calling a nil
// function panics, hence the guard.
func (suite *RecoverSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}
// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
//
// The test is skipped when the cluster does not advertise recovery support or
// when no cluster state is available. Every step that later steps depend on is
// checked with Require, so the test stops at the first failure instead of
// carrying on with an unusable pod list or waiting for a cluster that was
// never asked to recover successfully.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
		LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
	})
	// The pod list is ranged over right below, so a failed listing must abort the test.
	suite.Require().NoError(err)

	var eg errgroup.Group

	for _, pod := range pods.Items {
		pod := pod // per-iteration copy captured by the goroutine below

		eg.Go(func() error {
			suite.T().Logf("Deleting %s", pod.GetName())

			return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
		})
	}

	// Wait for all deletions; if any of them failed there is no point in going on to the recovery.
	suite.Require().NoError(eg.Wait())

	nodes := suite.DiscoverNodes()
	suite.Require().NotEmpty(nodes)

	// Sort so that the node picked for the recover request is deterministic.
	sort.Strings(nodes)

	node := nodes[0]

	suite.T().Log("Recovering control plane")

	ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
	defer ctxCancel()

	nodeCtx := client.WithNodes(ctx, node)

	in := &machine.RecoverRequest{
		Source: machine.RecoverRequest_APISERVER,
	}

	_, err = suite.Client.MachineClient.Recover(nodeCtx, in)
	// Stop here on failure rather than spending the remaining timeout on the health check below.
	suite.Require().NoError(err)

	// NB: using `ctx` here to have client talking to init node by default
	suite.AssertClusterHealthy(ctx)
}
// init appends a RecoverSuite instance to allSuites.
func init() {
	allSuites = append(allSuites, &RecoverSuite{})
}