talos/internal/integration/api/recover.go
Andrey Smirnov 3d8418a689 feat: force nodes to be set in talosctl commands using the API
With load-balancing enabled by default, running `talosctl` without
`--nodes` is risky, as the request might hit any control plane node.

Only two commands do not enforce this check, as they do their own node
contexts: `crashdump` and `health` (client-side).

Integration tests were updated to always supply `--nodes` cli argument,
while doing that I refactored the storage for discovered nodes to use
existing `cluster.Info` interface.

The downside is that with e2e CAPI tests CLI tests will be mostly
skipped as we don't support discovery in CLI tests at the moment. This
can be fixed by using `talosctl kubeconfig` + `kubectl get nodes` for
node discovery.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2020-07-21 12:17:43 -07:00

114 lines
2.7 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// +build integration_api
package api
import (
"context"
"sort"
"testing"
"time"
"golang.org/x/sync/errgroup"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/talos-systems/talos/api/machine"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
"github.com/talos-systems/talos/internal/integration/base"
"github.com/talos-systems/talos/pkg/client"
)
// RecoverSuite is the integration test suite exercising the recover API.
//
// It embeds base.K8sSuite and keeps a per-test context together with its
// cancel function.
type RecoverSuite struct {
base.K8sSuite
// ctx is the per-test context; it is created in SetupTest.
ctx context.Context
// ctxCancel cancels ctx; it is invoked from TearDownTest when non-nil.
ctxCancel context.CancelFunc
}
// SuiteName returns the name identifying this suite.
func (suite *RecoverSuite) SuiteName() string {
	const name = "api.RecoverSuite"

	return name
}
// SetupTest prepares the per-test context.
//
// The test is skipped in short mode; otherwise a context with a 30 minute
// deadline is stored on the suite along with its cancel function.
func (suite *RecoverSuite) SetupTest() {
	// upper bound for a single test: aborts eventually, yet leaves enough room for Recovers
	const testTimeout = 30 * time.Minute

	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), testTimeout)
}
// TearDownTest cancels the per-test context, if one was created.
func (suite *RecoverSuite) TearDownTest() {
	// ctxCancel is nil when no context was set up (e.g. the test was skipped in SetupTest)
	if suite.ctxCancel == nil {
		return
	}

	suite.ctxCancel()
}
// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
//
// Steps:
//  1. Skip unless the cluster supports recovery and full cluster state is available.
//  2. Delete, concurrently, every pod in kube-system whose k8s-app label is
//     kube-scheduler or kube-controller-manager.
//  3. Send a Recover request (source: API server) to the first control plane
//     node in sorted order.
//  4. Check cluster health via AssertClusterHealthy.
//
// Failures while listing or deleting the pods abort the test immediately,
// since the rest of the test depends on those steps having succeeded.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
		LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
	})
	// precondition: stop here on failure instead of iterating over a result we can't trust
	suite.Require().NoError(err)

	var eg errgroup.Group

	for _, pod := range pods.Items {
		pod := pod // per-iteration copy captured by the closure below

		eg.Go(func() error {
			suite.T().Logf("Deleting %s", pod.GetName())

			return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
		})
	}

	// precondition: recovery is only meaningful once the pods were actually removed
	suite.Require().NoError(eg.Wait())

	nodes := suite.DiscoverNodes().NodesByType(runtime.MachineTypeControlPlane)
	suite.Require().NotEmpty(nodes)

	// sort so that the target node choice is deterministic
	sort.Strings(nodes)

	node := nodes[0]

	suite.T().Log("Recovering control plane")

	// bound the recover call and the health check separately from the overall test deadline
	ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
	defer ctxCancel()

	nodeCtx := client.WithNodes(ctx, node)

	in := &machine.RecoverRequest{
		Source: machine.RecoverRequest_APISERVER,
	}

	_, err = suite.Client.MachineClient.Recover(nodeCtx, in)
	// deliberately non-fatal: the health check below still runs and reports the cluster state
	suite.Assert().NoError(err)

	// NB: using `ctx` here to have client talking to init node by default
	suite.AssertClusterHealthy(ctx)
}
// init adds RecoverSuite to the package-level allSuites list.
func init() {
	allSuites = append(allSuites, &RecoverSuite{})
}