talos/internal/integration/api/recover.go
Andrey Smirnov 6a7cc02648 fix: handle bootkube recover correctly, support recovery from etcd
The bootkube recover process (and `talosctl recover`) was regenerating assets
on every `recover` run, forcing the control plane back to the state it had
when the cluster was first created. This PR fixes that by running the
recover process correctly.

Recovery via etcd was fixed to handle encrypted etcd data: it follows the
way `apiserver` handles encryption at rest, and since AES-CBC is currently
the only supported encryption method, the code simply follows the same path.
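
For illustration only (not the code from this PR): a rough sketch of how a value encrypted by `apiserver` with the AES-CBC provider can be decrypted. The `decryptAESCBC` helper, the package name, and the key handling are hypothetical; the assumed payload layout (one IV block followed by PKCS#7-padded ciphertext, stored behind a `k8s:enc:aescbc:v1:<keyname>:` prefix) follows the upstream encryption-at-rest format.

package sketch // hypothetical package, for illustration only

import (
	"crypto/aes"
	"crypto/cipher"
	"errors"
)

// decryptAESCBC is a hypothetical helper sketching how an AES-CBC
// encrypted-at-rest value can be decrypted: data is the payload after the
// "k8s:enc:aescbc:v1:<keyname>:" prefix, i.e. one IV block followed by
// PKCS#7-padded ciphertext.
func decryptAESCBC(key, data []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}

	if len(data) < 2*aes.BlockSize || len(data)%aes.BlockSize != 0 {
		return nil, errors.New("invalid ciphertext length")
	}

	iv, ciphertext := data[:aes.BlockSize], data[aes.BlockSize:]

	plaintext := make([]byte, len(ciphertext))
	cipher.NewCBCDecrypter(block, iv).CryptBlocks(plaintext, ciphertext)

	// strip PKCS#7 padding
	pad := int(plaintext[len(plaintext)-1])
	if pad == 0 || pad > aes.BlockSize {
		return nil, errors.New("invalid padding")
	}

	return plaintext[:len(plaintext)-pad], nil
}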

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2020-08-18 14:24:14 -07:00

136 lines
3.4 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package api

import (
	"context"
	"sort"
	"testing"
	"time"

	"golang.org/x/sync/errgroup"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/talos-systems/talos/internal/integration/base"
	machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
	"github.com/talos-systems/talos/pkg/machinery/client"
	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)

// RecoverSuite verifies the recover API.
type RecoverSuite struct {
	base.K8sSuite

	ctx       context.Context
	ctxCancel context.CancelFunc
}

// SuiteName ...
func (suite *RecoverSuite) SuiteName() string {
	return "api.RecoverSuite"
}

// SetupTest ...
func (suite *RecoverSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// make sure we abort at some point in time, but give enough room for Recovers
	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Minute)
}

// TearDownTest ...
func (suite *RecoverSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}

// TestRecoverControlPlane removes the control plane components and attempts to recover them with the recover API.
func (suite *RecoverSuite) TestRecoverControlPlane() {
	if !suite.Capabilities().SupportsRecover {
		suite.T().Skip("cluster doesn't support recovery")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state recover test is not reliable (can't wait for cluster readiness)")
	}

	for _, source := range []machineapi.RecoverRequest_Source{
		machineapi.RecoverRequest_APISERVER,
		machineapi.RecoverRequest_ETCD,
	} {
		source := source

		suite.Run(source.String(), func() {
			pods, err := suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
				LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
			})
			suite.Assert().NoError(err)

			var eg errgroup.Group

			for _, pod := range pods.Items {
				pod := pod

				eg.Go(func() error {
					suite.T().Logf("Deleting %s", pod.GetName())

					return suite.Clientset.CoreV1().Pods(pod.GetNamespace()).Delete(suite.ctx, pod.GetName(), metav1.DeleteOptions{})
				})
			}

			suite.Assert().NoError(eg.Wait())

			suite.T().Logf("Waiting for the pods to be deleted")

			// poll until the deleted pods are actually gone; sleep between
			// list calls so we don't hammer the API server in a tight loop
			for len(pods.Items) > 0 {
				time.Sleep(10 * time.Second)

				pods, err = suite.Clientset.CoreV1().Pods("kube-system").List(suite.ctx, metav1.ListOptions{
					LabelSelector: "k8s-app in (kube-scheduler,kube-controller-manager)",
				})
				suite.Assert().NoError(err)
			}

			nodes := suite.DiscoverNodes().NodesByType(machine.TypeControlPlane)
			suite.Require().NotEmpty(nodes)

			sort.Strings(nodes)

			// send the recover request to the first (sorted) control plane node
			node := nodes[0]

			suite.T().Log("Recovering control plane")

			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
			defer ctxCancel()

			nodeCtx := client.WithNodes(ctx, node)

			in := &machineapi.RecoverRequest{
				Source: source,
			}

			_, err = suite.Client.MachineClient.Recover(nodeCtx, in)
			suite.Assert().NoError(err)

			// NB: using `ctx` here to have client talking to init node by default
			suite.AssertClusterHealthy(ctx)
		})
	}
}

func init() {
	allSuites = append(allSuites, new(RecoverSuite))
}