talos/internal/integration/api/reset.go
Andrey Smirnov 350280eb59 feat: implement "staged" (failsafe/backup) upgrades
The regular upgrade path takes just one reboot, but it requires all
processes on the node to be stopped before the upgrade can proceed.
Under some circumstances, and with potential Talos bugs, this might not
work, rendering Talos upgrades almost impossible.

Staged upgrades build upon the regular install flow to run the upgrade
on node reboot. Such upgrades require two reboots of the node and two
pulls of the installer image, but they should be much less susceptible
to failure. Once the upgrade is staged, the node can be rebooted in any
possible way, including a hard reset, and the upgrade is performed on
the next boot.

A new ADV format was implemented as well to store the install image
ref/options across reboots. The new format allows for bigger values and
takes 50% of the `META` partition. The old ADV is still kept for
compatibility reasons.
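
Conceptually, such a store holds tagged values with explicit sizes,
which is what lets larger values fit; the sketch below is illustrative
only (names and layout are hypothetical, not the actual on-disk format):

    // Hypothetical shape of a single entry in an ADV-like store.
    type entry struct {
        Tag  uint8  // identifies the value, e.g. the staged installer image ref
        Size uint32 // length of Data in bytes
        Data []byte // the stored value, e.g. image ref/options
    }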

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2020-12-08 08:34:26 -08:00


// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package api

import (
	"context"
	"sort"
	"testing"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	"github.com/talos-systems/talos/internal/integration/base"
	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
)
// ResetSuite verifies the Reset API.
type ResetSuite struct {
	base.APISuite

	ctx       context.Context
	ctxCancel context.CancelFunc
}
// SuiteName returns the name of the suite.
func (suite *ResetSuite) SuiteName() string {
	return "api.ResetSuite"
}
// SetupTest sets up the per-test context.
func (suite *ResetSuite) SetupTest() {
	if testing.Short() {
		suite.T().Skip("skipping in short mode")
	}

	// make sure we abort at some point in time, but give enough room for Resets
	suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Minute)
}
// TearDownTest cancels the per-test context.
func (suite *ResetSuite) TearDownTest() {
	if suite.ctxCancel != nil {
		suite.ctxCancel()
	}
}
// TestResetNodeByNode Resets cluster node by node, waiting for health between Resets.
func (suite *ResetSuite) TestResetNodeByNode() {
	if !suite.Capabilities().SupportsReboot {
		suite.T().Skip("cluster doesn't support reboot (and reset)")
	}

	if suite.Cluster == nil {
		suite.T().Skip("without full cluster state reset test is not reliable (can't wait for cluster readiness in between resets)")
	}

	initNodeAddress := ""

	for _, node := range suite.Cluster.Info().Nodes {
		if node.Type == machine.TypeInit {
			initNodeAddress = node.PrivateIP.String()

			break
		}
	}

	nodes := suite.DiscoverNodes().Nodes()
	suite.Require().NotEmpty(nodes)

	sort.Strings(nodes)

	for _, node := range nodes {
		if node == initNodeAddress {
			// due to the bug with etcd cluster build for the init node after Reset(), skip resetting first node
			// there's no problem if bootstrap API was used, so this check only protects legacy init nodes
			suite.T().Log("Skipping init node", node, "due to known issue with etcd")

			continue
		}

		suite.T().Log("Resetting node", node)

		// TODO: there is no good way to assert that node was reset and disk contents were really wiped
		// uptime should go down after Reset, as it reboots the node
		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
			// force reboot after reset, as this is the only mode we can test
			err := suite.Client.Reset(nodeCtx, true, true)
			if err != nil {
				if s, ok := status.FromError(err); ok && s.Code() == codes.Unavailable {
					// ignore errors if reboot happens before response is fully received
					err = nil
				}
			}

			return err
		}, 10*time.Minute)
	}
}
func init() {
	allSuites = append(allSuites, new(ResetSuite))
}
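
// The allSuites registry above is how each suite makes itself available to
// the package's test entry point. A minimal sketch of how such a registry is
// typically driven (assuming the suites embed testify's suite.Suite via
// base.APISuite; the TestIntegration name here is hypothetical):
//
//	func TestIntegration(t *testing.T) {
//		for _, s := range allSuites {
//			suite.Run(t, s)
//		}
//	}
//
// Since the file is guarded by the integration_api build tag, it only
// compiles when that tag is set, e.g.:
//
//	go test -tags integration_api ./internal/integration/...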