These tests rely on node uptime checks. These checks are quite flaky. Following fixes were applied:

* code was refactored as common method shared between reset/reboot tests (reboot all nodes does checks in a different way, so it wasn't updated)
* each request to read uptime times out in 5 seconds, so that checks don't wait forever when node is down (or connection is aborted)
* to account for node availability vs. lower uptime in the beginning of test, add extra elapsed time to the check condition

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
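For context, here is a minimal sketch of how a concrete integration test might drive the shared helpers in this file. The suite name, the top-level test registration, and the `Reboot` call on the Talos client are illustrative assumptions (wiring of the embedded TalosSuite configuration is omitted); the actual reboot test lives elsewhere in the repository.

// +build integration_api

package api_test

import (
	"context"
	"testing"
	"time"

	"github.com/stretchr/testify/suite"

	"github.com/talos-systems/talos/internal/integration/base"
)

// RebootSuite is a hypothetical suite reusing the shared APISuite helpers.
type RebootSuite struct {
	base.APISuite
}

func (rebootSuite *RebootSuite) TestRebootNodeByNode() {
	if !rebootSuite.Capabilities().SupportsReboot {
		rebootSuite.T().Skip("cluster doesn't support reboots")
	}

	for _, node := range rebootSuite.DiscoverNodes() {
		rebootSuite.AssertRebooted(context.Background(), node, func(nodeCtx context.Context) error {
			// assumption: the Talos client exposes a Reboot API call
			return rebootSuite.Client.Reboot(nodeCtx)
		}, 10*time.Minute)
	}
}

func TestRebootSuite(t *testing.T) {
	suite.Run(t, new(RebootSuite))
}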
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// +build integration_api

package base

import (
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"time"

	"github.com/stretchr/testify/suite"

	"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
	"github.com/talos-systems/talos/internal/pkg/cluster/check"
	"github.com/talos-systems/talos/internal/pkg/provision"
	"github.com/talos-systems/talos/internal/pkg/provision/access"
	"github.com/talos-systems/talos/pkg/client"
	"github.com/talos-systems/talos/pkg/client/config"
	"github.com/talos-systems/talos/pkg/retry"
)

// APISuite is a base suite for API tests
type APISuite struct {
	suite.Suite
	TalosSuite

	Client *client.Client
}

// SetupSuite initializes Talos API client
func (apiSuite *APISuite) SetupSuite() {
	cfg, err := config.Open(apiSuite.TalosConfig)
	apiSuite.Require().NoError(err)

	opts := []client.OptionFunc{
		client.WithConfig(cfg),
	}

	if apiSuite.Endpoint != "" {
		opts = append(opts, client.WithEndpoints(apiSuite.Endpoint))
	}

	apiSuite.Client, err = client.New(context.TODO(), opts...)
	apiSuite.Require().NoError(err)
}

// DiscoverNodes provides list of Talos nodes in the cluster.
//
// As there's no way to provide this functionality via Talos API, it works the following way:
// 1. If there's a provided cluster info, it's used.
// 2. If integration test was compiled with k8s support, k8s is used.
func (apiSuite *APISuite) DiscoverNodes() []string {
	discoveredNodes := apiSuite.TalosSuite.DiscoverNodes()
	if discoveredNodes != nil {
		return discoveredNodes
	}

	var err error

	apiSuite.discoveredNodes, err = discoverNodesK8s(apiSuite.Client, &apiSuite.TalosSuite)
	apiSuite.Require().NoError(err, "k8s discovery failed")

	if apiSuite.discoveredNodes == nil {
		// still no nodes, skip the test
		apiSuite.T().Skip("no nodes were discovered")
	}

	return apiSuite.discoveredNodes
}

// Capabilities describes current cluster allowed actions.
type Capabilities struct {
	RunsTalosKernel bool
	SupportsReboot  bool
	SupportsRecover bool
}

// Capabilities returns a set of capabilities to skip tests for different environments.
func (apiSuite *APISuite) Capabilities() Capabilities {
	v, err := apiSuite.Client.Version(context.Background())
	apiSuite.Require().NoError(err)

	caps := Capabilities{}

	if v.Messages[0].Platform != nil {
		switch v.Messages[0].Platform.Mode {
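		// in container mode (e.g. the docker-based provisioner) there is no Talos kernel,
		// and the node can't be rebooted or recovered, so every capability stays false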
		case runtime.ModeContainer.String():
		default:
			caps.RunsTalosKernel = true
			caps.SupportsReboot = true
			caps.SupportsRecover = true
		}
	}

	return caps
}

// AssertClusterHealthy verifies that cluster is healthy using provisioning checks.
func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
	if apiSuite.Cluster == nil {
		// can't assert cluster health when cluster state is not provided
		apiSuite.T().Skip("cluster health can't be verified when cluster state is not provided")
	}

	clusterAccess := access.NewAdapter(apiSuite.Cluster, provision.WithTalosClient(apiSuite.Client))
	defer clusterAccess.Close() //nolint: errcheck

	apiSuite.Require().NoError(check.Wait(ctx, clusterAccess, check.DefaultClusterChecks(), check.StderrReporter()))
}

// ReadUptime reads node uptime.
//
// Context provided might have specific node attached for API call.
func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
	// set up a short timeout around uptime read calls to work around
	// cases when rebooted node doesn't answer for a long time on requests
	reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
	defer reqCtxCancel()

	reader, errCh, err := apiSuite.Client.Read(reqCtx, "/proc/uptime")
	if err != nil {
		return 0, err
	}

	defer reader.Close() //nolint: errcheck

	var uptime float64

	n, err := fmt.Fscanf(reader, "%f", &uptime)
	if err != nil {
		return 0, err
	}

	if n != 1 {
		return 0, fmt.Errorf("not all fields scanned: %d", n)
	}

	_, err = io.Copy(ioutil.Discard, reader)
	if err != nil {
		return 0, err
	}

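	// drain the error channel returned by the streaming Read call so that
	// any error delivered after the file contents is reported to the caller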
	for err = range errCh {
		if err != nil {
			return 0, err
		}
	}

	return time.Duration(uptime * float64(time.Second)), reader.Close()
}

// AssertRebooted verifies that node got rebooted as a result of running some API call.
//
// Verification happens via reading uptime of the node.
func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
	// offset to account for uptime measurement inaccuracy
	const offset = 2 * time.Second

	// timeout for the whole reboot/reset sequence on a single node
	ctx, ctxCancel := context.WithTimeout(ctx, timeout)
	defer ctxCancel()

	nodeCtx := client.WithNodes(ctx, node)

	// read uptime before the reboot/reset call
	uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
	apiSuite.Require().NoError(err)

	apiSuite.Assert().NoError(rebootFunc(nodeCtx))

	// capture current time when API returns
	rebootTimestamp := time.Now()

	var uptimeAfter time.Duration

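	// poll uptime until it drops below the pre-reboot value; each attempt gets
	// its own 5-second timeout so an unresponsive node can't stall the check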
	apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
		requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
		defer requestCtxCancel()

		elapsed := time.Since(rebootTimestamp) - offset

		uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
		if err != nil {
			// API might be unresponsive during reboot
			return retry.ExpectedError(err)
		}

		// uptime of the node before it actually reboots still goes up linearly
		// so we can safely add elapsed time here
		if uptimeAfter >= uptimeBefore+elapsed {
			// uptime should go down after reboot
			return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
		}

		return nil
	}))

	if apiSuite.Cluster != nil {
		// without cluster state we can't do deep checks, but basic reboot test still works
		// NB: using `ctx` here to have client talking to init node by default
		apiSuite.AssertClusterHealthy(ctx)
	}
}

// TearDownSuite closes Talos API client
func (apiSuite *APISuite) TearDownSuite() {
	if apiSuite.Client != nil {
		apiSuite.Assert().NoError(apiSuite.Client.Close())
	}
}