Mirror of https://github.com/siderolabs/talos.git
test: fix and improve reboot/reset tests
These tests rely on node uptime checks, which turned out to be quite flaky. The following fixes were applied:

* the check code was refactored into a common method shared between the reset and reboot tests (the reboot-all-nodes test does its checks in a different way, so it was not converted)
* each request to read uptime now times out in 5 seconds, so the checks don't wait forever when a node is down (or the connection is aborted)
* to account for nodes that start the test with low uptime (so uptime after the reboot can quickly exceed the pre-reboot value), the time elapsed since the reboot request is added to the check condition

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
parent 51112a1d86
commit 6fb55229a2
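The commit message describes the reworked uptime check: every uptime read now runs under its own 5-second timeout, and the comparison tolerates the wall-clock time that passed between issuing the reboot and the current retry attempt. Below is a minimal, standalone sketch of that logic using only the Go standard library; readUptime, assertRebooted, and the hand-rolled polling loop are hypothetical stand-ins for illustration, not the repo's code. The actual implementation is the APISuite.AssertRebooted helper added in this commit, which goes through APISuite.ReadUptime and the repo's retry package.

package main

import (
	"context"
	"fmt"
	"time"
)

// readUptime is a stub standing in for APISuite.ReadUptime: the real method queries
// the node's uptime over the Talos API using the provided (possibly node-scoped) context.
func readUptime(ctx context.Context) (time.Duration, error) {
	_ = ctx
	return 0, fmt.Errorf("not implemented in this sketch")
}

// assertRebooted polls uptime until it drops below the pre-reboot value plus the
// wall-clock time elapsed since the reboot was requested (minus a small offset).
func assertRebooted(ctx context.Context, uptimeBefore time.Duration, rebootTimestamp time.Time, timeout time.Duration) error {
	const offset = 2 * time.Second // tolerance for uptime measurement inaccuracy

	deadline := time.Now().Add(timeout)

	for time.Now().Before(deadline) {
		// each uptime read gets its own short timeout, so a dead node
		// (or an aborted connection) can't stall the whole check
		requestCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		uptimeAfter, err := readUptime(requestCtx)
		cancel()

		// until the node actually reboots, its uptime keeps growing linearly,
		// so compare against the pre-reboot value plus elapsed wall-clock time
		elapsed := time.Since(rebootTimestamp) - offset

		if err == nil && uptimeAfter < uptimeBefore+elapsed {
			return nil // uptime went down: the node rebooted
		}

		time.Sleep(time.Second) // errors are expected while the node is down; keep retrying
	}

	return fmt.Errorf("node did not reboot within %s", timeout)
}

func main() {
	// illustrative only: a short overall timeout so the demo terminates quickly,
	// since readUptime is stubbed to always fail
	err := assertRebooted(context.Background(), 5*time.Minute, time.Now(), 3*time.Second)
	fmt.Println(err)
}

The per-read timeout keeps a single unreachable node from consuming the entire retry budget, while the outer deadline still bounds the whole check.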
@@ -57,43 +57,9 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
 	for _, node := range nodes {
 		suite.T().Log("rebooting node", node)

-		func(node string) {
-			// timeout for single node reboot
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 10*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before reboot
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
-			suite.Assert().NoError(suite.Client.Reboot(nodeCtx))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			if suite.Cluster != nil {
-				// without cluster state we can't do deep checks, but basic reboot test still works
-				// NB: using `ctx` here to have client talking to init node by default
-				suite.AssertClusterHealthy(ctx)
-			}
-		}(node)
-
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+			return suite.Client.Reboot(nodeCtx)
+		}, 10*time.Minute)
 	}
 }

@@ -103,6 +69,9 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.T().Skip("cluster doesn't support reboots")
 	}

+	// offset to account for uptime measuremenet inaccuracy
+	const offset = 2 * time.Second
+
 	nodes := suite.DiscoverNodes()
 	suite.Require().NotEmpty(nodes)

@@ -131,6 +100,8 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.Require().NoError(<-errCh)
 	}

+	rebootTimestamp := time.Now()
+
 	allNodesCtx := client.WithNodes(suite.ctx, nodes...)

 	suite.Require().NoError(suite.Client.Reboot(allNodesCtx))
@@ -143,20 +114,27 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 			return fmt.Errorf("uptime record not found for %q", node)
 		}

-		uptimeBefore := uptimeBeforeInterface.(float64) //nolint: errcheck
+		uptimeBefore := uptimeBeforeInterface.(time.Duration) //nolint: errcheck

 		nodeCtx := client.WithNodes(suite.ctx, node)

 		return retry.Constant(10 * time.Minute).Retry(func() error {
-			uptimeAfter, err := suite.ReadUptime(nodeCtx)
+			requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+			defer requestCtxCancel()
+
+			elapsed := time.Since(rebootTimestamp) - offset
+
+			uptimeAfter, err := suite.ReadUptime(requestCtx)
 			if err != nil {
 				// API might be unresponsive during reboot
 				return retry.ExpectedError(fmt.Errorf("error reading uptime for node %q: %w", node, err))
 			}

-			if uptimeAfter >= uptimeBefore {
+			// uptime of the node before it actually reboots still goes up linearly
+			// so we can safely add elapsed time here
+			if uptimeAfter >= uptimeBefore+elapsed {
 				// uptime should go down after reboot
-				return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %f, after %f", node, uptimeBefore, uptimeAfter))
+				return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %s + %s, after %s", node, uptimeBefore, elapsed, uptimeAfter))
 			}

 			return nil
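To see why the elapsed-time allowance matters, consider purely illustrative numbers (not taken from any actual run): a node reports 30s of uptime just before the reboot request, comes back up, and the first successful uptime read happens 100s after the request, returning roughly 40s. The old condition uptimeAfter >= uptimeBefore (40 >= 30) would keep reporting "uptime didn't go down" until the retry deadline expired, failing a test whose node did in fact reboot. With elapsed = 100s minus the 2s offset, the new condition compares 40s against 30s + 98s and accepts the reboot, while a node that never rebooted would report about 130s >= 128s and still be flagged.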
@@ -8,14 +8,11 @@ package api

 import (
 	"context"
-	"fmt"
 	"sort"
 	"testing"
 	"time"

 	"github.com/talos-systems/talos/internal/integration/base"
-	"github.com/talos-systems/talos/pkg/client"
-	"github.com/talos-systems/talos/pkg/retry"
 )

 type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {

 		suite.T().Log("Resetting node", node)

-		func(node string) {
-			// timeout for single node Reset
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before Reset
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
+		// uptime should go down after Reset, as it reboots the node
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
 			// force reboot after reset, as this is the only mode we can test
-			suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after Reset, as it reboots the node
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			// TODO: there is no good way to assert that node was reset and disk contents were really wiped
-
-			// NB: using `ctx` here to have client talking to init node by default
-			suite.AssertClusterHealthy(ctx)
-		}(node)
+			return suite.Client.Reset(nodeCtx, true, true)
+		}, 10*time.Minute)

+		// TODO: there is no good way to assert that node was reset and disk contents were really wiped
 	}
 }

@@ -21,6 +21,7 @@ import (
 	"github.com/talos-systems/talos/internal/pkg/provision/access"
 	"github.com/talos-systems/talos/pkg/client"
 	"github.com/talos-systems/talos/pkg/client/config"
+	"github.com/talos-systems/talos/pkg/retry"
 )

 // APISuite is a base suite for API tests
@@ -115,7 +116,7 @@ func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
 // ReadUptime reads node uptime.
 //
 // Context provided might have specific node attached for API call.
-func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
+func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
 	// set up a short timeout around uptime read calls to work around
 	// cases when rebooted node doesn't answer for a long time on requests
 	reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
@@ -150,7 +151,60 @@ func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
 		}
 	}

-	return uptime, reader.Close()
+	return time.Duration(uptime * float64(time.Second)), reader.Close()
+}
+
+// AssertRebooted verifies that node got rebooted as result of running some API call.
+//
+// Verification happens via reading uptime of the node.
+func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
+	// offset to account for uptime measuremenet inaccuracy
+	const offset = 2 * time.Second
+
+	// timeout for single node Reset
+	ctx, ctxCancel := context.WithTimeout(ctx, timeout)
+	defer ctxCancel()
+
+	nodeCtx := client.WithNodes(ctx, node)
+
+	// read uptime before Reset
+	uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
+	apiSuite.Require().NoError(err)
+
+	apiSuite.Assert().NoError(rebootFunc(nodeCtx))
+
+	// capture current time when API returns
+	rebootTimestamp := time.Now()
+
+	var uptimeAfter time.Duration
+
+	apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
+		requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+		defer requestCtxCancel()
+
+		elapsed := time.Since(rebootTimestamp) - offset
+
+		uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
+		if err != nil {
+			// API might be unresponsive during reboot
+			return retry.ExpectedError(err)
+		}
+
+		// uptime of the node before it actually reboots still goes up linearly
+		// so we can safely add elapsed time here
+		if uptimeAfter >= uptimeBefore+elapsed {
+			// uptime should go down after reboot
+			return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
+		}
+
+		return nil
+	}))
+
+	if apiSuite.Cluster != nil {
+		// without cluster state we can't do deep checks, but basic reboot test still works
+		// NB: using `ctx` here to have client talking to init node by default
+		apiSuite.AssertClusterHealthy(ctx)
+	}
 }

 // TearDownSuite closes Talos API client