test: fix and improve reboot/reset tests

These tests rely on node uptime checks, and those checks have proven
quite flaky.

The following fixes were applied:

* the check code was refactored into a common method shared between the
reset and reboot tests (the reboot-all-nodes test does its checks in a
different way, so it wasn't updated)

* each request to read uptime now times out after 5 seconds, so that the
checks don't wait forever when a node is down (or the connection is
aborted)

* to account for the node still being available (with its uptime still
growing) at the beginning of the check, the time elapsed since the
reboot request is added to the check condition (see the sketch below)
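
To illustrate the last point, here is a minimal sketch of the check
condition (the function name and signature are hypothetical, not code
from this commit): a node that has not rebooted yet reports an uptime
that keeps growing at wall-clock rate, so it stays at or above
uptimeBefore + elapsed, while a rebooted node falls below that bound
even if its fresh uptime has already passed uptimeBefore.

package sketch

import "time"

// rebootConfirmed sketches the uptime-based reboot check described
// above; offset gives some slack for uptime measurement inaccuracy.
func rebootConfirmed(uptimeBefore, uptimeAfter time.Duration, rebootRequestedAt time.Time) bool {
    const offset = 2 * time.Second

    // a node that hasn't rebooted yet reports roughly uptimeBefore + elapsed,
    // so only a genuinely rebooted node can drop below this bound
    elapsed := time.Since(rebootRequestedAt) - offset

    return uptimeAfter < uptimeBefore+elapsed
}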

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
Andrey Smirnov 2020-06-29 23:14:39 +03:00 committed by talos-bot
parent 51112a1d86
commit 6fb55229a2
3 changed files with 80 additions and 81 deletions

@@ -57,43 +57,9 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
     for _, node := range nodes {
         suite.T().Log("rebooting node", node)

-        func(node string) {
-            // timeout for single node reboot
-            ctx, ctxCancel := context.WithTimeout(suite.ctx, 10*time.Minute)
-            defer ctxCancel()
-
-            nodeCtx := client.WithNodes(ctx, node)
-
-            // read uptime before reboot
-            uptimeBefore, err := suite.ReadUptime(nodeCtx)
-            suite.Require().NoError(err)
-
-            suite.Assert().NoError(suite.Client.Reboot(nodeCtx))
-
-            var uptimeAfter float64
-
-            suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-                uptimeAfter, err = suite.ReadUptime(nodeCtx)
-                if err != nil {
-                    // API might be unresponsive during reboot
-                    return retry.ExpectedError(err)
-                }
-
-                if uptimeAfter >= uptimeBefore {
-                    // uptime should go down after reboot
-                    return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-                }
-
-                return nil
-            }))
-
-            if suite.Cluster != nil {
-                // without cluster state we can't do deep checks, but basic reboot test still works
-                // NB: using `ctx` here to have client talking to init node by default
-                suite.AssertClusterHealthy(ctx)
-            }
-        }(node)
+        suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+            return suite.Client.Reboot(nodeCtx)
+        }, 10*time.Minute)
     }
 }
@@ -103,6 +69,9 @@ func (suite *RebootSuite) TestRebootAllNodes() {
         suite.T().Skip("cluster doesn't support reboots")
     }

+    // offset to account for uptime measuremenet inaccuracy
+    const offset = 2 * time.Second
+
     nodes := suite.DiscoverNodes()
     suite.Require().NotEmpty(nodes)
@@ -131,6 +100,8 @@ func (suite *RebootSuite) TestRebootAllNodes() {
         suite.Require().NoError(<-errCh)
     }

+    rebootTimestamp := time.Now()
+
     allNodesCtx := client.WithNodes(suite.ctx, nodes...)
     suite.Require().NoError(suite.Client.Reboot(allNodesCtx))
@@ -143,20 +114,27 @@
             return fmt.Errorf("uptime record not found for %q", node)
         }

-        uptimeBefore := uptimeBeforeInterface.(float64) //nolint: errcheck
+        uptimeBefore := uptimeBeforeInterface.(time.Duration) //nolint: errcheck

         nodeCtx := client.WithNodes(suite.ctx, node)

         return retry.Constant(10 * time.Minute).Retry(func() error {
-            uptimeAfter, err := suite.ReadUptime(nodeCtx)
+            requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+            defer requestCtxCancel()
+
+            elapsed := time.Since(rebootTimestamp) - offset
+
+            uptimeAfter, err := suite.ReadUptime(requestCtx)
             if err != nil {
                 // API might be unresponsive during reboot
                 return retry.ExpectedError(fmt.Errorf("error reading uptime for node %q: %w", node, err))
             }

-            if uptimeAfter >= uptimeBefore {
+            // uptime of the node before it actually reboots still goes up linearly
+            // so we can safely add elapsed time here
+            if uptimeAfter >= uptimeBefore+elapsed {
                 // uptime should go down after reboot
-                return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %f, after %f", node, uptimeBefore, uptimeAfter))
+                return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %s + %s, after %s", node, uptimeBefore, elapsed, uptimeAfter))
             }

             return nil

@@ -8,14 +8,11 @@ package api
 import (
     "context"
-    "fmt"
     "sort"
     "testing"
     "time"

     "github.com/talos-systems/talos/internal/integration/base"
-    "github.com/talos-systems/talos/pkg/client"
-    "github.com/talos-systems/talos/pkg/retry"
 )

 type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {
         suite.T().Log("Resetting node", node)

-        func(node string) {
-            // timeout for single node Reset
-            ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
-            defer ctxCancel()
-
-            nodeCtx := client.WithNodes(ctx, node)
-
-            // read uptime before Reset
-            uptimeBefore, err := suite.ReadUptime(nodeCtx)
-            suite.Require().NoError(err)
-
-            // force reboot after reset, as this is the only mode we can test
-            suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
-
-            var uptimeAfter float64
-
-            suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-                uptimeAfter, err = suite.ReadUptime(nodeCtx)
-                if err != nil {
-                    // API might be unresponsive during reboot
-                    return retry.ExpectedError(err)
-                }
-
-                if uptimeAfter >= uptimeBefore {
-                    // uptime should go down after Reset, as it reboots the node
-                    return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-                }
-
-                return nil
-            }))
-
-            // TODO: there is no good way to assert that node was reset and disk contents were really wiped
-
-            // NB: using `ctx` here to have client talking to init node by default
-            suite.AssertClusterHealthy(ctx)
-        }(node)
+        // uptime should go down after Reset, as it reboots the node
+        suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+            // force reboot after reset, as this is the only mode we can test
+            return suite.Client.Reset(nodeCtx, true, true)
+        }, 10*time.Minute)
+
+        // TODO: there is no good way to assert that node was reset and disk contents were really wiped
     }
 }

@@ -21,6 +21,7 @@
 import (
     "github.com/talos-systems/talos/internal/pkg/provision/access"
     "github.com/talos-systems/talos/pkg/client"
     "github.com/talos-systems/talos/pkg/client/config"
+    "github.com/talos-systems/talos/pkg/retry"
 )

 // APISuite is a base suite for API tests
@@ -115,7 +116,7 @@ func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
 // ReadUptime reads node uptime.
 //
 // Context provided might have specific node attached for API call.
-func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
+func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
     // set up a short timeout around uptime read calls to work around
     // cases when rebooted node doesn't answer for a long time on requests
     reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
@@ -150,7 +151,60 @@ func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
         }
     }

-    return uptime, reader.Close()
+    return time.Duration(uptime * float64(time.Second)), reader.Close()
+}
+
+// AssertRebooted verifies that node got rebooted as result of running some API call.
+//
+// Verification happens via reading uptime of the node.
+func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
+    // offset to account for uptime measuremenet inaccuracy
+    const offset = 2 * time.Second
+
+    // timeout for single node Reset
+    ctx, ctxCancel := context.WithTimeout(ctx, timeout)
+    defer ctxCancel()
+
+    nodeCtx := client.WithNodes(ctx, node)
+
+    // read uptime before Reset
+    uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
+    apiSuite.Require().NoError(err)
+
+    apiSuite.Assert().NoError(rebootFunc(nodeCtx))
+
+    // capture current time when API returns
+    rebootTimestamp := time.Now()
+
+    var uptimeAfter time.Duration
+
+    apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
+        requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+        defer requestCtxCancel()
+
+        elapsed := time.Since(rebootTimestamp) - offset
+
+        uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
+        if err != nil {
+            // API might be unresponsive during reboot
+            return retry.ExpectedError(err)
+        }
+
+        // uptime of the node before it actually reboots still goes up linearly
+        // so we can safely add elapsed time here
+        if uptimeAfter >= uptimeBefore+elapsed {
+            // uptime should go down after reboot
+            return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
+        }
+
+        return nil
+    }))
+
+    if apiSuite.Cluster != nil {
+        // without cluster state we can't do deep checks, but basic reboot test still works
+        // NB: using `ctx` here to have client talking to init node by default
+        apiSuite.AssertClusterHealthy(ctx)
+    }
 }

 // TearDownSuite closes Talos API client