From 6fb55229a22cd13dff498d3b051b4fe7b347d49c Mon Sep 17 00:00:00 2001
From: Andrey Smirnov
Date: Mon, 29 Jun 2020 23:14:39 +0300
Subject: [PATCH] test: fix and improve reboot/reset tests

These tests rely on node uptime checks, and these checks are quite
flaky. The following fixes were applied:

* the check code was refactored into a common method shared between the
  reset and reboot tests (the reboot-all-nodes test does its checks in a
  different way, so it wasn't updated)
* each request to read uptime times out in 5 seconds, so that the checks
  don't wait forever when a node is down (or the connection is aborted)
* to account for node availability vs. lower uptime at the beginning of
  the test, the time elapsed since the reboot request is added to the
  check condition

Signed-off-by: Andrey Smirnov
---
 internal/integration/api/reboot.go | 60 ++++++++++--------------
 internal/integration/api/reset.go  | 43 +++------------------
 internal/integration/base/api.go   | 58 ++++++++++++++++++++++++++++-
 3 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/internal/integration/api/reboot.go b/internal/integration/api/reboot.go
index 117850719..46d71d565 100644
--- a/internal/integration/api/reboot.go
+++ b/internal/integration/api/reboot.go
@@ -57,43 +57,9 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
 	for _, node := range nodes {
 		suite.T().Log("rebooting node", node)
 
-		func(node string) {
-			// timeout for single node reboot
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 10*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before reboot
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
-			suite.Assert().NoError(suite.Client.Reboot(nodeCtx))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			if suite.Cluster != nil {
-				// without cluster state we can't do deep checks, but basic reboot test still works
-				// NB: using `ctx` here to have client talking to init node by default
-				suite.AssertClusterHealthy(ctx)
-			}
-		}(node)
-
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+			return suite.Client.Reboot(nodeCtx)
+		}, 10*time.Minute)
 	}
 }
 
@@ -103,6 +69,9 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.T().Skip("cluster doesn't support reboots")
 	}
 
+	// offset to account for uptime measurement inaccuracy
+	const offset = 2 * time.Second
+
 	nodes := suite.DiscoverNodes()
 	suite.Require().NotEmpty(nodes)
 
@@ -131,6 +100,8 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.Require().NoError(<-errCh)
 	}
 
+	rebootTimestamp := time.Now()
+
 	allNodesCtx := client.WithNodes(suite.ctx, nodes...)
 
 	suite.Require().NoError(suite.Client.Reboot(allNodesCtx))
@@ -143,20 +114,27 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 				return fmt.Errorf("uptime record not found for %q", node)
 			}
 
-			uptimeBefore := uptimeBeforeInterface.(float64) //nolint: errcheck
+			uptimeBefore := uptimeBeforeInterface.(time.Duration) //nolint: errcheck
 
 			nodeCtx := client.WithNodes(suite.ctx, node)
 
 			return retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err := suite.ReadUptime(nodeCtx)
+				requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+				defer requestCtxCancel()
+
+				elapsed := time.Since(rebootTimestamp) - offset
+
+				uptimeAfter, err := suite.ReadUptime(requestCtx)
 				if err != nil {
 					// API might be unresponsive during reboot
 					return retry.ExpectedError(fmt.Errorf("error reading uptime for node %q: %w", node, err))
 				}
 
-				if uptimeAfter >= uptimeBefore {
+				// uptime of the node before it actually reboots still goes up linearly
+				// so we can safely add elapsed time here
+				if uptimeAfter >= uptimeBefore+elapsed {
 					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %f, after %f", node, uptimeBefore, uptimeAfter))
+					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %s + %s, after %s", node, uptimeBefore, elapsed, uptimeAfter))
 				}
 
 				return nil
diff --git a/internal/integration/api/reset.go b/internal/integration/api/reset.go
index 2110b467d..411df538e 100644
--- a/internal/integration/api/reset.go
+++ b/internal/integration/api/reset.go
@@ -8,14 +8,11 @@ package api
 
 import (
 	"context"
-	"fmt"
 	"sort"
 	"testing"
 	"time"
 
 	"github.com/talos-systems/talos/internal/integration/base"
-	"github.com/talos-systems/talos/pkg/client"
-	"github.com/talos-systems/talos/pkg/retry"
 )
 
 type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {
 
 		suite.T().Log("Resetting node", node)
 
-		func(node string) {
-			// timeout for single node Reset
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before Reset
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
+		// uptime should go down after Reset, as it reboots the node
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
 			// force reboot after reset, as this is the only mode we can test
-			suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after Reset, as it reboots the node
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			// TODO: there is no good way to assert that node was reset and disk contents were really wiped
-
-			// NB: using `ctx` here to have client talking to init node by default
-			suite.AssertClusterHealthy(ctx)
-		}(node)
+			return suite.Client.Reset(nodeCtx, true, true)
+		}, 10*time.Minute)
+
+		// TODO: there is no good way to assert that node was reset and disk contents were really wiped
 	}
 }
diff --git a/internal/integration/base/api.go b/internal/integration/base/api.go
index e7fcc9ae6..2a2ab9239 100644
--- a/internal/integration/base/api.go
+++ b/internal/integration/base/api.go
@@ -21,6 +21,7 @@ import (
 	"github.com/talos-systems/talos/internal/pkg/provision/access"
 	"github.com/talos-systems/talos/pkg/client"
 	"github.com/talos-systems/talos/pkg/client/config"
+	"github.com/talos-systems/talos/pkg/retry"
 )
 
 // APISuite is a base suite for API tests
@@ -115,7 +116,7 @@ func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
 // ReadUptime reads node uptime.
 //
 // Context provided might have specific node attached for API call.
-func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
+func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
 	// set up a short timeout around uptime read calls to work around
 	// cases when rebooted node doesn't answer for a long time on requests
 	reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
@@ -150,7 +151,60 @@ func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
 		}
 	}
 
-	return uptime, reader.Close()
+	return time.Duration(uptime * float64(time.Second)), reader.Close()
+}
+
+// AssertRebooted verifies that the node got rebooted as a result of running some API call.
+//
+// Verification happens via reading uptime of the node.
+func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
+	// offset to account for uptime measurement inaccuracy
+	const offset = 2 * time.Second
+
+	// timeout for the full reboot check on a single node
+	ctx, ctxCancel := context.WithTimeout(ctx, timeout)
+	defer ctxCancel()
+
+	nodeCtx := client.WithNodes(ctx, node)
+
+	// read uptime before the reboot
+	uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
+	apiSuite.Require().NoError(err)
+
+	apiSuite.Assert().NoError(rebootFunc(nodeCtx))
+
+	// capture current time when API returns
+	rebootTimestamp := time.Now()
+
+	var uptimeAfter time.Duration
+
+	apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
+		requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+		defer requestCtxCancel()
+
+		elapsed := time.Since(rebootTimestamp) - offset
+
+		uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
+		if err != nil {
+			// API might be unresponsive during reboot
+			return retry.ExpectedError(err)
+		}
+
+		// uptime of the node before it actually reboots still goes up linearly
+		// so we can safely add elapsed time here
+		if uptimeAfter >= uptimeBefore+elapsed {
+			// uptime should go down after reboot
+			return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
+		}
+
+		return nil
+	}))
+
+	if apiSuite.Cluster != nil {
+		// without cluster state we can't do deep checks, but basic reboot test still works
+		// NB: using `ctx` here to have client talking to init node by default
+		apiSuite.AssertClusterHealthy(ctx)
+	}
 }
 
 // TearDownSuite closes Talos API client
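
For context, the check AssertRebooted performs can be summarized outside the
test suite as follows. This is an illustrative sketch only, not part of the
patch: readUptime, rebootDetected and waitForReboot are hypothetical names
standing in for APISuite.ReadUptime and the pkg/retry loop, while the 2-second
offset and 5-second per-request timeout mirror the constants used above.

package sketch

import (
	"context"
	"errors"
	"time"
)

// rebootDetected reports whether a freshly read uptime proves the node rebooted:
// a node that has not rebooted keeps accumulating uptime at wall-clock rate, so
// its uptime stays at or above uptimeBefore plus the elapsed time, while a
// rebooted node restarts near zero and stays below that threshold.
func rebootDetected(uptimeBefore, uptimeAfter time.Duration, rebootTimestamp time.Time, offset time.Duration) bool {
	elapsed := time.Since(rebootTimestamp) - offset

	return uptimeAfter < uptimeBefore+elapsed
}

// waitForReboot polls uptime with a per-request timeout until the check passes
// or ctx expires.
func waitForReboot(ctx context.Context, readUptime func(context.Context) (time.Duration, error), uptimeBefore time.Duration, rebootTimestamp time.Time) error {
	const offset = 2 * time.Second

	for {
		// cap each uptime read so a node that is down doesn't stall the whole check
		requestCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		uptimeAfter, err := readUptime(requestCtx)
		cancel()

		// read errors are expected while the node reboots, so just retry
		if err == nil && rebootDetected(uptimeBefore, uptimeAfter, rebootTimestamp, offset) {
			return nil
		}

		select {
		case <-ctx.Done():
			return errors.New("node did not reboot before the deadline")
		case <-time.After(5 * time.Second):
		}
	}
}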