From 6fb55229a22cd13dff498d3b051b4fe7b347d49c Mon Sep 17 00:00:00 2001
From: Andrey Smirnov
Date: Mon, 29 Jun 2020 23:14:39 +0300
Subject: [PATCH] test: fix and improve reboot/reset tests

These tests rely on node uptime checks, and these checks are quite
flaky. The following fixes were applied:

* the check code was refactored into a common method shared between the
  reset and reboot tests (the reboot-all-nodes test does its checks in a
  different way, so it wasn't updated)
* each request to read uptime times out in 5 seconds, so that the checks
  don't wait forever when a node is down (or the connection is aborted)
* to account for node availability vs. lower uptime at the beginning of
  the test, the time elapsed since the reboot request is added to the
  check condition

Signed-off-by: Andrey Smirnov
---
 internal/integration/api/reboot.go | 60 ++++++++++--------------
 internal/integration/api/reset.go  | 43 +++------------------
 internal/integration/base/api.go   | 58 ++++++++++++++++++++++++++++-
 3 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/internal/integration/api/reboot.go b/internal/integration/api/reboot.go
index 117850719..46d71d565 100644
--- a/internal/integration/api/reboot.go
+++ b/internal/integration/api/reboot.go
@@ -57,43 +57,9 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
 	for _, node := range nodes {
 		suite.T().Log("rebooting node", node)
 
-		func(node string) {
-			// timeout for single node reboot
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 10*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before reboot
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
-			suite.Assert().NoError(suite.Client.Reboot(nodeCtx))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			if suite.Cluster != nil {
-				// without cluster state we can't do deep checks, but basic reboot test still works
-				// NB: using `ctx` here to have client talking to init node by default
-				suite.AssertClusterHealthy(ctx)
-			}
-		}(node)
-
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+			return suite.Client.Reboot(nodeCtx)
+		}, 10*time.Minute)
 	}
 }
 
@@ -103,6 +69,9 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.T().Skip("cluster doesn't support reboots")
 	}
 
+	// offset to account for uptime measurement inaccuracy
+	const offset = 2 * time.Second
+
 	nodes := suite.DiscoverNodes()
 	suite.Require().NotEmpty(nodes)
 
@@ -131,6 +100,8 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.Require().NoError(<-errCh)
 	}
 
+	rebootTimestamp := time.Now()
+
 	allNodesCtx := client.WithNodes(suite.ctx, nodes...)
 
 	suite.Require().NoError(suite.Client.Reboot(allNodesCtx))
@@ -143,20 +114,27 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 				return fmt.Errorf("uptime record not found for %q", node)
 			}
 
-			uptimeBefore := uptimeBeforeInterface.(float64) //nolint: errcheck
+			uptimeBefore := uptimeBeforeInterface.(time.Duration) //nolint: errcheck
 
 			nodeCtx := client.WithNodes(suite.ctx, node)
 
 			return retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err := suite.ReadUptime(nodeCtx)
+				requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+				defer requestCtxCancel()
+
+				elapsed := time.Since(rebootTimestamp) - offset
+
+				uptimeAfter, err := suite.ReadUptime(requestCtx)
 				if err != nil {
 					// API might be unresponsive during reboot
 					return retry.ExpectedError(fmt.Errorf("error reading uptime for node %q: %w", node, err))
 				}
 
-				if uptimeAfter >= uptimeBefore {
+				// uptime of the node before it actually reboots still goes up linearly
+				// so we can safely add elapsed time here
+				if uptimeAfter >= uptimeBefore+elapsed {
 					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %f, after %f", node, uptimeBefore, uptimeAfter))
+					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %s + %s, after %s", node, uptimeBefore, elapsed, uptimeAfter))
 				}
 
 				return nil
diff --git a/internal/integration/api/reset.go b/internal/integration/api/reset.go
index 2110b467d..411df538e 100644
--- a/internal/integration/api/reset.go
+++ b/internal/integration/api/reset.go
@@ -8,14 +8,11 @@ package api
 
 import (
 	"context"
-	"fmt"
 	"sort"
 	"testing"
 	"time"
 
 	"github.com/talos-systems/talos/internal/integration/base"
-	"github.com/talos-systems/talos/pkg/client"
-	"github.com/talos-systems/talos/pkg/retry"
 )
 
 type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {
 
 		suite.T().Log("Resetting node", node)
 
-		func(node string) {
-			// timeout for single node Reset
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before Reset
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
+		// uptime should go down after Reset, as it reboots the node
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
 			// force reboot after reset, as this is the only mode we can test
-			suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after Reset, as it reboots the node
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			// TODO: there is no good way to assert that node was reset and disk contents were really wiped
-
-			// NB: using `ctx` here to have client talking to init node by default
-			suite.AssertClusterHealthy(ctx)
-		}(node)
+			return suite.Client.Reset(nodeCtx, true, true)
+		}, 10*time.Minute)
+
+		// TODO: there is no good way to assert that node was reset and disk contents were really wiped
 	}
 }
diff --git a/internal/integration/base/api.go b/internal/integration/base/api.go
index e7fcc9ae6..2a2ab9239 100644
--- a/internal/integration/base/api.go
+++ b/internal/integration/base/api.go
@@ -21,6 +21,7 @@ import (
 	"github.com/talos-systems/talos/internal/pkg/provision/access"
 	"github.com/talos-systems/talos/pkg/client"
 	"github.com/talos-systems/talos/pkg/client/config"
+	"github.com/talos-systems/talos/pkg/retry"
 )
 
 // APISuite is a base suite for API tests
@@ -115,7 +116,7 @@ func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
 // ReadUptime reads node uptime.
 //
 // Context provided might have specific node attached for API call.
-func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
+func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
 	// set up a short timeout around uptime read calls to work around
 	// cases when rebooted node doesn't answer for a long time on requests
 	reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
@@ -150,7 +151,60 @@ func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
 		}
 	}
 
-	return uptime, reader.Close()
+	return time.Duration(uptime * float64(time.Second)), reader.Close()
+}
+
+// AssertRebooted verifies that the node got rebooted as a result of running some API call.
+//
+// Verification happens via reading uptime of the node.
+func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
+	// offset to account for uptime measurement inaccuracy
+	const offset = 2 * time.Second
+
+	// timeout for the full reboot check on a single node
+	ctx, ctxCancel := context.WithTimeout(ctx, timeout)
+	defer ctxCancel()
+
+	nodeCtx := client.WithNodes(ctx, node)
+
+	// read uptime before the reboot
+	uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
+	apiSuite.Require().NoError(err)
+
+	apiSuite.Assert().NoError(rebootFunc(nodeCtx))
+
+	// capture current time when API returns
+	rebootTimestamp := time.Now()
+
+	var uptimeAfter time.Duration
+
+	apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
+		requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+		defer requestCtxCancel()
+
+		elapsed := time.Since(rebootTimestamp) - offset
+
+		uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
+		if err != nil {
+			// API might be unresponsive during reboot
+			return retry.ExpectedError(err)
+		}
+
+		// uptime of the node before it actually reboots still goes up linearly
+		// so we can safely add elapsed time here
+		if uptimeAfter >= uptimeBefore+elapsed {
+			// uptime should go down after reboot
+			return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
+		}
+
+		return nil
+	}))
+
+	if apiSuite.Cluster != nil {
+		// without cluster state we can't do deep checks, but basic reboot test still works
+		// NB: using `ctx` here to have client talking to init node by default
+		apiSuite.AssertClusterHealthy(ctx)
+	}
 }
 
 // TearDownSuite closes Talos API client
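
For context, the check AssertRebooted performs can be summarized outside the
test suite as follows. This is an illustrative sketch only, not part of the
patch: readUptime, rebootDetected and waitForReboot are hypothetical names
standing in for APISuite.ReadUptime and the pkg/retry loop, while the 2-second
offset and 5-second per-request timeout mirror the constants used above.

package sketch

import (
	"context"
	"errors"
	"time"
)

// rebootDetected reports whether a freshly read uptime proves the node rebooted:
// a node that has not rebooted keeps accumulating uptime at wall-clock rate, so
// its uptime stays at or above uptimeBefore plus the elapsed time, while a
// rebooted node restarts near zero and stays below that threshold.
func rebootDetected(uptimeBefore, uptimeAfter time.Duration, rebootTimestamp time.Time, offset time.Duration) bool {
	elapsed := time.Since(rebootTimestamp) - offset

	return uptimeAfter < uptimeBefore+elapsed
}

// waitForReboot polls uptime with a per-request timeout until the check passes
// or ctx expires.
func waitForReboot(ctx context.Context, readUptime func(context.Context) (time.Duration, error), uptimeBefore time.Duration, rebootTimestamp time.Time) error {
	const offset = 2 * time.Second

	for {
		// cap each uptime read so a node that is down doesn't stall the whole check
		requestCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		uptimeAfter, err := readUptime(requestCtx)
		cancel()

		// read errors are expected while the node reboots, so just retry
		if err == nil && rebootDetected(uptimeBefore, uptimeAfter, rebootTimestamp, offset) {
			return nil
		}

		select {
		case <-ctx.Done():
			return errors.New("node did not reboot before the deadline")
		case <-time.After(5 * time.Second):
		}
	}
}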