From dc6ea74c358c75145b98369f62d77d1ac00def3d Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Thu, 8 Oct 2020 09:52:34 +0300 Subject: [PATCH] fix: random failures in cluster health checks The problem was that some of the health checks sort the list of the nodes in place (via `sort.Strings()`). If cluster info provider returns original slice, it might be mutated in such a way that it gets corrupted. We never noticed it before CAPI clusters, as in our tests IPs are assigned sequentially, and sort operation is a no-op. Specifically, the problem was with the `Nodes()` function, it returns `append(controlPlaneNodes, workerNodes...)` slice, which by definition might share memory with `controlPlaneNodes` slice. For example, if control plane nodes were `4, 5, 6` and worker nodes were `3`, the returned slice will be `4, 5, 6, 3`, and it shares memory with `controlPlaneNodes` slice (firs three items). If we apply `sort` to the returned slice, it re-orders it as `3, 4, 5, 6`, but as it is done in-place, the `controlPlaneNodes` slice is now `3, 4, 5`, which is obviously wrong. Fix that by always returning a copy of the slice from the functions implementing `ClusterInfo` interface. Signed-off-by: Andrey Smirnov --- cmd/talosctl/cmd/talos/health.go | 4 ++-- .../machined/internal/server/v1alpha1/v1alpha1_cluster.go | 6 +++--- internal/integration/base/cluster.go | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/talosctl/cmd/talos/health.go b/cmd/talosctl/cmd/talos/health.go index 54bb4b6b8..a30ad2753 100644 --- a/cmd/talosctl/cmd/talos/health.go +++ b/cmd/talosctl/cmd/talos/health.go @@ -49,9 +49,9 @@ func (cluster *clusterNodes) NodesByType(t machine.Type) []string { return []string{cluster.InitNode} case machine.TypeControlPlane: - return cluster.ControlPlaneNodes + return append([]string(nil), cluster.ControlPlaneNodes...) case machine.TypeJoin: - return cluster.WorkerNodes + return append([]string(nil), cluster.WorkerNodes...) case machine.TypeUnknown: return nil default: diff --git a/internal/app/machined/internal/server/v1alpha1/v1alpha1_cluster.go b/internal/app/machined/internal/server/v1alpha1/v1alpha1_cluster.go index 23e6e3168..630bcffca 100644 --- a/internal/app/machined/internal/server/v1alpha1/v1alpha1_cluster.go +++ b/internal/app/machined/internal/server/v1alpha1/v1alpha1_cluster.go @@ -81,7 +81,7 @@ type clusterState struct { } func (cluster *clusterState) Nodes() []string { - return append(cluster.controlPlaneNodes, cluster.workerNodes...) + return append([]string(nil), append(cluster.controlPlaneNodes, cluster.workerNodes...)...) } func (cluster *clusterState) NodesByType(t machine.Type) []string { @@ -89,9 +89,9 @@ func (cluster *clusterState) NodesByType(t machine.Type) []string { case machine.TypeInit: return nil case machine.TypeControlPlane: - return cluster.controlPlaneNodes + return append([]string(nil), cluster.controlPlaneNodes...) case machine.TypeJoin: - return cluster.workerNodes + return append([]string(nil), cluster.workerNodes...) case machine.TypeUnknown: return nil default: diff --git a/internal/integration/base/cluster.go b/internal/integration/base/cluster.go index d04f944aa..c9c81b8d3 100644 --- a/internal/integration/base/cluster.go +++ b/internal/integration/base/cluster.go @@ -14,7 +14,7 @@ type infoWrapper struct { } func (wrapper *infoWrapper) Nodes() []string { - return append(wrapper.masterNodes, wrapper.workerNodes...) + return append([]string(nil), append(wrapper.masterNodes, wrapper.workerNodes...)...) } func (wrapper *infoWrapper) NodesByType(t machine.Type) []string { @@ -22,9 +22,9 @@ func (wrapper *infoWrapper) NodesByType(t machine.Type) []string { case machine.TypeInit: return nil case machine.TypeControlPlane: - return wrapper.masterNodes + return append([]string(nil), wrapper.masterNodes...) case machine.TypeJoin: - return wrapper.workerNodes + return append([]string(nil), wrapper.workerNodes...) default: panic("unreachable") }