mirror of
https://github.com/siderolabs/talos.git
synced 2026-04-12 01:01:07 +02:00
A fixup for #12896. The health check might be running as a reduced-privilege role client, so don't pull the machine config; instead, read a field from a non-sensitive resource. As this field doesn't exist in older versions of Talos, the check should still run by default (as it will be empty). Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
202 lines
7.3 KiB
Go
202 lines
7.3 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
package check
|
|
|
|
import (
|
|
"context"
|
|
"slices"
|
|
"time"
|
|
|
|
"github.com/cosi-project/runtime/pkg/safe"
|
|
|
|
"github.com/siderolabs/talos/pkg/conditions"
|
|
"github.com/siderolabs/talos/pkg/machinery/config/machine"
|
|
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
|
|
)
|
|
|
|
// DefaultClusterChecks returns a set of default Talos cluster readiness checks.
|
|
func DefaultClusterChecks() []ClusterCheck {
|
|
return slices.Concat(
|
|
PreBootSequenceChecks(),
|
|
K8sComponentsReadinessChecks(),
|
|
[]ClusterCheck{
|
|
// wait for all the nodes to report ready at k8s level
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all k8s nodes to report ready", func(ctx context.Context) error {
|
|
if cniDisabled, err := cniDisabledStatus(ctx, cluster); err != nil {
|
|
return err
|
|
} else if cniDisabled {
|
|
return conditions.ErrSkipAssertion
|
|
}
|
|
|
|
return K8sAllNodesReadyAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for kube-proxy to report ready
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("kube-proxy to report ready", func(ctx context.Context) error {
|
|
present, replicas, err := DaemonSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !present {
|
|
return conditions.ErrSkipAssertion
|
|
}
|
|
|
|
return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-proxy")
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for coredns to report ready
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("coredns to report ready", func(ctx context.Context) error {
|
|
if cniDisabled, err := cniDisabledStatus(ctx, cluster); err != nil {
|
|
return err
|
|
} else if cniDisabled {
|
|
return conditions.ErrSkipAssertion
|
|
}
|
|
|
|
present, replicas, err := DeploymentPresent(ctx, cluster, "kube-system", "k8s-app=kube-dns")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !present {
|
|
return conditions.ErrSkipAssertion
|
|
}
|
|
|
|
return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-dns")
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for all the nodes to be schedulable
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all k8s nodes to report schedulable", func(ctx context.Context) error {
|
|
return K8sAllNodesSchedulableAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
},
|
|
)
|
|
}
|
|
|
|
func cniDisabledStatus(ctx context.Context, cluster ClusterInfo) (bool, error) {
|
|
cli, err := cluster.Client()
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
bmc, err := safe.ReaderGetByID[*k8s.BootstrapManifestsConfig](ctx, cli.COSI, k8s.BootstrapManifestsConfigID)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
return bmc.TypedSpec().CNIName == "none", nil
|
|
}
|
|
|
|
// K8sComponentsReadinessChecks returns a set of K8s cluster readiness checks which are specific to the k8s components
|
|
// being up and running. This test can be skipped if the cluster is set to use a custom CNI, as the checks won't be healthy
|
|
// until the CNI is up and running.
|
|
func K8sComponentsReadinessChecks() []ClusterCheck {
|
|
return []ClusterCheck{
|
|
// wait for all the nodes to report in at k8s level
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all k8s nodes to report", func(ctx context.Context) error {
|
|
return K8sAllNodesReportedAssertion(ctx, cluster)
|
|
}, 30*time.Second) // give more time per each attempt, as this check is going to build and cache kubeconfig
|
|
},
|
|
|
|
// wait for k8s control plane static pods
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all control plane static pods to be running", func(ctx context.Context) error {
|
|
return K8sControlPlaneStaticPods(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for HA k8s control plane
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all control plane components to be ready", func(ctx context.Context) error {
|
|
return K8sFullControlPlaneAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
}
|
|
}
|
|
|
|
// ExtraClusterChecks returns a set of additional Talos cluster readiness checks which work only for newer versions of Talos.
|
|
//
|
|
// ExtraClusterChecks can't be used reliably in upgrade tests, as older versions might not pass the checks.
|
|
func ExtraClusterChecks() []ClusterCheck {
|
|
return []ClusterCheck{}
|
|
}
|
|
|
|
// PreBootSequenceChecks returns a set of Talos cluster readiness checks which are run before boot sequence.
|
|
func PreBootSequenceChecks() []ClusterCheck {
|
|
return []ClusterCheck{
|
|
// wait for etcd to be healthy on all control plane nodes
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("etcd to be healthy", func(ctx context.Context) error {
|
|
return ServiceHealthAssertion(ctx, cluster, "etcd", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for etcd members to be consistent across nodes
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("etcd members to be consistent across nodes", func(ctx context.Context) error {
|
|
return EtcdConsistentAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for etcd members to be the control plane nodes
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("etcd members to be control plane nodes", func(ctx context.Context) error {
|
|
return EtcdControlPlaneNodesAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for apid to be ready on all the nodes
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("apid to be ready", func(ctx context.Context) error {
|
|
return ApidReadyAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for all nodes to report their memory size
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all nodes memory sizes", func(ctx context.Context) error {
|
|
return AllNodesMemorySizes(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for all nodes to report their disk size
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all nodes disk sizes", func(ctx context.Context) error {
|
|
return AllNodesDiskSizes(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// check diagnostics
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("no diagnostics", func(ctx context.Context) error {
|
|
return NoDiagnostics(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for kubelet to be healthy on all
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("kubelet to be healthy", func(ctx context.Context) error {
|
|
return ServiceHealthAssertion(ctx, cluster, "kubelet", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
|
|
}, 5*time.Second)
|
|
},
|
|
|
|
// wait for all nodes to finish booting
|
|
func(cluster ClusterInfo) conditions.Condition {
|
|
return conditions.PollingCondition("all nodes to finish boot sequence", func(ctx context.Context) error {
|
|
return AllNodesBootedAssertion(ctx, cluster)
|
|
}, 5*time.Second)
|
|
},
|
|
}
|
|
}
|