talos/pkg/cluster/check/nodes.go
Dmitriy Matrenichev 45e6e27af7
chore: bump runtime
Use new functions and methods from runtime module.

Signed-off-by: Dmitriy Matrenichev <dmitry.matrenichev@siderolabs.com>
2023-05-11 17:18:08 -04:00

272 lines
6.5 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package check
import (
"context"
"errors"
fmt "fmt"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/dustin/go-humanize"
"github.com/hashicorp/go-multierror"
"github.com/siderolabs/gen/slices"
"google.golang.org/grpc/codes"
"github.com/siderolabs/talos/pkg/machinery/client"
"github.com/siderolabs/talos/pkg/machinery/config/types/v1alpha1/machine"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
"github.com/siderolabs/talos/pkg/minimal"
)
// AllNodesMemorySizes checks that all nodes have enough memory.
//
// Nodes running in container mode are excluded (see getNonContainerNodes).
// For every remaining node the total memory reported by the Memory API is
// compared with the minimum that minimal.Memory returns for the node's machine
// type.
//
// Per-node problems (unknown minimum, missing memory info, not enough memory)
// are accumulated and returned together. Failing to obtain a client, failing
// API calls, a response without metadata or a response from a node that is not
// part of the cluster info abort the check immediately.
func AllNodesMemorySizes(ctx context.Context, cluster ClusterInfo) error {
	cl, err := cluster.Client()
	if err != nil {
		return fmt.Errorf("error getting client: %w", err)
	}

	nodesIP, err := getNonContainerNodes(
		client.WithNodes(
			ctx,
			mapIPsToStrings(mapNodeInfosToInternalIPs(cluster.Nodes()))...,
		),
		cl,
	)
	if err != nil {
		return err
	}

	// Nothing to verify when every node runs in container mode.
	if len(nodesIP) == 0 {
		return nil
	}

	resp, err := cl.Memory(client.WithNodes(ctx, nodesIP...))
	if err != nil {
		return fmt.Errorf("error getting nodes memory: %w", err)
	}

	var resultErr error

	nodeToType := getNodesTypes(cluster, machine.TypeInit, machine.TypeControlPlane, machine.TypeWorker)

	for _, msg := range resp.Messages {
		if msg.Metadata == nil {
			return fmt.Errorf("no metadata in the response")
		}

		hostname := msg.Metadata.Hostname

		typ, ok := nodeToType[hostname]
		if !ok {
			return fmt.Errorf("unexpected node %q in response", hostname)
		}

		// Meminfo is a pointer; without this guard a reply that carries no
		// memory info would panic instead of being reported for the node.
		if msg.Meminfo == nil {
			resultErr = multierror.Append(resultErr, fmt.Errorf("no memory info in the response for node %q", hostname))

			continue
		}

		// The second value returned by minimal.Memory is not needed here.
		minimum, _, err := minimal.Memory(typ)
		if err != nil {
			resultErr = multierror.Append(resultErr, err)

			continue
		}

		// Memtotal is scaled by KiByte before the comparison, i.e. it is
		// treated as a KiB count.
		if totalMemory := msg.Meminfo.Memtotal * humanize.KiByte; totalMemory < minimum {
			resultErr = multierror.Append(
				resultErr,
				fmt.Errorf(
					"node %q does not meet memory requirements: expected at least %d MiB, actual %d MiB",
					hostname,
					minimum/humanize.MiByte,
					totalMemory/humanize.MiByte,
				),
			)
		}
	}

	return resultErr
}
// getNodesTypes builds a lookup table from a node's internal IP (in string
// form) to its machine type, covering only the requested nodeTypes.
//
// If the same IP is listed under several of the requested types, the type that
// comes last in nodeTypes wins.
func getNodesTypes(cluster ClusterInfo, nodeTypes ...machine.Type) map[string]machine.Type {
	typeByIP := make(map[string]machine.Type)

	for i := range nodeTypes {
		machineType := nodeTypes[i]

		for _, info := range cluster.NodesByType(machineType) {
			typeByIP[info.InternalIP.String()] = machineType
		}
	}

	return typeByIP
}
// AllNodesDiskSizes checks that all nodes have enough disk space.
//
// Nodes running in container mode are excluded (see getNonContainerNodes).
// For every remaining node the mount backing the ephemeral partition is looked
// up in the node's mount stats and its size, increased by a fixed 1400 MiB
// allowance, is compared with minimal.DiskSize().
//
// Nodes for which getEphemeralPartitionData reports ErrOldTalosVersion are
// skipped. All other per-node problems are accumulated and returned together;
// failing to obtain a client or to query versions/mounts aborts the check.
//
//nolint:gocyclo
func AllNodesDiskSizes(ctx context.Context, cluster ClusterInfo) error {
	cl, err := cluster.Client()
	if err != nil {
		return fmt.Errorf("error getting client: %w", err)
	}

	nodesIP, err := getNonContainerNodes(
		client.WithNodes(
			ctx,
			mapIPsToStrings(mapNodeInfosToInternalIPs(cluster.Nodes()))...,
		),
		cl,
	)
	if err != nil {
		return err
	}

	// Nothing to verify when every node runs in container mode.
	if len(nodesIP) == 0 {
		return nil
	}

	ctx = client.WithNodes(ctx, nodesIP...)

	nodesMounts, err := getNodesMounts(ctx, cl)
	if err != nil {
		return err
	}

	// The requirement does not depend on the node: look it up once and use the
	// very same value for the comparison and for the error message.
	minimalDiskSize := minimal.DiskSize()

	var resultErr error

	for _, nodeIP := range nodesIP {
		data, err := getEphemeralPartitionData(ctx, cl.COSI, nodeIP)
		if err != nil {
			// Old Talos versions cannot be inspected and are skipped silently;
			// every other failure is recorded for the node.
			if !errors.Is(err, ErrOldTalosVersion) {
				resultErr = multierror.Append(resultErr, err)
			}

			continue
		}

		nodeMounts, ok := nodesMounts[nodeIP]
		if !ok {
			resultErr = multierror.Append(resultErr, fmt.Errorf("node %q not found in mounts", nodeIP))

			continue
		}

		// The ephemeral partition is identified by its mount source, which has
		// to show up as a filesystem in the node's mount stats.
		idx := slices.IndexFunc(nodeMounts, func(mnt mntData) bool { return mnt.Filesystem == data.Source })
		if idx == -1 {
			resultErr = multierror.Append(resultErr, fmt.Errorf("ephemeral partition %q not found for node %q", data.Source, nodeIP))

			continue
		}

		// adjust by 1400 MiB to account for the size of system stuff
		if actualDiskSize := nodeMounts[idx].Size + 1400*humanize.MiByte; actualDiskSize < minimalDiskSize {
			resultErr = multierror.Append(resultErr, fmt.Errorf(
				"ephemeral partition %q for node %q is too small, expected at least %s, actual %s",
				data.Source,
				nodeIP,
				humanize.IBytes(minimalDiskSize),
				humanize.IBytes(actualDiskSize),
			))
		}
	}

	return resultErr
}
// getNonContainerNodes queries the version of the nodes targeted by ctx and
// returns the hostnames, as reported in the response metadata, of the nodes
// whose platform mode is not "container".
//
// It returns an error if the version call fails or if a response carries no
// metadata. A response without platform information is kept in the result
// (treated as a non-container node) rather than dereferencing a nil pointer.
func getNonContainerNodes(ctx context.Context, cl *client.Client) ([]string, error) {
	resp, err := cl.Version(ctx)
	if err != nil {
		return nil, fmt.Errorf("error getting version: %w", err)
	}

	result := make([]string, 0, len(resp.Messages))

	for _, msg := range resp.Messages {
		if msg.Metadata == nil {
			return nil, errors.New("got empty metadata")
		}

		// Platform is a pointer and may be absent from the reply; only skip the
		// node when it positively identifies itself as a container.
		if platform := msg.Platform; platform != nil && platform.Mode == "container" {
			continue
		}

		result = append(result, msg.Metadata.Hostname)
	}

	return result, nil
}
// mountData carries the part of a mount status that the disk size check needs.
type mountData struct {
// Source is the source of the mount; AllNodesDiskSizes matches it against the
// Filesystem value of the node's mount stats.
Source string
}
// ErrOldTalosVersion is returned when the node is running an old version of Talos.
//
// It is a sentinel value: callers should test for it with errors.Is.
var ErrOldTalosVersion = errors.New("old Talos version")
// getEphemeralPartitionData lists the MountStatus resources of the node nodeIP
// and returns the source of the mount whose ID equals
// constants.EphemeralPartitionLabel.
//
// ErrOldTalosVersion is returned when the list call fails with
// codes.Unimplemented. Any other list failure, as well as the absence of a
// matching mount, is reported as a regular error mentioning the node.
func getEphemeralPartitionData(ctx context.Context, state state.State, nodeIP string) (mountData, error) {
	nodeCtx := client.WithNode(ctx, nodeIP)

	mounts, err := safe.StateListAll[*runtime.MountStatus](nodeCtx, state)

	switch {
	case err == nil:
		// listing succeeded, go on to search for the ephemeral mount
	case client.StatusCode(err) == codes.Unimplemented:
		// old version of Talos without COSI API
		return mountData{}, ErrOldTalosVersion
	default:
		return mountData{}, fmt.Errorf("error listing mounts for node %q: %w", nodeIP, err)
	}

	iter := safe.IteratorFromList(mounts)
	for iter.Next() {
		status := iter.Value()
		if status.Metadata().ID() != constants.EphemeralPartitionLabel {
			continue
		}

		return mountData{Source: status.TypedSpec().Source}, nil
	}

	return mountData{}, fmt.Errorf("no ephemeral partition found for node '%s'", nodeIP)
}
// mntData is a single entry of a node's mount stats as collected by
// getNodesMounts.
type mntData struct {
// Filesystem is the filesystem value reported for the mount.
Filesystem string
// Size is the size reported for the mount. AllNodesDiskSizes formats it with
// humanize.IBytes, so it is treated as a byte count there.
Size uint64
}
// getNodesMounts fetches the mount stats of the nodes targeted by ctx and
// groups them by the hostname found in each response's metadata.
//
// It fails if the Mounts call fails, if the response holds no messages, if a
// message has no metadata, or if a message reports no mounts at all.
func getNodesMounts(ctx context.Context, cl *client.Client) (map[string][]mntData, error) {
	resp, err := cl.Mounts(ctx)
	if err != nil {
		return nil, fmt.Errorf("error getting nodes mounts: %w", err)
	}

	messages := resp.Messages
	if len(messages) == 0 {
		return nil, fmt.Errorf("no nodes with mounts found")
	}

	mountsByNode := make(map[string][]mntData)

	for _, message := range messages {
		if message.Metadata == nil {
			return nil, fmt.Errorf("no metadata in response")
		}

		node := message.Metadata.Hostname

		stats := message.GetStats()
		if len(stats) == 0 {
			return nil, fmt.Errorf("no mounts found for node %q", node)
		}

		// Extend whatever was already gathered for this hostname, so several
		// messages for the same node end up in a single list.
		collected := mountsByNode[node]
		for _, stat := range stats {
			collected = append(collected, mntData{Filesystem: stat.Filesystem, Size: stat.Size})
		}

		mountsByNode[node] = collected
	}

	return mountsByNode, nil
}