// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"context"
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/resource/rtestutils"
	"github.com/cosi-project/runtime/pkg/state"
	"github.com/dustin/go-humanize"
	"github.com/siderolabs/gen/pair"
	"github.com/siderolabs/go-retry/retry"
	"github.com/siderolabs/talos/pkg/machinery/api/machine"
	talosclient "github.com/siderolabs/talos/pkg/machinery/client"
	"github.com/siderolabs/talos/pkg/machinery/resources/network"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	"github.com/siderolabs/omni/client/api/omni/specs"
	"github.com/siderolabs/omni/client/pkg/client"
	"github.com/siderolabs/omni/client/pkg/omni/resources"
	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
)

const (
	dummyIfacePatchTemplate = `machine:
  network:
    interfaces:
      - interface: %s
        dummy: true`
)

// AssertLargeImmediateConfigApplied tests that a config patch which can be applied immediately (without a reboot) gets applied to all the nodes.
//
// The config patch is generated to be large to test the edge case.
// The patch is removed at the end.
func AssertLargeImmediateConfigApplied(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
	return func(t *testing.T) {
		ctx, cancel := context.WithTimeout(testCtx, 3*time.Minute)
		defer cancel()

		require.NoError(t, talosAPIKeyPrepare(ctx, "default"))

		talosCli, err := talosClient(ctx, cli, clusterName)
		require.NoError(t, err)

		nodeIPs, err := talosNodeIPs(ctx, talosCli.COSI)
		require.NoError(t, err)

		st := cli.Omni().State()

		epochSeconds := time.Now().Unix()
		id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
		iface := fmt.Sprintf("dummy%d", epochSeconds)
		configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)

		var sb strings.Builder

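		// pad the patch with ~3 MiB of YAML comment lines (40,000 lines of 81 bytes each),
		// keeping it large while staying below the 4 MiB limit checked below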
		for range 40_000 {
			sb.WriteString("################################################################################\n")
		}

		sb.WriteString(configPatchYAML)
		sb.WriteString("\n")

		sizeHumanReadable := humanize.Bytes(uint64(sb.Len()))

		require.LessOrEqual(t, sb.Len(), 4*1024*1024, "generated config patch is too large (%v), abort", sizeHumanReadable)

		t.Logf("creating large config patch with size: %v", sizeHumanReadable)

		configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id, pair.MakePair(omni.LabelCluster, clusterName))

		// apply the large config patch that creates a dummy interface
		createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
			return p.TypedSpec().Value.SetUncompressedData([]byte(sb.String()))
		})

		t.Logf("created large config patch with dummy interface: %q", iface)

		// assert that the patch is propagated to all clustermachines
		cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))

		rtestutils.AssertResources(ctx, t, st, cmIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.True(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches don't contain string %q", cm.Metadata().ID(), iface)
		})

		linkStatus := network.NewLinkStatus(network.NamespaceName, "")

		containsDummyIface := func(node string) (bool, error) {
			links, linksErr := talosCli.COSI.List(talosclient.WithNode(ctx, node), linkStatus.Metadata())
			if linksErr != nil {
				return false, linksErr
			}

			for _, res := range links.Items {
				if res.Metadata().ID() == iface {
					return true, nil
				}
			}

			return false, nil
		}

		for _, node := range nodeIPs {
			err = retry.Constant(3*time.Minute, retry.WithUnits(1*time.Second)).RetryWithContext(ctx, func(context.Context) error {
				contains, containsErr := containsDummyIface(node)
				if containsErr != nil {
					return containsErr
				}

				if !contains {
					return retry.ExpectedError(fmt.Errorf("%q: dummy interface %q not found", node, iface))
				}

				return nil
			})

			assert.NoError(t, err)
		}

		rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})

		t.Logf("destroyed config patch with dummy interface: %q", iface)

		// assert that the patch deletion is propagated to all clustermachines
		rtestutils.AssertResources(ctx, t, st, cmIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.False(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches contain string %q", cm.Metadata().ID(), iface)
		})
	}
}

// AssertConfigPatchWithReboot tests that a config patch which requires a reboot gets applied to a single node; the node reboots and comes back.
//
// The patch is NOT removed.
func AssertConfigPatchWithReboot(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
	return func(t *testing.T) {
		// just a single machine with a reboot, so it should take no more than 3 minutes
		ctx, cancel := context.WithTimeout(testCtx, 3*time.Minute)
		defer cancel()

		require.NoError(t, talosAPIKeyPrepare(ctx, "default"))

		talosCli, err := talosClient(ctx, cli, clusterName)
		require.NoError(t, err)

		st := cli.Omni().State()

		nodeList, err := nodes(ctx, cli, clusterName, resource.LabelExists(omni.LabelWorkerRole))
		require.NoError(t, err)
		require.Greater(t, len(nodeList), 0)

		node := nodeList[0]

		nodeID := node.machine.Metadata().ID()

		epochSeconds := time.Now().Unix()
		id := fmt.Sprintf("000-config-patch-test-file-%d", epochSeconds)
		file := fmt.Sprintf("/var/config-patch-test-file-%d.txt", epochSeconds)
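
		// a machine.files change cannot be applied immediately, so Omni is expected to
		// reboot the machine to apply this patch (see the Reconfiguring phase assertion below)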
		configPatchYAML := fmt.Sprintf(`machine:
  files:
    - content: test
      permissions: 0o666
      path: %s
      op: create`, file)
		configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
			pair.MakePair(omni.LabelCluster, clusterName),
			pair.MakePair(omni.LabelClusterMachine, nodeID))

		// apply the config patch that creates a file
		createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
			return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
		})

		t.Logf("created config patch with file: %q", file)

		// skip cleanup to avoid waiting for an additional reboot

		// assert that the patch is propagated to all clustermachines
		rtestutils.AssertResources(ctx, t, st, []resource.ID{nodeID}, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.True(clusterMachinePatchesContainsString(t, cm, file), "cluster machine %q patches don't contain string %q", cm.Metadata().ID(), file)
		})

		// assert that machine set enters into reconfiguring phase
		rtestutils.AssertResources(ctx, t, st, []resource.ID{omni.WorkersResourceID(clusterName)}, func(mss *omni.MachineSetStatus, assert *assert.Assertions) {
			assert.Equal(specs.MachineSetPhase_Reconfiguring, mss.TypedSpec().Value.GetPhase())
		})

		// assert that the file is created on the node
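		// the node reboots to apply the patch, so "not reachable" and Unavailable errors are expected and retried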
		err = retry.Constant(3*time.Minute, retry.WithUnits(1*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
			exists, existsErr := talosFileExists(ctx, talosCli, node.talosIP, file)
			if existsErr != nil {
				if strings.Contains(existsErr.Error(), "not reachable") || status.Code(existsErr) == codes.Unavailable {
					return retry.ExpectedError(existsErr)
				}

				return existsErr
			}

			if !exists {
				return retry.ExpectedError(fmt.Errorf("%q: file %q is not found", node.talosIP, file))
			}

			t.Logf("file %q is found on node %q", file, node.talosIP)

			return nil
		})

		assert.NoError(t, err)

		// wait for the cluster machine status to be running
		rtestutils.AssertResources(ctx, t, st, []resource.ID{nodeID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
			assertion.Equal(specs.ClusterMachineStatusSpec_RUNNING, cms.TypedSpec().Value.GetStage())
		})
	}
}

// AssertConfigPatchWithInvalidConfig tests that a machine is able to recover from a patch with broken config when the broken patch is deleted.
func AssertConfigPatchWithInvalidConfig(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
	return func(t *testing.T) {
		ctx, cancel := context.WithTimeout(testCtx, 8*time.Minute)
		defer cancel()

		require.NoError(t, talosAPIKeyPrepare(ctx, "default"))

		st := cli.Omni().State()

		cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st,
			state.WithLabelQuery(
				resource.LabelEqual(omni.LabelCluster, clusterName),
				resource.LabelExists(omni.LabelControlPlaneRole),
			),
		)
		require.NotEmpty(t, cmIDs)

		cmID := cmIDs[0]

		epochSeconds := time.Now().Unix()
		id := fmt.Sprintf("000-config-patch-test-file-broken-%d", epochSeconds)
		file := fmt.Sprintf("/tmp/config-patch-test-file-broken-%d.txt", epochSeconds)
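
		// the /tmp path is expected to make this patch invalid (Talos only allows extra files under /var),
		// so applying it should leave the machine stuck in the BOOTING stage until the patch is removed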
		configPatchYAML := fmt.Sprintf(`machine:
  files:
    - content: test
      permissions: 0o666
      path: %s
      op: create`, file)
		configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
			pair.MakePair(omni.LabelCluster, clusterName),
			pair.MakePair(omni.LabelClusterMachine, cmID))

		// apply the broken config patch
		createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
			return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
		})

		t.Logf("created config patch with file: %q", file)

		rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
			assertion.Equal(specs.ClusterMachineStatusSpec_BOOTING, cms.TypedSpec().Value.GetStage())
			assertion.False(cms.TypedSpec().Value.GetReady())
		})

		// TODO: wait for a Talos error about invalid config in the logs

t.Logf("destroyed config patch with file: %q", file)
|
|
|
|
// remove broken config patch
|
|
rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
|
|
|
|
		// wait until the cluster machine comes back up and is ready
		rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
			assertion.Equal(specs.ClusterMachineStatusSpec_RUNNING, cms.TypedSpec().Value.GetStage())
			assertion.True(cms.TypedSpec().Value.GetReady())
		})
	}
}

// AssertConfigPatchMachineSet applies a config patch to a single machine set.
//
// The patch is removed at the end.
func AssertConfigPatchMachineSet(testCtx context.Context, cli *client.Client, clusterName string) TestFunc {
	return func(t *testing.T) {
		ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
		defer cancel()

		st := cli.Omni().State()

		workerMachineSetName := omni.WorkersResourceID(clusterName)
		controlPlaneMachineSetName := omni.ControlPlanesResourceID(clusterName)

		epochSeconds := time.Now().Unix()
		id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
		iface := fmt.Sprintf("dummy%d", epochSeconds)
		configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)

		configPatch := omni.NewConfigPatch(
			resources.DefaultNamespace,
			id,
			pair.MakePair(omni.LabelCluster, clusterName),
			pair.MakePair(omni.LabelMachineSet, workerMachineSetName),
		)

		// apply the config patch that creates a dummy interface
		createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
			return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
		})

		// assert that the patch is propagated to all worker machine set ClusterMachines
		workerIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, workerMachineSetName)))

		rtestutils.AssertResources(ctx, t, st, workerIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.True(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches don't contain string %q", cm.Metadata().ID(), iface)
		})

		// assert that the patch is *NOT* propagated to the control plane machine set ClusterMachines
		controlPlaneIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, controlPlaneMachineSetName)))

		rtestutils.AssertResources(ctx, t, st, controlPlaneIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.False(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches contain string %q", cm.Metadata().ID(), iface)
		})

		// cleanup
		rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
	}
}

// AssertConfigPatchSingleClusterMachine applies a config patch on a single cluster machine.
//
// The patch is removed at the end.
func AssertConfigPatchSingleClusterMachine(testCtx context.Context, cli *client.Client, clusterName string) TestFunc {
	return func(t *testing.T) {
		ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
		defer cancel()

		st := cli.Omni().State()

		// get a single clustermachine
		cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))
		require.NotEmpty(t, cmIDs)

		cmID := cmIDs[0]

		epochSeconds := time.Now().Unix()
		id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
		iface := fmt.Sprintf("dummy%d", epochSeconds)
		configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)

		configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
			pair.MakePair(omni.LabelCluster, clusterName),
			pair.MakePair(omni.LabelClusterMachine, cmID))

		// apply the config patch
		createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
			return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
		})

		// assert that the patch is propagated to the clustermachine
		rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(r *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.True(clusterMachinePatchesContainsString(t, r, iface), "cluster machine %q patches don't contain string %q", r.Metadata().ID(), iface)
		})

		// assert that the patch is *NOT* propagated to other clustermachines of the cluster
		otherCMIDs := cmIDs[1:]

		rtestutils.AssertResources(ctx, t, st, otherCMIDs, func(r *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
			assertion.False(clusterMachinePatchesContainsString(t, r, iface), "cluster machine %q patches contain string %q", r.Metadata().ID(), iface)
		})

		// cleanup
		rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
	}
}

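// clusterMachinePatchesContainsString reports whether any of the config patches gathered for the cluster machine contains the given substring.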
func clusterMachinePatchesContainsString(t *testing.T, clusterMachineConfigPatches *omni.ClusterMachineConfigPatches, str string) bool {
	patches, err := clusterMachineConfigPatches.TypedSpec().Value.GetUncompressedPatches()
	require.NoError(t, err)

	for _, patch := range patches {
		if strings.Contains(patch, str) {
			return true
		}
	}

	return false
}

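// talosFileExists checks whether filePath exists on the given node by listing it via the Talos machine API.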
func talosFileExists(ctx context.Context, talosClient *talosclient.Client, node, filePath string) (bool, error) {
	resp, err := talosClient.MachineClient.List(talosclient.WithNode(ctx, node), &machine.ListRequest{Root: filePath})
	if err != nil {
		return false, err
	}

	for {
		var fileInfo *machine.FileInfo

		fileInfo, err = resp.Recv()
		if errors.Is(err, io.EOF) {
			return false, nil
		}

		if err != nil {
			if strings.Contains(err.Error(), "no such file or directory") {
				return false, nil
			}

			return false, err
		}

		if fileInfo.GetError() != "" {
			return false, errors.New(fileInfo.GetError())
		}

		if fileInfo.Name == filepath.Clean(filePath) {
			return true, nil
		}
	}
}