omni/internal/integration/config_patch_test.go
Artem Chernyshev c9c4c8e10d
Some checks failed
default / default (push) Has been cancelled
default / e2e-backups (push) Has been cancelled
default / e2e-forced-removal (push) Has been cancelled
default / e2e-scaling (push) Has been cancelled
default / e2e-short (push) Has been cancelled
default / e2e-short-secureboot (push) Has been cancelled
default / e2e-templates (push) Has been cancelled
default / e2e-upgrades (push) Has been cancelled
default / e2e-workload-proxy (push) Has been cancelled
test: use go test to build and run Omni integration tests
All test modules were moved under `integration` tag and are now in
`internal/integration` folder: no more `cmd/integration-test`
executable.

New Kres version is able to build the same executable from the tests
directory instead.

All Omni related flags were renamed, for example `--endpoint` ->
`--omni.endpoint`.

2 more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed finalizers, which were running if the test has failed.

Both of these changes should make it easier to understand the test
failure: Talos node logs won't be cluttered with the finalizer tearing
down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
2025-06-03 15:07:00 +03:00

433 lines
16 KiB
Go

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
//go:build integration
package integration_test
import (
"context"
"errors"
"fmt"
"io"
"path/filepath"
"strings"
"testing"
"time"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/resource/rtestutils"
"github.com/cosi-project/runtime/pkg/state"
"github.com/dustin/go-humanize"
"github.com/siderolabs/gen/pair"
"github.com/siderolabs/go-retry/retry"
"github.com/siderolabs/talos/pkg/machinery/api/machine"
talosclient "github.com/siderolabs/talos/pkg/machinery/client"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"github.com/siderolabs/omni/client/api/omni/specs"
"github.com/siderolabs/omni/client/pkg/client"
"github.com/siderolabs/omni/client/pkg/omni/resources"
"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
)
const (
dummyIfacePatchTemplate = `machine:
network:
interfaces:
- interface: %s
dummy: true`
)
// AssertLargeImmediateConfigApplied tests that config patch that be applied immediately (without reboot) gets applied to all the nodes.
//
// Config patch is generated to be large to test the edge case.
// The patch is removed on finalize.
func AssertLargeImmediateConfigApplied(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
return func(t *testing.T) {
ctx, cancel := context.WithTimeout(testCtx, 3*time.Minute)
defer cancel()
require.NoError(t, talosAPIKeyPrepare(ctx, "default"))
talosCli, err := talosClient(ctx, cli, clusterName)
require.NoError(t, err)
nodeIPs, err := talosNodeIPs(ctx, talosCli.COSI)
require.NoError(t, err)
st := cli.Omni().State()
epochSeconds := time.Now().Unix()
id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
iface := fmt.Sprintf("dummy%d", epochSeconds)
configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)
var sb strings.Builder
for range 40_000 {
sb.WriteString("################################################################################\n")
}
sb.WriteString(configPatchYAML)
sb.WriteString("\n")
sizeHumanReadable := humanize.Bytes(uint64(sb.Len()))
require.LessOrEqual(t, sb.Len(), 4*1024*1024, "generated config patch is too large (%v), abort", sizeHumanReadable)
t.Logf("creating large config patch with size: %v", sizeHumanReadable)
configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id, pair.MakePair(omni.LabelCluster, clusterName))
// apply the large config patch that creates a dummy interface
createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
})
t.Logf("created large config patch with dummy interface: %q", iface)
// assert that the patch is propagated to all clustermachines
cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))
rtestutils.AssertResources(ctx, t, st, cmIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.True(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches don't contain string %q", cm.Metadata().ID(), iface)
})
linkStatus := network.NewLinkStatus(network.NamespaceName, "")
containsDummyIface := func(node string) (bool, error) {
links, linksErr := talosCli.COSI.List(talosclient.WithNode(ctx, node), linkStatus.Metadata())
if linksErr != nil {
return false, linksErr
}
for _, res := range links.Items {
if res.Metadata().ID() == iface {
return true, nil
}
}
return false, nil
}
for _, node := range nodeIPs {
err = retry.Constant(3*time.Minute, retry.WithUnits(1*time.Second)).RetryWithContext(ctx, func(context.Context) error {
contains, containsErr := containsDummyIface(node)
if containsErr != nil {
return containsErr
}
if !contains {
return retry.ExpectedError(fmt.Errorf("%q: dummy interface %q not found", node, iface))
}
return nil
})
assert.NoError(t, err)
}
rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
t.Logf("destroyed config patch with dummy interface: %q", iface)
// assert that the patch deletion is propagated to all clustermachines
rtestutils.AssertResources(ctx, t, st, cmIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.False(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches contain string %q", cm.Metadata().ID(), iface)
})
}
}
// AssertConfigPatchWithReboot tests that config patch that requires reboot gets applied to a single node, the node reboots and gets back.
//
// The patch is NOT removed.
func AssertConfigPatchWithReboot(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
return func(t *testing.T) {
// just a single machine with a reboot, so it should take no more than 3 minutes
ctx, cancel := context.WithTimeout(testCtx, 3*time.Minute)
defer cancel()
require.NoError(t, talosAPIKeyPrepare(ctx, "default"))
talosCli, err := talosClient(ctx, cli, clusterName)
require.NoError(t, err)
st := cli.Omni().State()
nodeList, err := nodes(ctx, cli, clusterName, resource.LabelExists(omni.LabelWorkerRole))
require.NoError(t, err)
require.Greater(t, len(nodeList), 0)
node := nodeList[0]
nodeID := node.machine.Metadata().ID()
epochSeconds := time.Now().Unix()
id := fmt.Sprintf("000-config-patch-test-file-%d", epochSeconds)
file := fmt.Sprintf("/var/config-patch-test-file-%d.txt", epochSeconds)
configPatchYAML := fmt.Sprintf(`machine:
files:
- content: test
permissions: 0o666
path: %s
op: create`, file)
configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
pair.MakePair(omni.LabelCluster, clusterName),
pair.MakePair(omni.LabelClusterMachine, nodeID))
// apply the config patch that creates a file
createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
})
t.Logf("created config patch with file: %q", file)
// skip cleanup to avoid waiting for an additional reboot
// assert that the patch is propagated to all clustermachines
rtestutils.AssertResources(ctx, t, st, []resource.ID{nodeID}, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.True(clusterMachinePatchesContainsString(t, cm, file), "cluster machine %q patches don't contain string %q", file, cm.Metadata().ID())
})
// assert that machine set enters into reconfiguring phase
rtestutils.AssertResources(ctx, t, st, []resource.ID{omni.WorkersResourceID(clusterName)}, func(mss *omni.MachineSetStatus, assert *assert.Assertions) {
assert.Equal(specs.MachineSetPhase_Reconfiguring, mss.TypedSpec().Value.GetPhase())
})
// assert that the file is created on the node
err = retry.Constant(3*time.Minute, retry.WithUnits(1*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
exists, existsErr := talosFileExists(ctx, talosCli, node.talosIP, file)
if existsErr != nil {
if strings.Contains(existsErr.Error(), "not reachable") || status.Code(existsErr) == codes.Unavailable {
return retry.ExpectedError(existsErr)
}
return existsErr
}
if !exists {
return retry.ExpectedError(fmt.Errorf("%q: file %q is not found", node.talosIP, file))
}
t.Logf("file %q is found on node %q", file, node.talosIP)
return nil
})
assert.NoError(t, err)
// wait cluster machine status to be running
rtestutils.AssertResources(ctx, t, st, []resource.ID{nodeID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
assertion.Equal(specs.ClusterMachineStatusSpec_RUNNING, cms.TypedSpec().Value.GetStage())
})
}
}
// AssertConfigPatchWithInvalidConfig tests that a machine is able to recover from a patch with broken config when the broken patch is deleted.
func AssertConfigPatchWithInvalidConfig(testCtx context.Context, cli *client.Client, clusterName string, talosAPIKeyPrepare TalosAPIKeyPrepareFunc) TestFunc {
return func(t *testing.T) {
ctx, cancel := context.WithTimeout(testCtx, 8*time.Minute)
defer cancel()
require.NoError(t, talosAPIKeyPrepare(ctx, "default"))
st := cli.Omni().State()
cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st,
state.WithLabelQuery(
resource.LabelEqual(omni.LabelCluster, clusterName),
resource.LabelExists(omni.LabelControlPlaneRole),
),
)
require.NotEmpty(t, cmIDs)
cmID := cmIDs[0]
epochSeconds := time.Now().Unix()
id := fmt.Sprintf("000-config-patch-test-file-broken-%d", epochSeconds)
file := fmt.Sprintf("/tmp/config-patch-test-file-broken-%d.txt", epochSeconds)
configPatchYAML := fmt.Sprintf(`machine:
files:
- content: test
permissions: 0o666
path: %s
op: create`, file)
configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
pair.MakePair(omni.LabelCluster, clusterName),
pair.MakePair(omni.LabelClusterMachine, cmID))
// apply the broken config patch
createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
})
t.Logf("created config patch with file: %q", file)
rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
assertion.Equal(specs.ClusterMachineStatusSpec_BOOTING, cms.TypedSpec().Value.GetStage())
assertion.False(cms.TypedSpec().Value.GetReady())
})
// TODO: wait for a Talos error about invalid config in the logs
t.Logf("destroyed config patch with file: %q", file)
// remove broken config patch
rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
// wait until k8s nodes come back
rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(cms *omni.ClusterMachineStatus, assertion *assert.Assertions) {
assertion.Equal(specs.ClusterMachineStatusSpec_RUNNING, cms.TypedSpec().Value.GetStage())
assertion.True(cms.TypedSpec().Value.GetReady())
})
}
}
// AssertConfigPatchMachineSet applies config patch on a single machine set.
//
// The patch is removed at the end.
func AssertConfigPatchMachineSet(testCtx context.Context, cli *client.Client, clusterName string) TestFunc {
return func(t *testing.T) {
ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
defer cancel()
st := cli.Omni().State()
workerMachineSetName := omni.WorkersResourceID(clusterName)
controlPlaneMachineSetName := omni.ControlPlanesResourceID(clusterName)
epochSeconds := time.Now().Unix()
id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
iface := fmt.Sprintf("dummy%d", epochSeconds)
configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)
configPatch := omni.NewConfigPatch(
resources.DefaultNamespace,
id,
pair.MakePair(omni.LabelCluster, clusterName),
pair.MakePair(omni.LabelMachineSet, workerMachineSetName),
)
// apply the config patch that creates a dummy interface
createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
})
// assert that the patch is propagated to all worker machine set ClusterMachines
workerIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, workerMachineSetName)))
rtestutils.AssertResources(ctx, t, st, workerIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.True(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches don't contain string %q", cm.Metadata().ID(), iface)
})
// assert that the patch is *NOT* propagated to all controlplane machine set ClusterMachines
controlPlaneIDs := rtestutils.ResourceIDs[*omni.ClusterMachineConfigPatches](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, controlPlaneMachineSetName)))
rtestutils.AssertResources(ctx, t, st, controlPlaneIDs, func(cm *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.False(clusterMachinePatchesContainsString(t, cm, iface), "cluster machine %q patches contain string %q", cm.Metadata().ID(), iface)
})
// cleanup
rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
}
}
// AssertConfigPatchSingleClusterMachine applies a config patch on a single cluster machine.
//
// The patch is removed at the end.
func AssertConfigPatchSingleClusterMachine(testCtx context.Context, cli *client.Client, clusterName string) TestFunc {
return func(t *testing.T) {
ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
defer cancel()
st := cli.Omni().State()
// get a single clustermachine
cmIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))
require.NotEmpty(t, cmIDs)
cmID := cmIDs[0]
epochSeconds := time.Now().Unix()
id := fmt.Sprintf("000-config-patch-test-dummy-iface-%d", epochSeconds)
iface := fmt.Sprintf("dummy%d", epochSeconds)
configPatchYAML := fmt.Sprintf(dummyIfacePatchTemplate, iface)
configPatch := omni.NewConfigPatch(resources.DefaultNamespace, id,
pair.MakePair(omni.LabelCluster, clusterName),
pair.MakePair(omni.LabelClusterMachine, cmID))
// apply the config patch
createOrUpdate(ctx, t, st, configPatch, func(p *omni.ConfigPatch) error {
return p.TypedSpec().Value.SetUncompressedData([]byte(configPatchYAML))
})
// assert that the patch is propagated to the clustermachine
rtestutils.AssertResources(ctx, t, st, []resource.ID{cmID}, func(r *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.True(clusterMachinePatchesContainsString(t, r, iface), "cluster machine %q patches don't contain string %q", r.Metadata().ID(), iface)
})
// assert that the patch is *NOT* propagated to other clustermachines of the cluster
otherCMIDs := cmIDs[1:]
rtestutils.AssertResources(ctx, t, st, otherCMIDs, func(r *omni.ClusterMachineConfigPatches, assertion *assert.Assertions) {
assertion.False(clusterMachinePatchesContainsString(t, r, iface), "cluster machine %q patches contain string %q", r.Metadata().ID(), iface)
})
// cleanup
rtestutils.Destroy[*omni.ConfigPatch](ctx, t, st, []string{configPatch.Metadata().ID()})
}
}
func clusterMachinePatchesContainsString(t *testing.T, clusterMachineConfigPatches *omni.ClusterMachineConfigPatches, str string) bool {
patches, err := clusterMachineConfigPatches.TypedSpec().Value.GetUncompressedPatches()
require.NoError(t, err)
for _, patch := range patches {
if strings.Contains(patch, str) {
return true
}
}
return false
}
func talosFileExists(ctx context.Context, talosClient *talosclient.Client, node, filePath string) (bool, error) {
resp, err := talosClient.MachineClient.List(talosclient.WithNode(ctx, node), &machine.ListRequest{Root: filePath})
if err != nil {
return false, err
}
for {
var fileInfo *machine.FileInfo
fileInfo, err = resp.Recv()
if errors.Is(err, io.EOF) {
return false, nil
}
if err != nil {
if strings.Contains(err.Error(), "no such file or directory") {
return false, nil
}
return false, err
}
if fileInfo.GetError() != "" {
return false, errors.New(fileInfo.GetError())
}
if fileInfo.Name == filepath.Clean(filePath) {
return true, nil
}
}
}