omni/internal/integration/template_test.go
Artem Chernyshev c9c4c8e10d
test: use go test to build and run Omni integration tests
All test modules were moved under the `integration` build tag and now live in
the `internal/integration` folder: there is no longer a separate
`cmd/integration-test` executable.

The new Kres version can build the same executable from the tests
directory instead.

All Omni-related flags were renamed, for example `--endpoint` ->
`--omni.endpoint`.
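
A typical local run now looks roughly like this (the endpoint value is
illustrative; additional Omni flags may be needed depending on the setup):

    go test -v -tags integration ./internal/integration/... --omni.endpoint https://localhost:8099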

Two more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed the finalizers that ran when a test failed.

Both changes should make it easier to understand a test failure: Talos
node logs will no longer be cluttered by the finalizer tearing down the
cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
2025-06-03 15:07:00 +03:00

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"bytes"
	"context"
	_ "embed"
	"os"
	"testing"
	"text/template"
	"time"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/resource/rtestutils"
	"github.com/cosi-project/runtime/pkg/state"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/siderolabs/omni/client/api/omni/specs"
	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
	"github.com/siderolabs/omni/client/pkg/template/operations"
)

//go:embed testdata/cluster-1.tmpl.yaml
var cluster1Tmpl []byte

//go:embed testdata/cluster-2.tmpl.yaml
var cluster2Tmpl []byte

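// tmplOptions holds the values substituted into the embedded cluster templates.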
type tmplOptions struct {
	KubernetesVersion string
	TalosVersion      string
	CP                []string
	W                 []string
}

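// renderTemplate renders the cluster template with the given options and returns the resulting YAML.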
func renderTemplate(t *testing.T, tmpl []byte, opts tmplOptions) []byte {
	var b bytes.Buffer

	require.NoError(t, template.Must(template.New("cluster").Parse(string(tmpl))).Execute(&b, opts))

	return b.Bytes()
}

// AssertClusterTemplateFlow verifies cluster template operations.
func AssertClusterTemplateFlow(testCtx context.Context, st state.State, options MachineOptions) TestFunc {
	return func(t *testing.T) {
		ctx, cancel := context.WithTimeout(testCtx, 20*time.Minute)
		defer cancel()

		const (
			clusterName           = "tmpl-cluster"
			additionalWorkersName = "additional-workers"
		)

		require := require.New(t)

		var (
			machineIDs []resource.ID
			opts       tmplOptions
			tmpl1      []byte
		)

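		// reserve 5 unallocated machines: the first 3 become control planes, the remaining 2 become workers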
		pickUnallocatedMachines(ctx, t, st, 5, func(mIDs []resource.ID) {
			machineIDs = mIDs

			opts = tmplOptions{
				KubernetesVersion: "v" + options.KubernetesVersion,
				TalosVersion:      "v" + options.TalosVersion,
				CP:                machineIDs[:3],
				W:                 machineIDs[3:],
			}

			tmpl1 = renderTemplate(t, cluster1Tmpl, opts)

			require.NoError(operations.ValidateTemplate(bytes.NewReader(tmpl1)))

			t.Log("creating template cluster")

			require.NoError(operations.SyncTemplate(ctx, bytes.NewReader(tmpl1), os.Stderr, st, operations.SyncOptions{
				Verbose: true,
			}))

			// assert that machines got allocated (label available is removed)
			rtestutils.AssertResources(ctx, t, st, machineIDs, func(machineStatus *omni.MachineStatus, assert *assert.Assertions) {
				assert.True(machineStatus.Metadata().Labels().Matches(
					resource.LabelTerm{
						Key:    omni.MachineStatusLabelAvailable,
						Op:     resource.LabelOpExists,
						Invert: true,
					},
				), resourceDetails(machineStatus))
			})
		})

		t.Log("wait for cluster to be ready")

		// wait using the status command
		require.NoError(operations.StatusTemplate(ctx, bytes.NewReader(tmpl1), os.Stderr, st, operations.StatusOptions{
			Wait: true,
		}))

		// re-check with short timeout to make sure the cluster is ready
		checkCtx, checkCancel := context.WithTimeout(ctx, 30*time.Second)
		defer checkCancel()

		rtestutils.AssertResources(checkCtx, t, st, []string{clusterName}, func(status *omni.ClusterStatus, assert *assert.Assertions) {
			spec := status.TypedSpec().Value

			assert.Truef(spec.Available, "not available: %s", resourceDetails(status))
			assert.Equalf(specs.ClusterStatusSpec_RUNNING, spec.Phase, "cluster is not in phase running: %s", resourceDetails(status))
			assert.Equalf(spec.GetMachines().Total, spec.GetMachines().Healthy, "not all machines are healthy: %s", resourceDetails(status))
			assert.Truef(spec.Ready, "cluster is not ready: %s", resourceDetails(status))
			assert.Truef(spec.ControlplaneReady, "cluster controlplane is not ready: %s", resourceDetails(status))
			assert.Truef(spec.KubernetesAPIReady, "cluster kubernetes API is not ready: %s", resourceDetails(status))
			assert.EqualValuesf(len(opts.CP)+len(opts.W), spec.GetMachines().Total, "total machines is not the same as in the machine sets: %s", resourceDetails(status))
		})

		rtestutils.AssertResources(checkCtx, t, st, []string{
			omni.ControlPlanesResourceID(clusterName),
			omni.WorkersResourceID(clusterName),
			omni.AdditionalWorkersResourceID(clusterName, additionalWorkersName),
		}, func(*omni.MachineSet, *assert.Assertions) {})

		t.Log("updating template cluster")

		opts.CP = opts.CP[:1]

		tmpl2 := renderTemplate(t, cluster2Tmpl, opts)

		require.NoError(operations.SyncTemplate(ctx, bytes.NewReader(tmpl2), os.Stderr, st, operations.SyncOptions{
			Verbose: true,
		}))

		t.Log("waiting for cluster operations to apply")

		time.Sleep(10 * time.Second)

		t.Log("wait for cluster to be ready")

		// wait using the status command
		require.NoError(operations.StatusTemplate(ctx, bytes.NewReader(tmpl2), os.Stderr, st, operations.StatusOptions{
			Wait: true,
		}))

		// re-check with short timeout to make sure the cluster is ready
		checkCtx, checkCancel = context.WithTimeout(ctx, 10*time.Second)
		defer checkCancel()

		rtestutils.AssertResources(checkCtx, t, st, []string{clusterName}, func(status *omni.ClusterStatus, assert *assert.Assertions) {
			spec := status.TypedSpec().Value

			assert.Truef(spec.Available, "not available: %s", resourceDetails(status))
			assert.Equalf(specs.ClusterStatusSpec_RUNNING, spec.Phase, "cluster is not in phase running: %s", resourceDetails(status))
			assert.Equalf(spec.GetMachines().Total, spec.GetMachines().Healthy, "not all machines are healthy: %s", resourceDetails(status))
			assert.Truef(spec.Ready, "cluster is not ready: %s", resourceDetails(status))
			assert.Truef(spec.ControlplaneReady, "cluster controlplane is not ready: %s", resourceDetails(status))
			assert.Truef(spec.KubernetesAPIReady, "cluster kubernetes API is not ready: %s", resourceDetails(status))
			assert.EqualValuesf(len(opts.CP)+len(opts.W), spec.GetMachines().Total, "total machines is not the same as in the machine sets: %s", resourceDetails(status))
		})

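		// the original template is reused for deletion, so validate it once more before tearing the cluster down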
		require.NoError(operations.ValidateTemplate(bytes.NewReader(tmpl1)))

		t.Log("deleting template cluster")

		require.NoError(operations.DeleteTemplate(ctx, bytes.NewReader(tmpl1), os.Stderr, st, operations.SyncOptions{
			Verbose: true,
		}))

		rtestutils.AssertNoResource[*omni.Cluster](ctx, t, st, clusterName)

		// make sure machines are returned to the pool or allocated into another cluster
		rtestutils.AssertResources(ctx, t, st, machineIDs, func(machineStatus *omni.MachineStatus, assert *assert.Assertions) {
			assert.True(machineStatus.Metadata().Labels().Matches(resource.LabelTerm{
				Key: omni.MachineStatusLabelAvailable,
				Op:  resource.LabelOpExists,
			}) || machineStatus.Metadata().Labels().Matches(resource.LabelTerm{
				Key:    omni.LabelCluster,
				Op:     resource.LabelOpEqual,
				Value:  []string{clusterName},
				Invert: true,
			}), resourceDetails(machineStatus))
		})
	}
}