omni/internal/integration/suites_test.go
feat: add support for imported cluster secrets
Introduce a new resource, `ImportedClusterSecrets`, for importing an existing secrets bundle.
Add a new field, `imported`, to `ClusterSpec` for utilizing the `ImportedClusterSecrets` resource.
Add a new field, `imported`, to `ClusterSecrets` to indicate the source of the secrets bundle.

This feature is gated behind a feature flag and allows using an existing secrets bundle (`talos gen secrets`) while creating a new Cluster. Clusters created with this method are marked as `tainted`. It is part of a larger story to facilitate importing existing Talos clusters into Omni.

Signed-off-by: Oguz Kilcan <oguz.kilcan@siderolabs.com>

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"context"
	"net/http"
	"testing"
	"time"

	"github.com/siderolabs/talos/pkg/machinery/config"
	talossecrets "github.com/siderolabs/talos/pkg/machinery/config/generate/secrets"
	"github.com/stretchr/testify/require"
	"google.golang.org/protobuf/types/known/durationpb"
	"gopkg.in/yaml.v3"

	"github.com/siderolabs/omni/client/api/omni/specs"
	"github.com/siderolabs/omni/client/pkg/omni/resources"
	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
	"github.com/siderolabs/omni/internal/integration/workloadproxy"
	"github.com/siderolabs/omni/internal/pkg/clientconfig"
)
type assertClusterReadyOptions struct {
talosVersion string
kubernetesVersion string
}
type assertClusterReadyOption func(*assertClusterReadyOptions)
func withTalosVersion(version string) assertClusterReadyOption {
return func(acro *assertClusterReadyOptions) {
acro.talosVersion = version
}
}
func withKubernetesVersion(version string) assertClusterReadyOption {
return func(acro *assertClusterReadyOptions) {
acro.kubernetesVersion = version
}
}
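// assertClusterAndAPIReady asserts that the cluster and its Talos and Kubernetes
// APIs are ready, defaulting to the suite-wide Talos and Kubernetes versions.
// The defaults can be overridden via the functional options above, e.g. (a usage
// sketch, mirroring calls made later in this file):
//
//	assertClusterAndAPIReady(t, clusterName, options,
//		withTalosVersion(options.AnotherTalosVersion),
//		withKubernetesVersion(options.AnotherKubernetesVersion),
//	)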
func assertClusterAndAPIReady(t *testing.T, clusterName string, options *TestOptions, opts ...assertClusterReadyOption) {
optionsStruct := assertClusterReadyOptions{
talosVersion: options.MachineOptions.TalosVersion,
kubernetesVersion: options.MachineOptions.KubernetesVersion,
}
for _, o := range opts {
o(&optionsStruct)
}
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(
t.Context(),
options.omniClient,
clusterName,
optionsStruct.talosVersion,
optionsStruct.kubernetesVersion,
))
}
func testCleanState(options *TestOptions) TestFunc {
return func(t *testing.T) {
ctx := t.Context()
t.Log(`
Bring Omni to a clean state by removing all clusters, config patches, etc. that might have been left over from previous runs.
Wait for all expected machines to join and be in maintenance mode.`)
t.Run(
"DestroyAllClusterRelatedResources",
DestroyAllClusterRelatedResources(ctx, options.omniClient.Omni().State()),
)
// machine discovery, all machines should be in maintenance mode
t.Run(
"LinkCountShouldMatchExpectedMachines",
AssertNumberOfLinks(ctx, options.omniClient.Omni().State(), expectedMachines),
)
t.Run(
"LinksShouldBeConnected",
AssertLinksConnected(ctx, options.omniClient.Omni().State()),
)
t.Run(
"LinksShouldMatchMachines",
AssertMachinesMatchLinks(ctx, options.omniClient.Omni().State()),
)
t.Run(
"MachinesShouldHaveLogs",
AssertMachinesHaveLogs(ctx, options.omniClient.Omni().State(), options.omniClient.Management()),
)
t.Run(
"MachinesShouldBeReachableInMaintenanceMode",
AssertTalosMaintenanceAPIAccessViaOmni(ctx, options.omniClient),
)
t.Run(
"MachinesShouldBeInMaintenanceMode",
AssertMachineStatus(ctx, options.omniClient.Omni().State(), true, "", map[string]string{
omni.MachineStatusLabelConnected: "",
omni.MachineStatusLabelReportingEvents: "",
omni.MachineStatusLabelAvailable: "",
// QEMU-specific labels which should always match, others are specific to the settings (number of cores, etc.)
omni.MachineStatusLabelCPU: "qemu",
omni.MachineStatusLabelArch: "amd64",
omni.MachineStatusLabelPlatform: "metal",
}, nil),
)
}
}
func testImageGeneration(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Parallel()
t.Log(`
Generate various Talos images with Omni and try to download them.`)
t.Run(
"TalosImagesShouldBeDownloadableUsingCLI",
AssertDownloadUsingCLI(t.Context(), options.omniClient, options.OmnictlPath, options.HTTPEndpoint),
)
t.Run(
"TalosImagesShouldBeDownloadable",
AssertSomeImagesAreDownloadable(t.Context(), options.omniClient, func(ctx context.Context, req *http.Request) error {
return clientconfig.SignHTTPRequest(ctx, options.omniClient, req)
}, options.HTTPEndpoint),
)
}
}
func testCLICommands(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Parallel()
t.Log(`
Verify various omnictl commands.`)
t.Run(
"OmnictlUserCLIShouldWork",
AssertUserCLI(t.Context(), options.omniClient, options.OmnictlPath, options.HTTPEndpoint),
)
}
}
func testKubernetesNodeAudit(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Parallel()
clusterName := "integration-k8s-node-audit"
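// Claim the two machines needed by this cluster from the shared pool; claiming
// presumably prevents tests running in parallel from picking up the same
// machines (semantics inferred from usage across this suite).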
options.claimMachines(t, 2)
t.Log(`
Test the auditing of Kubernetes nodes, i.e., when a node is gone from Omni's perspective but still exists in the Kubernetes cluster.`)
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
runTests(
t,
AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(
t.Context(),
options.omniClient,
clusterName,
options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion,
),
)
t.Run(
"KubernetesNodeAuditShouldBePerformed",
AssertKubernetesNodeAudit(
t.Context(),
clusterName,
options,
),
)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testForcedMachineRemoval(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests different scenarios for forced Machine removal (vs. graceful removal from a cluster):
- force remove a Machine which is not allocated (not part of any cluster)
- force remove a worker Machine which is part of the cluster
- force remove a control plane Machine which is part of the cluster, and replace it with a new Machine.
These tests simulate a hardware failure of a Machine which requires a forced removal from Omni.
In the tests, we wipe and reboot the VMs to bring them back as available for the next test.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-forced-removal"
assertClusterReady := func() {
runTests(t, AssertBlockClusterShouldBeReady(
t.Context(),
options.omniClient,
clusterName,
options.MachineOptions.TalosVersion,
))
}
t.Run(
"UnallocatedMachinesShouldBeDestroyable",
AssertUnallocatedMachineDestroyFlow(t.Context(), options.omniClient.Omni().State(), options.RestartAMachineFunc),
)
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterReady()
t.Run(
"WorkerNodesShouldBeForceRemovable",
AssertForceRemoveWorkerNode(t.Context(), options.omniClient.Omni().State(), clusterName, options.FreezeAMachineFunc, options.WipeAMachineFunc),
)
assertClusterReady()
t.Run(
"ControlPlaneNodeShouldBeForceReplaceable",
AssertControlPlaneForceReplaceMachine(
t.Context(),
options.omniClient.Omni().State(),
clusterName,
options.Options,
),
)
assertClusterReady()
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testImmediateClusterDestruction(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Regression test: create a cluster and destroy it without waiting for the cluster to reach any state.`)
t.Parallel()
options.claimMachines(t, 3)
clusterName := "integration-immediate"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 2,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testDefaultCluster(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Create a regular 3 + 2 cluster with an HA control plane, and assert that the cluster is ready and accessible.
Don't make any changes to the cluster.`)
t.Parallel()
clusterOptions := ClusterOptions{
Name: "integration-default",
ControlPlanes: 3,
Workers: 2,
MachineOptions: options.MachineOptions,
}
options.claimMachines(t, clusterOptions.ControlPlanes+clusterOptions.Workers)
runTests(t, AssertClusterCreateAndReady(t.Context(), options.omniClient, clusterOptions))
}
}
func testEncryptedCluster(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Create a 1 + 1 cluster and enable disk encryption via Omni as a KMS.
Don't make any changes to the cluster.`)
t.Parallel()
clusterOptions := ClusterOptions{
Name: "integration-encrypted",
ControlPlanes: 1,
Workers: 1,
MachineOptions: options.MachineOptions,
Features: &specs.ClusterSpec_Features{
DiskEncryption: true,
},
}
options.claimMachines(t, clusterOptions.ControlPlanes+clusterOptions.Workers)
runTests(t, AssertClusterCreateAndReady(t.Context(), options.omniClient, clusterOptions))
}
}
func testSinglenodeCluster(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Create a single-node cluster.
Don't make any changes to the cluster.`)
t.Parallel()
clusterOptions := ClusterOptions{
Name: "integration-singlenode",
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
}
options.claimMachines(t, clusterOptions.ControlPlanes+clusterOptions.Workers)
runTests(t, AssertClusterCreateAndReady(t.Context(), options.omniClient, clusterOptions))
}
}
func testScaleUpAndDown(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests scaling up and down a cluster:
- create a 1+0 cluster
- scale up to 1+1
- scale up to 3+1
- scale down to 3+0
- scale down to 1+0
In between the scaling operations, assert that the cluster is ready and accessible.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-scaling"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeAdded",
ScaleClusterUp(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeAdded",
ScaleClusterUp(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 2,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeRemoved",
ScaleClusterDown(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: -1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeRemoved",
ScaleClusterDown(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: -2,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testScaleUpAndDownMachineClassBasedMachineSets(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests scaling up and down a cluster using machine classes:
- create a 1+0 cluster
- scale up to 1+1
- scale up to 3+1
- scale down to 3+0
- scale down to 1+0
In between the scaling operations, assert that the cluster is ready and accessible.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-scaling-machine-class-based-machine-sets"
t.Run(
"ClusterShouldBeCreated",
CreateClusterWithMachineClass(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeAdded",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeAdded",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 2,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeRemoved",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: -1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeRemoved",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: -2,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testScaleUpAndDownAutoProvisionMachineSets(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests scaling up and down a cluster using an infrastructure provisioner:
- create a 1+0 cluster
- scale up to 1+1
- scale up to 3+1
- scale down to 3+0
- scale down to 1+0
In between the scaling operations, assert that the cluster is ready and accessible.`)
t.Parallel()
clusterName := "integration-scaling-auto-provision"
t.Run(
"ClusterShouldBeCreated",
CreateClusterWithMachineClass(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
InfraProvider: options.defaultInfraProvider(),
MachineOptions: options.MachineOptions,
ProviderData: options.defaultProviderData(),
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeAdded",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: 1,
InfraProvider: options.defaultInfraProvider(),
MachineOptions: options.MachineOptions,
ProviderData: options.defaultProviderData(),
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeAdded",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 2,
Workers: 0,
InfraProvider: options.defaultInfraProvider(),
MachineOptions: options.MachineOptions,
ProviderData: options.defaultProviderData(),
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"OneWorkerShouldBeRemoved",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: -1,
InfraProvider: options.defaultInfraProvider(),
MachineOptions: options.MachineOptions,
ProviderData: options.defaultProviderData(),
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"TwoControlPlanesShouldBeRemoved",
ScaleClusterMachineSets(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: -2,
Workers: 0,
InfraProvider: options.defaultInfraProvider(),
MachineOptions: options.MachineOptions,
ProviderData: options.defaultProviderData(),
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, true, false),
)
}
}
func testRollingUpdateParallelism(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests the rolling update & scale-down strategies for concurrency control on worker machine sets:
- create a 1+3 cluster
- update the worker configs with the rolling strategy using a maxParallelism of 2
- scale down the workers to 0 with the rolling strategy using a maxParallelism of 2
- assert that the maxParallelism of 2 was respected in both operations.`)
t.Parallel()
clusterName := "integration-rolling-update-parallelism"
options.claimMachines(t, 4)
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 3,
MachineOptions: options.MachineOptions,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"WorkersUpdateShouldBeRolledOutWithMaxParallelism",
AssertWorkerNodesRollingConfigUpdate(t.Context(), options.omniClient, clusterName, 2),
)
t.Run(
"WorkersShouldScaleDownWithMaxParallelism",
AssertWorkerNodesRollingScaleDown(t.Context(), options.omniClient, clusterName, 2),
)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testReplaceControlPlanes(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests replacing control plane nodes:
- create a 1+0 cluster
- scale up to 2+0, and immediately remove the first control plane node
In between the scaling operations, assert that the cluster is ready and accessible.`)
t.Parallel()
options.claimMachines(t, 2)
clusterName := "integration-replace-cp"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ControlPlanesShouldBeReplaced",
ReplaceControlPlanes(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
MachineOptions: options.MachineOptions,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testConfigPatching(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests applying various config patches, including "broken" patches which should not be applied.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-config-patching"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"LargeImmediateConfigPatchShouldBeAppliedAndRemoved",
AssertLargeImmediateConfigApplied(t.Context(), options.omniClient, clusterName),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"MachineSetConfigPatchShouldBeAppliedAndRemoved",
AssertConfigPatchMachineSet(t.Context(), options.omniClient, clusterName),
)
t.Run(
"SingleClusterMachineConfigPatchShouldBeAppliedAndRemoved",
AssertConfigPatchSingleClusterMachine(t.Context(), options.omniClient, clusterName),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ConfigPatchWithRebootShouldBeApplied",
AssertConfigPatchWithReboot(t.Context(), options.omniClient, clusterName),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"InvalidConfigPatchShouldNotBeApplied",
AssertConfigPatchWithInvalidConfig(t.Context(), options.omniClient, clusterName),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testTalosUpgrades(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests upgrading the Talos version, including reverting a failed upgrade.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-talos-upgrade"
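// Create the cluster on the older Talos/Kubernetes versions; the test below then
// upgrades it to the suite default via AssertTalosUpgradeFlow.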
machineOptions := MachineOptions{
TalosVersion: options.AnotherTalosVersion,
KubernetesVersion: options.AnotherKubernetesVersion, // use older Kubernetes compatible with AnotherTalosVersion
}
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
MachineOptions: machineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options, withTalosVersion(machineOptions.TalosVersion), withKubernetesVersion(machineOptions.KubernetesVersion))
if !options.SkipExtensionsCheckOnCreate {
t.Run(
"HelloWorldServiceExtensionShouldBePresent",
AssertExtensionIsPresent(t.Context(), options.omniClient, clusterName, HelloWorldServiceExtensionName),
)
}
t.Run(
"TalosSchematicUpdateShouldSucceed",
AssertTalosSchematicUpdateFlow(t.Context(), options.omniClient, clusterName),
)
t.Run(
"QemuGuestAgentExtensionShouldBePresent",
AssertExtensionIsPresent(t.Context(), options.omniClient, clusterName, QemuGuestAgentExtensionName),
)
t.Run(
"ClusterBootstrapManifestSyncShouldBeSuccessful",
KubernetesBootstrapManifestSync(t.Context(), options.omniClient.Management(), clusterName),
)
t.Run(
"TalosUpgradeShouldSucceed",
AssertTalosUpgradeFlow(t.Context(), options.omniClient.Omni().State(), clusterName, options.MachineOptions.TalosVersion),
)
t.Run(
"ClusterBootstrapManifestSyncShouldBeSuccessful",
KubernetesBootstrapManifestSync(t.Context(), options.omniClient.Management(), clusterName),
)
if !options.SkipExtensionsCheckOnCreate {
t.Run(
"HelloWorldServiceExtensionShouldBePresent",
AssertExtensionIsPresent(t.Context(), options.omniClient, clusterName, HelloWorldServiceExtensionName),
)
}
assertClusterAndAPIReady(t, clusterName, options, withTalosVersion(options.MachineOptions.TalosVersion), withKubernetesVersion(machineOptions.KubernetesVersion))
t.Run(
"FailedTalosUpgradeShouldBeRevertible",
AssertTalosUpgradeIsRevertible(t.Context(), options.omniClient.Omni().State(), clusterName, options.MachineOptions.TalosVersion),
)
t.Run(
"RunningTalosUpgradeShouldBeCancelable",
AssertTalosUpgradeIsCancelable(t.Context(), options.omniClient.Omni().State(), clusterName, options.MachineOptions.TalosVersion, options.AnotherTalosVersion),
)
assertClusterAndAPIReady(t, clusterName, options, withKubernetesVersion(machineOptions.KubernetesVersion))
t.Run(
"MaintenanceTestConfigShouldStillBePresent",
AssertMaintenanceTestConfigIsPresent(t.Context(), options.omniClient.Omni().State(), clusterName, 0), // check the maintenance config in the first machine
)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testKubernetesUpgrades(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests upgrading the Kubernetes version, including reverting a failed upgrade.`)
t.Parallel()
options.claimMachines(t, 4)
clusterName := "integration-k8s-upgrade"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
MachineOptions: MachineOptions{
TalosVersion: options.MachineOptions.TalosVersion,
KubernetesVersion: options.AnotherKubernetesVersion,
},
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options, withKubernetesVersion(options.AnotherKubernetesVersion))
t.Run(
"KubernetesUpgradeShouldSucceed",
AssertKubernetesUpgradeFlow(
t.Context(), options.omniClient.Omni().State(), options.omniClient.Management(),
clusterName,
options.MachineOptions.KubernetesVersion,
),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"FailedKubernetesUpgradeShouldBeRevertible",
AssertKubernetesUpgradeIsRevertible(t.Context(), options.omniClient.Omni().State(), clusterName, options.MachineOptions.KubernetesVersion),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testEtcdBackupAndRestore(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests automatic & manual backup & restore for workload etcd.
Automatic backups are enabled and taken, and then a manual backup is created.
Afterwards, the cluster's control plane is destroyed and then recovered from the backup.
Finally, a completely new cluster is created from the same backup to test "point-in-time recovery".`)
t.Parallel()
options.claimMachines(t, 6)
clusterName := "integration-etcd-backup"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
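// Enable automatic backups with a deliberately long interval; only the initial
// automatic backup is expected to fire during the test run (an assumption about
// the backup scheduler's behavior).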
EtcdBackup: &specs.EtcdBackupConf{
Interval: durationpb.New(2 * time.Hour),
Enabled: true,
},
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
runTests(t,
AssertBlockKubernetesDeploymentCreateAndRunning(t.Context(), options.omniClient.Management(),
clusterName,
"default",
"test",
),
)
t.Run(
"KubernetesSecretShouldBeCreated",
AssertKubernetesSecretIsCreated(t.Context(), options.omniClient.Management(),
clusterName, "default", "test", "backup-test-secret-val"),
)
t.Run(
"EtcdAutomaticBackupShouldBeCreated",
AssertEtcdAutomaticBackupIsCreated(t.Context(), options.omniClient.Omni().State(), clusterName),
)
t.Run(
"EtcdManualBackupShouldBeCreated",
AssertEtcdManualBackupIsCreated(t.Context(), options.omniClient.Omni().State(), clusterName),
)
secondClusterName := "integration-etcd-backup-new-cluster"
runTests(
t,
AssertBlockCreateClusterFromEtcdBackup(t.Context(), options.omniClient, options.Options,
clusterName,
secondClusterName,
"default",
"test",
),
)
t.Run(
"EtcdSecretShouldBeSameAfterCreateFromBackup",
AssertKubernetesSecretHasValue(t.Context(), options.omniClient.Management(), secondClusterName, "default", "test", "backup-test-secret-val"),
)
t.Run(
"NewClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), secondClusterName, false, false),
)
runTests(
t,
AssertBlockRestoreEtcdFromLatestBackup(t.Context(), options.omniClient, options.Options,
3,
clusterName,
"default",
"test",
),
)
t.Run(
"RestoredClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testMaintenanceUpgrade(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test upgrading (or downgrading) a machine in maintenance mode.
Create a cluster out of a single machine on version1, then remove the cluster (the machine stays on version1, with Talos installed).
Create a cluster out of the same machine on version2; Omni should upgrade the machine to version2 while it is in maintenance mode.`)
t.Parallel()
options.claimMachines(t, 1)
t.Run(
"MachineShouldBeUpgradedInMaintenanceMode",
AssertMachineShouldBeUpgradedInMaintenanceMode(
t.Context(), options.omniClient,
"integration-maintenance-upgrade",
options.AnotherKubernetesVersion,
options.MachineOptions.TalosVersion,
options.AnotherTalosVersion,
),
)
}
}
func testAuth(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test authorization when accessing the Omni API; some tests run without a cluster, while others run in the context of a cluster.`)
t.Parallel()
options.claimMachines(t, 1)
t.Run(
"AnonymousRequestShouldBeDenied",
AssertAnonymousAuthenication(t.Context(), options.omniClient),
)
t.Run(
"InvalidSignatureShouldBeDenied",
AssertAPIInvalidSignature(t.Context(), options.omniClient),
)
t.Run(
"PublicKeyWithoutLifetimeShouldNotBeRegistered",
AssertPublicKeyWithoutLifetimeNotRegistered(t.Context(), options.omniClient),
)
t.Run(
"PublicKeyWithLongLifetimeShouldNotBeRegistered",
AssertPublicKeyWithLongLifetimeNotRegistered(t.Context(), options.omniClient),
)
t.Run(
"OmniconfigShouldBeDownloadable",
AssertOmniconfigDownload(t.Context(), options.omniClient),
)
t.Run(
"PublicKeyWithUnknownEmailShouldNotBeRegistered",
AssertRegisterPublicKeyWithUnknownEmail(t.Context(), options.omniClient),
)
t.Run(
"ServiceAccountAPIShouldWork",
AssertServiceAccountAPIFlow(t.Context(), options.omniClient),
)
t.Run(
"ResourceAuthzShouldWork",
AssertResourceAuthz(t.Context(), options.omniClient, options.clientConfig),
)
t.Run(
"ResourceAuthzWithACLShouldWork",
AssertResourceAuthzWithACL(t.Context(), options.omniClient, options.clientConfig),
)
clusterName := "integration-auth"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
Features: &specs.ClusterSpec_Features{
UseEmbeddedDiscoveryService: true,
},
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"APIAuthorizationShouldBeTested",
AssertAPIAuthz(t.Context(), options.omniClient, options.clientConfig, clusterName),
)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testClusterTemplate(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test the flow of cluster creation and scaling using cluster templates.`)
t.Parallel()
options.claimMachines(t, 5)
t.Run(
"TestClusterTemplateFlow",
AssertClusterTemplateFlow(t.Context(), options.omniClient.Omni().State(), options.MachineOptions),
)
}
}
func testWorkloadProxy(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test the workload service proxying feature.`)
t.Parallel()
options.claimMachines(t, 6)
omniClient := options.omniClient
cluster1 := "integration-workload-proxy-1"
cluster2 := "integration-workload-proxy-2"
t.Run("ClusterShouldBeCreated-"+cluster1, CreateCluster(t.Context(), omniClient, ClusterOptions{
Name: cluster1,
ControlPlanes: 1,
Workers: 1,
Features: &specs.ClusterSpec_Features{
EnableWorkloadProxy: true,
},
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
AllowSchedulingOnControlPlanes: true,
}))
t.Run("ClusterShouldBeCreated-"+cluster2, CreateCluster(t.Context(), omniClient, ClusterOptions{
Name: cluster2,
ControlPlanes: 1,
Workers: 2,
Features: &specs.ClusterSpec_Features{
EnableWorkloadProxy: true,
},
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
AllowSchedulingOnControlPlanes: true,
}))
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(t.Context(), omniClient, cluster1, options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion))
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(t.Context(), omniClient, cluster2, options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion))
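// Capture the parent test's context: inside the subtest closure below, t is
// shadowed by the subtest's own *testing.T, and workloadproxy.Test is
// deliberately given the parent's context.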
parentCtx := t.Context()
t.Run("WorkloadProxyShouldBeTested", func(t *testing.T) {
workloadproxy.Test(parentCtx, t, omniClient, cluster1, cluster2)
})
t.Run("ClusterShouldBeDestroyed-"+cluster1, AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), cluster1, false, false))
t.Run("ClusterShouldBeDestroyed-"+cluster2, AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), cluster2, false, false))
}
}
func testStaticInfraProvider(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Tests common Omni operations on machines created by a static infrastructure provider.
Note: this test expects all machines to be provisioned by the bare-metal infra provider, as it doesn't filter them.
- create a 1+0 cluster - assert that the cluster is healthy and ready
- scale it up to 3+1 - assert that the cluster is healthy and ready
- assert that the machines are not ready to use (occupied)
- scale it down to 1+0 - assert that the cluster is healthy and ready
- destroy the cluster - assert that the machines are wiped, then marked as ready to use
- create a new 3+1 cluster
- assert that the cluster is healthy and ready
- remove the links of the machines
`)
t.Parallel()
clusterName := "integration-static-infra-provider"
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: true,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeScaledUp",
ScaleClusterUp(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 2,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ExtensionsShouldBeUpdated",
UpdateExtensions(t.Context(), options.omniClient, clusterName, []string{"siderolabs/binfmt-misc", "siderolabs/glibc"}),
)
t.Run(
"MachinesShouldBeAllocated",
AssertInfraMachinesAreAllocated(t.Context(), options.omniClient.Omni().State(), clusterName,
options.MachineOptions.TalosVersion, []string{"siderolabs/binfmt-misc", "siderolabs/glibc"}),
)
t.Run(
"ClusterShouldBeScaledDown",
ScaleClusterDown(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: -2,
Workers: -1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, true),
)
t.Run(
"ClusterShouldBeRecreated",
CreateCluster(t.Context(), options.omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: true,
}),
)
assertClusterAndAPIReady(t, clusterName, options)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, true),
)
}
}
func testOmniUpgradePrepare(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test Omni upgrades: the first half, which runs on the previous Omni version.
- create a 3+1 cluster
- enable and verify workload proxying
- save a cluster snapshot in the cluster resource for future use`)
t.Parallel()
options.claimMachines(t, 4)
omniClient := options.omniClient
clusterName := "integration-omni-upgrades"
t.Run("ClusterShouldBeCreated", CreateCluster(t.Context(), omniClient, ClusterOptions{
Name: clusterName,
ControlPlanes: 3,
Workers: 1,
Features: &specs.ClusterSpec_Features{
EnableWorkloadProxy: true,
},
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
SkipExtensionCheckOnCreate: options.SkipExtensionsCheckOnCreate,
AllowSchedulingOnControlPlanes: true,
}))
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(t.Context(), omniClient, clusterName, options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion))
parentCtx := t.Context()
t.Run("WorkloadProxyShouldBeTested", func(t *testing.T) {
workloadproxy.Test(parentCtx, t, omniClient, clusterName)
})
t.Run("SaveClusterSnapshot", SaveClusterSnapshot(t.Context(), omniClient, clusterName))
}
}
func testOmniUpgradeVerify(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Test Omni upgrades: the second half, which runs on the current Omni version.
- check that the cluster exists and is healthy
- verify that the machines were not restarted
- check that the machine configuration was not changed
- verify that workload proxying still works
- scale up the cluster by one worker`)
t.Parallel()
options.claimMachines(t, 5)
omniClient := options.omniClient
clusterName := "integration-omni-upgrades"
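// Reuse the cluster created by testOmniUpgradePrepare on the previous Omni version.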
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(t.Context(), omniClient, clusterName, options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion))
parentCtx := t.Context()
t.Run("AssertMachinesNotRebootedConfigUnchanged", AssertClusterSnapshot(t.Context(), omniClient, clusterName))
t.Run("WorkloadProxyShouldBeTested", func(t *testing.T) {
workloadproxy.Test(parentCtx, t, omniClient, clusterName)
})
t.Run(
"OneWorkerShouldBeAdded",
ScaleClusterUp(t.Context(), options.omniClient.Omni().State(), ClusterOptions{
Name: clusterName,
ControlPlanes: 0,
Workers: 1,
MachineOptions: options.MachineOptions,
ScalingTimeout: options.ScalingTimeout,
}),
)
runTests(t, AssertBlockClusterAndTalosAPIAndKubernetesShouldBeReady(t.Context(), omniClient, clusterName, options.MachineOptions.TalosVersion,
options.MachineOptions.KubernetesVersion))
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterName, false, false),
)
}
}
func testClusterImport(options *TestOptions) TestFunc {
return func(t *testing.T) {
t.Log(`
Create a single-node imported cluster; assert that the cluster is ready, accessible, and using the imported secrets bundle.`)
t.Parallel()
clusterOptions := ClusterOptions{
Name: "integration-imported-cluster",
ControlPlanes: 1,
Workers: 0,
MachineOptions: options.MachineOptions,
SkipExtensionCheckOnCreate: true,
}
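// Generate a fresh secrets bundle (the equivalent of `talos gen secrets`) and
// register it as an ImportedClusterSecrets resource, so that the cluster created
// below picks up the imported secrets.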
bundle, err := talossecrets.NewBundle(talossecrets.NewFixedClock(time.Now()), config.TalosVersion1_10)
require.NoError(t, err)
bundleYaml, err := yaml.Marshal(bundle)
require.NoError(t, err)
ics := omni.NewImportedClusterSecrets(resources.DefaultNamespace, clusterOptions.Name)
ics.TypedSpec().Value.Data = string(bundleYaml)
require.NoError(t, options.omniClient.Omni().State().Create(t.Context(), ics))
options.claimMachines(t, clusterOptions.ControlPlanes+clusterOptions.Workers)
t.Run(
"ClusterShouldBeCreated",
CreateCluster(t.Context(), options.omniClient, clusterOptions),
)
assertClusterAndAPIReady(t, clusterOptions.Name, options)
assertClusterIsImported(t.Context(), t, options.omniClient.Omni().State(), clusterOptions.Name, bundleYaml)
t.Run(
"ClusterShouldBeDestroyed",
AssertDestroyCluster(t.Context(), options.omniClient.Omni().State(), clusterOptions.Name, false, false),
)
}
}
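// assertClusterIsImported is defined elsewhere in this file. A minimal sketch of
// the check it presumably performs, assuming ClusterSecrets exposes the new
// `imported` flag and the stored bundle (field names here are assumptions based
// on the commit description, not the actual implementation):
//
//	secrets, err := safe.StateGetByID[*omni.ClusterSecrets](ctx, st, clusterName)
//	require.NoError(t, err)
//	require.True(t, secrets.TypedSpec().Value.Imported)
//	require.YAMLEq(t, string(expectedBundle), string(secrets.TypedSpec().Value.Data))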