All test modules were moved under the `integration` tag and now live in the `internal/integration` folder: there is no more `cmd/integration-test` executable. The new Kres version is able to build the same executable from the tests directory instead. All Omni-related flags were renamed, for example `--endpoint` -> `--omni.endpoint`.

Two more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed the finalizers which were running when a test had failed.

Both of these changes should make it easier to understand a test failure: Talos node logs won't be cluttered with the finalizer tearing down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
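With the tests now living in `internal/integration` behind the `integration` build tag, a run amounts to building the test binary from that directory and pointing it at an Omni instance. A rough sketch (the output binary name and endpoint URL are illustrative; only the `integration` tag, the directory, and the `--omni.`-prefixed flag naming come from this change):

    go test -c -tags integration -o omni-integration-test ./internal/integration
    ./omni-integration-test --omni.endpoint=https://my-omni.example.org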
// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
    "context"
    "fmt"
    "math/rand/v2"
    "os"
    "path/filepath"
    "sync"
    "testing"
    "time"

    "github.com/cosi-project/runtime/pkg/resource"
    "github.com/cosi-project/runtime/pkg/resource/rtestutils"
    "github.com/cosi-project/runtime/pkg/safe"
    "github.com/cosi-project/runtime/pkg/state"
    "github.com/siderolabs/gen/pair"
    "github.com/siderolabs/gen/xslices"
    "github.com/siderolabs/go-retry/retry"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
    "gopkg.in/yaml.v3"

    "github.com/siderolabs/omni/client/api/omni/specs"
    "github.com/siderolabs/omni/client/pkg/client"
    "github.com/siderolabs/omni/client/pkg/omni/resources"
    "github.com/siderolabs/omni/client/pkg/omni/resources/infra"
    "github.com/siderolabs/omni/client/pkg/omni/resources/omni"
    "github.com/siderolabs/omni/client/pkg/omni/resources/siderolink"
    "github.com/siderolabs/omni/client/pkg/omni/resources/virtual"
)

// BeforeClusterCreateFunc is a function that is called before a cluster is created.
type BeforeClusterCreateFunc func(ctx context.Context, t *testing.T, cli *client.Client, machineIDs []resource.ID)

// ClusterOptions are the options for cluster creation.
//
//nolint:govet
type ClusterOptions struct {
    Name string

    // RestoreFromEtcdBackupClusterID is the cluster ID of the cluster to restore from.
    // When specified, the cluster will be created with the etcd from the latest etcd backup of the specified cluster.
    RestoreFromEtcdBackupClusterID string

    ControlPlanes, Workers int
    Features               *specs.ClusterSpec_Features
    EtcdBackup             *specs.EtcdBackupConf

    MachineOptions MachineOptions

    BeforeClusterCreateFunc BeforeClusterCreateFunc

    InfraProvider string
    ProviderData  string

    ScalingTimeout time.Duration

    SkipExtensionCheckOnCreate bool
}

// CreateCluster verifies cluster creation.
func CreateCluster(testCtx context.Context, cli *client.Client, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        if options.ScalingTimeout == 0 {
            options.ScalingTimeout = time.Second * 150
        }

        ctx, cancel := context.WithTimeout(testCtx, options.ScalingTimeout)
        defer cancel()

        st := cli.Omni().State()
        require := require.New(t)

        pickUnallocatedMachines(ctx, t, st, options.ControlPlanes+options.Workers, func(machineIDs []resource.ID) {
            if !options.SkipExtensionCheckOnCreate {
                checkExtensionWithRetries(ctx, t, cli, HelloWorldServiceExtensionName, machineIDs...)
            }

            if options.BeforeClusterCreateFunc != nil {
                options.BeforeClusterCreateFunc(ctx, t, cli, machineIDs)
            }

            cluster := omni.NewCluster(resources.DefaultNamespace, options.Name)
            cluster.TypedSpec().Value.TalosVersion = options.MachineOptions.TalosVersion
            cluster.TypedSpec().Value.KubernetesVersion = options.MachineOptions.KubernetesVersion
            cluster.TypedSpec().Value.Features = options.Features
            cluster.TypedSpec().Value.BackupConfiguration = options.EtcdBackup

            require.NoError(st.Create(ctx, cluster))

            for i := range options.ControlPlanes {
                t.Logf("Adding machine '%s' to control plane (cluster %q)", machineIDs[i], options.Name)
                bindMachine(ctx, t, st, bindMachineOptions{
                    clusterName:                    options.Name,
                    role:                           omni.LabelControlPlaneRole,
                    machineID:                      machineIDs[i],
                    restoreFromEtcdBackupClusterID: options.RestoreFromEtcdBackupClusterID,
                })
            }

            for i := options.ControlPlanes; i < options.ControlPlanes+options.Workers; i++ {
                t.Logf("Adding machine '%s' to workers (cluster %q)", machineIDs[i], options.Name)
                bindMachine(ctx, t, st, bindMachineOptions{
                    clusterName: options.Name,
                    role:        omni.LabelWorkerRole,
                    machineID:   machineIDs[i],
                })
            }

            // assert that machines got allocated (label available is removed)
            rtestutils.AssertResources(ctx, t, st, machineIDs, func(machineStatus *omni.MachineStatus, assert *assert.Assertions) {
                assert.True(machineStatus.Metadata().Labels().Matches(
                    resource.LabelTerm{
                        Key:    omni.MachineStatusLabelAvailable,
                        Op:     resource.LabelOpExists,
                        Invert: true,
                    },
                ), resourceDetails(machineStatus))
            })
        })
    }
}

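// An illustrative composition of the helpers above (a sketch only: the test runner, the client and
// the option values come from the calling test suite, not from this file):
//
//    t.Run("CreateCluster", CreateCluster(ctx, cli, ClusterOptions{
//        Name:           "integration-test",
//        ControlPlanes:  1,
//        Workers:        1,
//        MachineOptions: machineOptions,
//    }))
//    t.Run("ClusterStatusReady", AssertClusterStatusReady(ctx, cli.Omni().State(), "integration-test"))
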
// CreateClusterWithMachineClass verifies cluster creation.
func CreateClusterWithMachineClass(testCtx context.Context, st state.State, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        if options.ScalingTimeout == 0 {
            options.ScalingTimeout = time.Second * 150
        }

        ctx, cancel := context.WithTimeout(testCtx, options.ScalingTimeout)
        defer cancel()

        require := require.New(t)

        cluster := omni.NewCluster(resources.DefaultNamespace, options.Name)
        cluster.TypedSpec().Value.TalosVersion = options.MachineOptions.TalosVersion
        cluster.TypedSpec().Value.KubernetesVersion = options.MachineOptions.KubernetesVersion
        cluster.TypedSpec().Value.Features = options.Features
        cluster.TypedSpec().Value.BackupConfiguration = options.EtcdBackup

        kubespanEnabler := omni.NewConfigPatch(resources.DefaultNamespace, fmt.Sprintf("%s-kubespan-enabler", options.Name))
        kubespanEnabler.Metadata().Labels().Set(omni.LabelCluster, options.Name)

        err := kubespanEnabler.TypedSpec().Value.SetUncompressedData([]byte(`machine:
  network:
    kubespan:
      enabled: true
`))
        require.NoError(err)

        require.NoError(st.Create(ctx, cluster))
        require.NoError(st.Create(ctx, kubespanEnabler))

        machineClass := omni.NewMachineClass(resources.DefaultNamespace, options.Name)

        if options.InfraProvider != "" {
            createOrUpdate(ctx, t, st, machineClass, func(r *omni.MachineClass) error {
                r.TypedSpec().Value.MatchLabels = nil
                r.TypedSpec().Value.AutoProvision = &specs.MachineClassSpec_Provision{
                    ProviderId:   options.InfraProvider,
                    ProviderData: options.ProviderData,
                }

                return nil
            })
        } else {
            createOrUpdate(ctx, t, st, machineClass, func(r *omni.MachineClass) error {
                r.TypedSpec().Value.MatchLabels = []string{omni.MachineStatusLabelConnected}
                r.TypedSpec().Value.AutoProvision = nil

                return nil
            })
        }

        updateMachineClassMachineSets(ctx, t, st, options, machineClass)
    }
}

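// Note: CreateCluster above binds explicitly picked machines to the machine sets, while
// CreateClusterWithMachineClass delegates machine selection to a MachineClass: either by matching
// connected machines via labels, or, when InfraProvider is set, by auto-provisioning machines
// through the configured infrastructure provider.
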
// ScaleClusterMachineSets scales the cluster with machine sets which are using machine classes.
func ScaleClusterMachineSets(testCtx context.Context, st state.State, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        if options.ScalingTimeout == 0 {
            options.ScalingTimeout = time.Second * 30
        }

        ctx, cancel := context.WithTimeout(testCtx, options.ScalingTimeout)
        defer cancel()

        updateMachineClassMachineSets(ctx, t, st, options, nil)
    }
}

// ScaleClusterUp scales up the cluster.
func ScaleClusterUp(testCtx context.Context, st state.State, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        if options.ScalingTimeout == 0 {
            options.ScalingTimeout = time.Second * 30
        }

        ctx, cancel := context.WithTimeout(testCtx, options.ScalingTimeout)
        defer cancel()

        pickUnallocatedMachines(ctx, t, st, options.ControlPlanes+options.Workers, func(machineIDs []resource.ID) {
            for i := range options.ControlPlanes {
                t.Logf("Adding machine '%s' to control plane (cluster %q)", machineIDs[i], options.Name)

                bindMachine(ctx, t, st, bindMachineOptions{
                    clusterName: options.Name,
                    role:        omni.LabelControlPlaneRole,
                    machineID:   machineIDs[i],
                })
            }

            for i := options.ControlPlanes; i < options.ControlPlanes+options.Workers; i++ {
                t.Logf("Adding machine '%s' to workers (cluster %q)", machineIDs[i], options.Name)

                bindMachine(ctx, t, st, bindMachineOptions{
                    clusterName: options.Name,
                    role:        omni.LabelWorkerRole,
                    machineID:   machineIDs[i],
                })
            }

            // assert that machines got allocated (label available is removed)
            rtestutils.AssertResources(ctx, t, st, machineIDs, func(machineStatus *omni.MachineStatus, assert *assert.Assertions) {
                assert.True(machineStatus.Metadata().Labels().Matches(
                    resource.LabelTerm{
                        Key:    omni.MachineStatusLabelAvailable,
                        Op:     resource.LabelOpExists,
                        Invert: true,
                    },
                ), resourceDetails(machineStatus))
            })

            // assert that ClusterMachines got created
            rtestutils.AssertResources(ctx, t, st, machineIDs, func(*omni.ClusterMachine, *assert.Assertions) {})
        })
    }
}

// ScaleClusterDown scales the cluster down.
//
// Pass a negative ControlPlanes/Workers count to remove that many nodes from the corresponding machine set.
// Pass 0 to leave the machine set as is.
// The control plane machine set can never be scaled down to zero.
func ScaleClusterDown(testCtx context.Context, st state.State, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 10*time.Second)
        defer cancel()

        controlPlanes := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st,
            state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, options.Name), resource.LabelExists(omni.LabelControlPlaneRole)))

        workers := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st,
            state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, options.Name), resource.LabelExists(omni.LabelWorkerRole)))

        if options.ControlPlanes < 0 {
            finalCount := len(controlPlanes) + options.ControlPlanes

            require.Greaterf(t, finalCount, 0, "can't scale down")
            controlPlanes = controlPlanes[finalCount:]

            t.Logf("Removing machines '%s' from control planes (cluster %q)", controlPlanes, options.Name)

            rtestutils.Destroy[*omni.MachineSetNode](ctx, t, st, controlPlanes)
        }

        if options.Workers < 0 {
            finalCount := len(workers) + options.Workers

            require.GreaterOrEqualf(t, finalCount, 0, "can't scale down")
            workers = workers[finalCount:]

            t.Logf("Removing machines '%s' from workers (cluster %q)", workers, options.Name)

            rtestutils.Destroy[*omni.MachineSetNode](ctx, t, st, workers)
        }
    }
}

// ReplaceControlPlanes replaces controlplane nodes.
func ReplaceControlPlanes(testCtx context.Context, st state.State, options ClusterOptions) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
        defer cancel()

        existingControlPlanes := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st,
            state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, options.Name), resource.LabelExists(omni.LabelControlPlaneRole)),
        )

        pickUnallocatedMachines(ctx, t, st, len(existingControlPlanes), func(machineIDs []resource.ID) {
            for _, machineID := range machineIDs {
                t.Logf("Adding machine '%s' to control plane (cluster %q)", machineID, options.Name)

                bindMachine(ctx, t, st, bindMachineOptions{
                    clusterName: options.Name,
                    role:        omni.LabelControlPlaneRole,
                    machineID:   machineID,
                })
            }

            t.Logf("Removing machines '%s' from control planes (cluster %q)", existingControlPlanes, options.Name)

            rtestutils.Destroy[*omni.MachineSetNode](ctx, t, st, existingControlPlanes)

            // assert that machines got allocated (label available is removed)
            rtestutils.AssertResources(ctx, t, st, machineIDs, func(machineStatus *omni.MachineStatus, assert *assert.Assertions) {
                assert.True(machineStatus.Metadata().Labels().Matches(
                    resource.LabelTerm{
                        Key:    omni.MachineStatusLabelAvailable,
                        Op:     resource.LabelOpExists,
                        Invert: true,
                    },
                ), resourceDetails(machineStatus))
            })

            // assert that ClusterMachines got created
            rtestutils.AssertResources(ctx, t, st, machineIDs, func(*omni.ClusterMachine, *assert.Assertions) {})
        })
    }
}

// AssertClusterMachinesStage verifies that cluster machines reach a specified stage.
func AssertClusterMachinesStage(testCtx context.Context, st state.State, clusterName string, stage specs.ClusterMachineStatusSpec_Stage) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 6*time.Minute)
        defer cancel()

        require := require.New(t)

        machineIDs := getMachineSetNodes(ctx, t, st, clusterName)
        require.NotEmpty(machineIDs, "no machine set nodes found for cluster %q", clusterName)

        // assert that all machine set nodes are present as cluster machines
        rtestutils.AssertResources(ctx, t, st, machineIDs, func(*omni.ClusterMachine, *assert.Assertions) {})

        // assert that there are no cluster machines which are not machine set nodes
        clusterMachines, err := safe.ReaderListAll[*omni.ClusterMachine](ctx, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))
        require.NoError(err)

        machineIDMap := xslices.ToSet(machineIDs)

        clusterMachines.ForEach(func(r *omni.ClusterMachine) {
            cmID := r.Metadata().ID()

            if _, ok := machineIDMap[cmID]; ok && r.Metadata().Phase() == resource.PhaseRunning {
                // cluster machine matches expected machine set node
                return
            }

            // wait for the cluster machine to be cleaned up
            rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, st, cmID)
        })

        // retry with the poller as the set of the machine set nodes can be changed during the lifecycle
        err = retry.Constant(time.Minute*6, retry.WithUnits(time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
            machineIDs := getMachineSetNodes(ctx, t, st, clusterName)

            for _, machine := range machineIDs {
                var status *omni.ClusterMachineStatus

                status, err = safe.ReaderGetByID[*omni.ClusterMachineStatus](ctx, st, machine)
                if err != nil && !state.IsNotFoundError(err) {
                    return err
                }

                if status == nil {
                    return retry.ExpectedErrorf("machine %q status doesn't exist yet", machine)
                }

                spec := status.TypedSpec().Value

                if spec.Stage != stage {
                    return retry.ExpectedErrorf("%s != %s, %s", stage.String(), spec.Stage.String(), resourceDetails(status))
                }
            }

            return nil
        })
        require.NoError(err)
    }
}

// AssertClusterMachinesReady verifies that cluster machines reach ready state.
func AssertClusterMachinesReady(testCtx context.Context, st state.State, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 4*time.Minute)
        defer cancel()

        require := require.New(t)

        machineIDs := getMachineSetNodes(ctx, t, st, clusterName)
        require.NotEmpty(machineIDs)

        rtestutils.AssertResources(ctx, t, st, machineIDs, func(*omni.ClusterMachine, *assert.Assertions) {})

        rtestutils.AssertResources(ctx, t, st, machineIDs, func(status *omni.ClusterMachineStatus, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.Truef(spec.Ready, "cluster machine status not ready: %s", resourceDetails(status))
        })

        rtestutils.AssertResources(ctx, t, st, machineIDs, func(status *omni.ClusterMachineIdentity, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.NotEmptyf(spec.NodeIdentity, "no node identity: %s", resourceDetails(status))
        })
    }
}

// AssertClusterStatusReady verifies that cluster status reaches ready state.
func AssertClusterStatusReady(testCtx context.Context, st state.State, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, time.Minute*5)
        defer cancel()

        require := require.New(t)

        rtestutils.AssertResources(ctx, t, st, []string{clusterName}, func(status *omni.ClusterStatus, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            machineIDs := getMachineSetNodes(ctx, t, st, clusterName)
            require.NotEmpty(machineIDs)

            assert.Truef(spec.Available, "not available: %s", resourceDetails(status))
            assert.Equalf(specs.ClusterStatusSpec_RUNNING, spec.Phase, "cluster is not in phase running: %s", resourceDetails(status))
            assert.Equalf(spec.GetMachines().Total, spec.GetMachines().Healthy, "not all machines are healthy: %s", resourceDetails(status))
            assert.Truef(spec.Ready, "cluster is not ready: %s", resourceDetails(status))
            assert.Truef(spec.ControlplaneReady, "cluster controlplane is not ready: %s", resourceDetails(status))
            assert.Truef(spec.KubernetesAPIReady, "cluster kubernetes API is not ready: %s", resourceDetails(status))
            assert.EqualValuesf(len(machineIDs), spec.GetMachines().Total, "total machines is not the same as in the machine sets: %s", resourceDetails(status))
        })
    }
}

// AssertClusterLoadBalancerReady verifies that cluster load balancer reaches ready state.
func AssertClusterLoadBalancerReady(testCtx context.Context, st state.State, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 30*time.Second)
        defer cancel()

        rtestutils.AssertResources(ctx, t, st, []string{clusterName}, func(status *omni.LoadBalancerStatus, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.Truef(spec.Healthy, "lb not healthy: %s", resourceDetails(status))
        })
    }
}

// AssertClusterKubernetesVersion verifies that Kubernetes version matches expectations.
func AssertClusterKubernetesVersion(testCtx context.Context, st state.State, clusterName, expectedKubernetesVersion string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 180*time.Second)
        defer cancel()

        rtestutils.AssertResources(ctx, t, st, []string{clusterName}, func(status *omni.KubernetesUpgradeStatus, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.Equal(expectedKubernetesVersion, spec.LastUpgradeVersion, resourceDetails(status))
            assert.Equal(specs.KubernetesUpgradeStatusSpec_Done, spec.Phase, resourceDetails(status))
        })
    }
}

// AssertClusterBootstrapManifestStatus verifies that Kubernetes bootstrap manifests are in sync.
func AssertClusterBootstrapManifestStatus(testCtx context.Context, st state.State, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 60*time.Second)
        defer cancel()

        rtestutils.AssertResources(ctx, t, st, []string{clusterName}, func(status *omni.KubernetesUpgradeManifestStatus, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.EqualValues(0, spec.OutOfSync, resourceDetails(status))
        })
    }
}

// AssertClusterKubernetesUsage verifies that Kubernetes usage matches expectations.
func AssertClusterKubernetesUsage(testCtx context.Context, st state.State, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 180*time.Second)
        defer cancel()

        rtestutils.AssertResource(ctx, t, st, clusterName, func(status *virtual.KubernetesUsage, assert *assert.Assertions) {
            spec := status.TypedSpec().Value

            assert.NotNil(spec.Cpu, resourceDetails(status))
            assert.NotNil(spec.Mem, resourceDetails(status))
            assert.NotNil(spec.Storage, resourceDetails(status))
            assert.NotNil(spec.Pods, resourceDetails(status))

            assert.Greater(spec.Cpu.Requests, float64(0), resourceDetails(status))
            assert.Greater(spec.Cpu.Capacity, float64(0), resourceDetails(status))

            assert.Greater(spec.Mem.Requests, float64(0), resourceDetails(status))
            assert.Greater(spec.Mem.Capacity, float64(0), resourceDetails(status))

            assert.Greater(spec.Storage.Capacity, float64(0), resourceDetails(status))

            assert.Greater(spec.Pods.Count, int32(0), resourceDetails(status))
            assert.Greater(spec.Pods.Capacity, int32(0), resourceDetails(status))
        }, rtestutils.WithNamespace(resources.VirtualNamespace))
    }
}

// DestroyCluster destroys a cluster and waits for it to be destroyed.
//
// It is used as a finalizer when the test group fails.
func DestroyCluster(testCtx context.Context, client *client.Client, supportBundleDir, clusterName string) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 6*time.Minute)
        defer cancel()

        st := client.Omni().State()

        if err := saveSupportBundle(ctx, client, supportBundleDir, clusterName); err != nil {
            t.Logf("failed to save support bundle: %v", err)
        }

        clusterMachineIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st, state.WithLabelQuery(
            resource.LabelEqual(omni.LabelCluster, clusterName),
        ))

        t.Log("destroying cluster", clusterName)

        rtestutils.Teardown[*omni.Cluster](ctx, t, st, []resource.ID{clusterName})

        rtestutils.AssertNoResource[*omni.Cluster](ctx, t, st, clusterName)

        // wait for all machines to return to the pool as 'available' or to become part of a different cluster
        rtestutils.AssertResources(ctx, t, st, clusterMachineIDs, func(machine *omni.MachineStatus, asrt *assert.Assertions) {
            _, isAvailable := machine.Metadata().Labels().Get(omni.MachineStatusLabelAvailable)
            machineCluster, machineBound := machine.Metadata().Labels().Get(omni.LabelCluster)

            asrt.True(isAvailable || (machineBound && machineCluster != clusterName),
                "machine %q: available %v, bound %v, cluster %q", machine.Metadata().ID(), isAvailable, machineBound, machineCluster,
            )
        })

        _, err := st.Get(ctx, omni.NewMachineClass(resources.DefaultNamespace, clusterName).Metadata())
        if state.IsNotFoundError(err) {
            return
        }

        require.NoError(t, err)

        t.Log("destroying related machine class", clusterName)

        rtestutils.Destroy[*omni.MachineClass](ctx, t, st, []string{clusterName})
    }
}

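// saveSupportBundle fetches the support bundle of the cluster before it is destroyed and stores it
// as a zip file for later inspection.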
func saveSupportBundle(ctx context.Context, cli *client.Client, dir, cluster string) error {
    supportBundle, err := cli.Management().GetSupportBundle(ctx, cluster, nil)
    if err != nil {
        return fmt.Errorf("failed to get support bundle before destruction for cluster %q: %w", cluster, err)
    }

    if err = os.MkdirAll(dir, 0o755); err != nil {
        return fmt.Errorf("failed to create directory %q: %w", dir, err)
    }

    if err = os.WriteFile(filepath.Join(dir, "support-bundle-"+cluster+".zip"), supportBundle, 0o644); err != nil {
        return fmt.Errorf("failed to write support bundle file %q: %w", cluster, err)
    }

    return nil
}

// AssertDestroyCluster destroys a cluster and verifies that all dependent resources are gone.
func AssertDestroyCluster(testCtx context.Context, st state.State, clusterName string, expectMachinesRemoved, assertInfraMachinesState bool) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 300*time.Second)
        defer cancel()

        patches := rtestutils.ResourceIDs[*omni.ConfigPatch](ctx, t, st, state.WithLabelQuery(
            resource.LabelEqual(omni.LabelCluster, clusterName),
        ))

        machineSets := rtestutils.ResourceIDs[*omni.MachineSet](ctx, t, st, state.WithLabelQuery(
            resource.LabelEqual(omni.LabelCluster, clusterName),
        ))

        clusterMachineIDs := rtestutils.ResourceIDs[*omni.ClusterMachine](ctx, t, st, state.WithLabelQuery(
            resource.LabelEqual(omni.LabelCluster, clusterName),
        ))

        t.Log("destroying cluster", clusterName)

        _, err := st.Teardown(ctx, resource.NewMetadata(resources.DefaultNamespace, omni.ClusterType, clusterName, resource.VersionUndefined))

        require.NoError(t, err)

        rtestutils.AssertNoResource[*omni.Cluster](ctx, t, st, clusterName)

        for _, id := range patches {
            rtestutils.AssertNoResource[*omni.ConfigPatch](ctx, t, st, id)
        }

        for _, id := range machineSets {
            rtestutils.AssertNoResource[*omni.MachineSet](ctx, t, st, id)
        }

        if expectMachinesRemoved {
            for _, id := range clusterMachineIDs {
                rtestutils.AssertNoResource[*omni.MachineStatus](ctx, t, st, id)
            }

            return
        }

        // wait for all machines to return to the pool as 'available' or to become part of a different cluster
        rtestutils.AssertResources(ctx, t, st, clusterMachineIDs, func(machine *omni.MachineStatus, asrt *assert.Assertions) {
            _, isAvailable := machine.Metadata().Labels().Get(omni.MachineStatusLabelAvailable)
            machineCluster, machineBound := machine.Metadata().Labels().Get(omni.LabelCluster)
            asrt.True(isAvailable || (machineBound && machineCluster != clusterName),
                "machine %q: available %v, bound %v, cluster %q", machine.Metadata().ID(), isAvailable, machineBound, machineCluster,
            )
        })

        if assertInfraMachinesState {
            rtestutils.AssertResources(ctx, t, st, clusterMachineIDs, func(res *infra.Machine, assertion *assert.Assertions) {
                assertion.Empty(res.TypedSpec().Value.ClusterTalosVersion) // unallocated
                assertion.Empty(res.TypedSpec().Value.Extensions)

                if assertion.NotEmpty(res.TypedSpec().Value.WipeId) { // the machine should be marked for wipe
                    t.Logf("machine %q is marked for wipe: %s", res.Metadata().ID(), res.TypedSpec().Value.WipeId)
                }
            })

            // the provider will wipe the machine and set the Installed field to false;
            // after the machine is wiped, the ReadyToUse field will be set to true
            rtestutils.AssertResources(ctx, t, st, clusterMachineIDs, func(res *infra.MachineStatus, assertion *assert.Assertions) {
                assertion.False(res.TypedSpec().Value.Installed)
                assertion.True(res.TypedSpec().Value.ReadyToUse)
            })
        }
    }
}

// AssertBreakAndDestroyControlPlane breaks the control plane of the given cluster
// by freezing all control plane machines, then destroys its control plane.
func AssertBreakAndDestroyControlPlane(testCtx context.Context, st state.State, clusterName string, options Options) TestFunc {
    return func(t *testing.T) {
        ctx, cancel := context.WithTimeout(testCtx, 300*time.Second)
        defer cancel()

        rtestutils.AssertResource[*omni.ClusterBootstrapStatus](ctx, t, st, clusterName, func(status *omni.ClusterBootstrapStatus, assert *assert.Assertions) {
            assert.True(status.TypedSpec().Value.GetBootstrapped())
        })

        // break the control plane
        frozenMachineIDs := freezeMachinesOfType(ctx, t, st, clusterName, options.FreezeAMachineFunc, omni.LabelControlPlaneRole)

        // remove the broken machines
        rtestutils.Teardown[*siderolink.Link](ctx, t, st, frozenMachineIDs)

        // destroy the control plane
        rtestutils.Destroy[*omni.MachineSet](ctx, t, st, []string{omni.ControlPlanesResourceID(clusterName)})
        rtestutils.Destroy[*siderolink.Link](ctx, t, st, frozenMachineIDs)

        // assert that the bootstrapped flag is set to false
        rtestutils.AssertResource[*omni.ClusterBootstrapStatus](ctx, t, st, clusterName, func(status *omni.ClusterBootstrapStatus, assert *assert.Assertions) {
            assert.False(status.TypedSpec().Value.GetBootstrapped())
        })

        // wipe the frozen machines to bring them back to the pool
        for _, machineID := range frozenMachineIDs {
            wipeMachine(ctx, t, st, machineID, options.WipeAMachineFunc)
        }
    }
}

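// nodeLabel is the kubelet node label applied to every test machine so that a Kubernetes node can be
// mapped back to the Omni machine ID (UUID) it runs on.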
const nodeLabel = "omni-uuid"

type bindMachineOptions struct {
    clusterName, role, machineID, restoreFromEtcdBackupClusterID string
}

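// bindMachine adds a single machine to the control plane or worker machine set of a cluster: it creates
// a config patch with the install disk, hostname and uuid node label, ensures the machine set exists
// (optionally with a bootstrap spec restored from an etcd backup of another cluster), and creates the
// MachineSetNode for the machine.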
func bindMachine(ctx context.Context, t *testing.T, st state.State, bindOpts bindMachineOptions) {
    configPatch := omni.NewConfigPatch(
        resources.DefaultNamespace,
        fmt.Sprintf("000-%s-%s-install-disk", bindOpts.clusterName, bindOpts.machineID),
        pair.MakePair(omni.LabelCluster, bindOpts.clusterName),
        pair.MakePair(omni.LabelClusterMachine, bindOpts.machineID),
    )

    createOrUpdate(ctx, t, st, configPatch, func(cps *omni.ConfigPatch) error {
        cps.Metadata().Labels().Set(omni.LabelCluster, bindOpts.clusterName)
        cps.Metadata().Labels().Set(omni.LabelClusterMachine, bindOpts.machineID)

        var shortRole string

        switch bindOpts.role {
        case omni.LabelControlPlaneRole:
            shortRole = "cp"
        case omni.LabelWorkerRole:
            shortRole = "w"
        }

        hostname := fmt.Sprintf("%s-%s-%s", bindOpts.clusterName, shortRole, bindOpts.machineID)

        if len(hostname) > 63 {
            // trim left, to keep the UUID intact
            hostname = hostname[len(hostname)-63:]
        }

        patch := map[string]any{
            "machine": map[string]any{
                "install": map[string]any{
                    "disk": "/dev/vda",
                },
                "network": map[string]any{
                    "hostname": hostname,
                },
                "kubelet": map[string]any{
                    "extraArgs": map[string]any{
                        "node-labels": fmt.Sprintf("%s=%s", nodeLabel, bindOpts.machineID),
                    },
                },
            },
        }

        patchBytes, err := yaml.Marshal(patch)
        if err != nil {
            return err
        }

        return cps.TypedSpec().Value.SetUncompressedData(patchBytes)
    })

    id := omni.WorkersResourceID(bindOpts.clusterName)
    if bindOpts.role == omni.LabelControlPlaneRole {
        id = omni.ControlPlanesResourceID(bindOpts.clusterName)
    }

    ms := omni.NewMachineSet(resources.DefaultNamespace, id)
    ms.Metadata().Labels().Set(omni.LabelCluster, bindOpts.clusterName)
    ms.Metadata().Labels().Set(bindOpts.role, "")

    var bootstrapSpec *specs.MachineSetSpec_BootstrapSpec

    if bindOpts.restoreFromEtcdBackupClusterID != "" && bindOpts.role == omni.LabelControlPlaneRole { // not a fresh cluster - restore from the etcd backup of another cluster
        backupList, err := safe.StateListAll[*omni.EtcdBackup](ctx, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, bindOpts.restoreFromEtcdBackupClusterID)))
        require.NoError(t, err)

        require.NotEmpty(t, backupList.Len(), "no etcd backup found for cluster %q", bindOpts.restoreFromEtcdBackupClusterID)

        clusterUUID, err := safe.StateGetByID[*omni.ClusterUUID](ctx, st, bindOpts.restoreFromEtcdBackupClusterID)
        require.NoError(t, err)

        backup := backupList.Get(0)

        bootstrapSpec = &specs.MachineSetSpec_BootstrapSpec{
            ClusterUuid: clusterUUID.TypedSpec().Value.GetUuid(),
            Snapshot:    backup.TypedSpec().Value.GetSnapshot(),
        }
    }

    createOrUpdate(ctx, t, st, ms, func(ms *omni.MachineSet) error {
        ms.Metadata().Labels().Set(omni.LabelCluster, bindOpts.clusterName)
        ms.Metadata().Labels().Set(bindOpts.role, "")

        ms.TypedSpec().Value.UpdateStrategy = specs.MachineSetSpec_Rolling

        if bootstrapSpec != nil {
            ms.TypedSpec().Value.BootstrapSpec = bootstrapSpec
        }

        return nil
    })

    machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, bindOpts.machineID, ms)

    _, ok := machineSetNode.Metadata().Labels().Get(omni.LabelCluster)
    require.Truef(t, ok, "the machine label cluster is not set on the machine set node")

    createOrUpdate(ctx, t, st, machineSetNode, func(*omni.MachineSetNode) error {
        return nil
    })
}

func getMachineSetNodes(ctx context.Context, t *testing.T, st state.State, clusterName string) []string {
    require := require.New(t)

    machineIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)))
    require.NotEmpty(machineIDs)

    return machineIDs
}

// machineAllocationLock makes sure that only one test allocates machines at a time.
var machineAllocationLock sync.Mutex

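// pickUnallocatedMachines waits until at least count machines carry the 'available' label, picks count
// of them at random and passes them to f. The machine allocation lock is held for the whole call, so
// concurrent tests never pick the same machines.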
func pickUnallocatedMachines(ctx context.Context, t *testing.T, st state.State, count int, f func([]resource.ID)) {
    machineAllocationLock.Lock()
    defer machineAllocationLock.Unlock()

    result := make([]resource.ID, 0, count)

    err := retry.Constant(time.Minute).RetryWithContext(ctx, func(ctx context.Context) error {
        machineIDs := rtestutils.ResourceIDs[*omni.MachineStatus](ctx, t, st, state.WithLabelQuery(resource.LabelExists(omni.MachineStatusLabelAvailable)))

        if len(machineIDs) < count {
            return retry.ExpectedErrorf("not enough machines: available %d, requested %d", len(machineIDs), count)
        }

        for _, j := range rand.Perm(len(machineIDs))[:count] {
            result = append(result, machineIDs[j])
        }

        return nil
    })

    require.NoError(t, err)

    f(result)
}

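// createOrUpdate creates the resource if it does not exist yet, otherwise updates it in place,
// applying the labels of res and the update callback in both cases.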
func createOrUpdate[T resource.Resource](ctx context.Context, t *testing.T, s state.State, res T, update func(T) error, createOpts ...state.CreateOption) {
    require := require.New(t)

    cb := func(r T) error {
        for key, value := range res.Metadata().Labels().Raw() {
            r.Metadata().Labels().Set(key, value)
        }

        return update(r)
    }

    // try getting the resource first, and if it exists, skip attempting to create,
    // as relying on create to fail with conflict might not give the expected result due to validation errors
    _, err := s.Get(ctx, res.Metadata())
    notFound := state.IsNotFoundError(err)

    if err != nil && !notFound {
        require.NoError(err)
    }

    if notFound {
        toCreate := res.DeepCopy().(T) //nolint:forcetypeassert,errcheck

        require.NoError(cb(toCreate))

        err = s.Create(ctx, toCreate, createOpts...)
        if err == nil {
            return
        }

        if !state.IsConflictError(err) {
            require.NoError(err)
        }
    }

    if _, err = safe.StateUpdateWithConflicts(ctx, s, res.Metadata(), cb); err != nil {
        require.NoError(err)
    }
}

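// waitMachineSetNodesSync waits until the number of MachineSetNodes of both machine sets of the cluster
// matches the number of machines requested in the corresponding MachineSetStatus.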
func waitMachineSetNodesSync(ctx context.Context, t *testing.T, st state.State, options ClusterOptions) {
    machineSets := []resource.ID{
        omni.ControlPlanesResourceID(options.Name),
        omni.WorkersResourceID(options.Name),
    }

    rtestutils.AssertResources(ctx, t, st, machineSets, func(status *omni.MachineSetStatus, assert *assert.Assertions) {
        spec := status.TypedSpec().Value

        ids := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
            resource.LabelEqual(omni.LabelMachineSet, status.Metadata().ID()),
        ))

        assert.Equal(int(spec.Machines.Requested), len(ids), resourceDetails(status))
    })
}

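// updateMachineClassMachineSets points the control plane and worker machine sets of the cluster at the
// given machine class (or, when machineClass is nil, bumps the requested machine count of the existing
// allocation), waits for the machine set nodes to be allocated and creates a uuid node label patch for
// each of them.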
func updateMachineClassMachineSets(ctx context.Context, t *testing.T, st state.State, options ClusterOptions, machineClass *omni.MachineClass) {
    machineAllocationLock.Lock()
    defer machineAllocationLock.Unlock()

    for _, role := range []string{omni.LabelControlPlaneRole, omni.LabelWorkerRole} {
        id := omni.WorkersResourceID(options.Name)
        machineCount := options.Workers

        if role == omni.LabelControlPlaneRole {
            id = omni.ControlPlanesResourceID(options.Name)
            machineCount = options.ControlPlanes
        }

        ms := omni.NewMachineSet(resources.DefaultNamespace, id)

        createOrUpdate(ctx, t, st, ms, func(r *omni.MachineSet) error {
            r.Metadata().Labels().Set(omni.LabelCluster, options.Name)
            r.Metadata().Labels().Set(role, "")

            switch {
            case machineClass != nil:
                r.TypedSpec().Value.MachineAllocation = &specs.MachineSetSpec_MachineAllocation{
                    MachineCount: uint32(machineCount),
                    Name:         machineClass.Metadata().ID(),
                }
            case r.TypedSpec().Value.MachineAllocation != nil:
                r.TypedSpec().Value.MachineAllocation.MachineCount += uint32(machineCount)
            }

            require.NotNilf(t, r.TypedSpec().Value.MachineAllocation, "the machine set doesn't have machine class set")

            r.TypedSpec().Value.UpdateStrategy = specs.MachineSetSpec_Rolling

            return nil
        })
    }

    waitMachineSetNodesSync(ctx, t, st, options)

    ids := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, options.Name)))

    // populate uuid patches for each machine matching the machine class
    for _, machineID := range ids {
        configPatch := omni.NewConfigPatch(
            resources.DefaultNamespace,
            fmt.Sprintf("000-%s-uuid-patch", machineID),
            pair.MakePair(omni.LabelCluster, options.Name),
            pair.MakePair(omni.LabelClusterMachine, machineID),
        )

        createOrUpdate(ctx, t, st, configPatch, func(cps *omni.ConfigPatch) error {
            cps.Metadata().Labels().Set(omni.LabelCluster, options.Name)
            cps.Metadata().Labels().Set(omni.LabelClusterMachine, machineID)

            return cps.TypedSpec().Value.SetUncompressedData([]byte(fmt.Sprintf(`machine:
  kubelet:
    extraArgs:
      node-labels: %s=%s`, nodeLabel, machineID)))
        })
    }
}